# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import json
import re
from dataclasses import dataclass, InitVar
from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers
from astro_metadata_translator.indexing import process_sidecar_data, process_index_data
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    ButlerURI,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField, Field
from lsst.pipe.base import Task, timeMethod

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


def _do_nothing(*args, **kwargs) -> None:
    """Do nothing.

    This is a function that accepts anything and does nothing.
    For use as a default in callback arguments.
    """
    pass


def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]:
    """Count the iterable and return the count and plural modifier.

    Parameters
    ----------
    noun : Iterable or `int`
        Thing to count. If given an integer it is assumed to be the count
        to use to calculate the modifier.

    Returns
    -------
    num : `int`
        Number of items found in ``noun``.
    modifier : `str`
        Character to add to the end of a string referring to these items
        to indicate whether it was a single item or not. Returns an empty
        string if there is one item, or "s" otherwise.

    Examples
    --------

    .. code-block:: python

        log.warning("Found %d file%s", *_log_msg_counter(nfiles))
    """
    if isinstance(noun, int):
        num = noun
    else:
        num = len(noun)
    return num, "" if num == 1 else "s"


@dataclass
class RawFileDatasetInfo:
    """Information about a single dataset within a raw file."""

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Information about a single raw file, used during ingest."""

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: ButlerURI
    """URI of the file this information was extracted from (`ButlerURI`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list."""


@dataclass
class RawExposureData:
    """Information about a complete raw exposure, used during ingest."""

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for transferring data between repos.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
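
    Examples
    --------
    A minimal sketch of using this helper inside a config class; the
    ``MyIngestConfig`` name below is purely illustrative:

    .. code-block:: python

        from lsst.pex.config import Config

        class MyIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")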

175 """ 

176 return ChoiceField( 

177 doc=doc, 

178 dtype=str, 

179 allowed={"move": "move", 

180 "copy": "copy", 

181 "auto": "choice will depend on datastore", 

182 "direct": "use URI to ingested file directly in datastore", 

183 "link": "hard link falling back to symbolic link", 

184 "hardlink": "hard link", 

185 "symlink": "symbolic (soft) link", 

186 "relsymlink": "relative symbolic link", 

187 }, 

188 optional=True, 

189 default=default 

190 ) 

191 

192 

193class RawIngestConfig(Config): 

194 """Configuration class for RawIngestTask.""" 

195 

196 transfer = makeTransferChoiceField() 

197 failFast = Field( 

198 dtype=bool, 

199 default=False, 

200 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

201 "Otherwise problems files will be skipped and logged and a report issued at completion.", 

202 ) 

203 

204 

205class RawIngestTask(Task): 

206 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

207 

208 Parameters 

209 ---------- 

210 config : `RawIngestConfig` 

211 Configuration for the task. 

212 butler : `~lsst.daf.butler.Butler` 

213 Writeable butler instance, with ``butler.run`` set to the appropriate 

214 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

215 datasets. 

216 on_success : `Callable`, optional 

217 A callback invoked when all of the raws associated with an exposure 

218 are ingested. Will be passed a list of `FileDataset` objects, each 

219 containing one or more resolved `DatasetRef` objects. If this callback 

220 raises it will interrupt the entire ingest process, even if 

221 `RawIngestConfig.failFast` is `False`. 

222 on_metadata_failure : `Callable`, optional 

223 A callback invoked when a failure occurs trying to translate the 

224 metadata for a file. Will be passed the URI and the exception, in 

225 that order, as positional arguments. Guaranteed to be called in an 

226 ``except`` block, allowing the callback to re-raise or replace (with 

227 ``raise ... from``) to override the task's usual error handling (before 

228 `RawIngestConfig.failFast` logic occurs). 

229 on_ingest_failure : `Callable`, optional 

230 A callback invoked when dimension record or dataset insertion into the 

231 database fails for an exposure. Will be passed a `RawExposureData` 

232 instance and the exception, in that order, as positional arguments. 

233 Guaranteed to be called in an ``except`` block, allowing the callback 

234 to re-raise or replace (with ``raise ... from``) to override the task's 

235 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

236 **kwargs 

237 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

238 constructor. 

239 

240 Notes 

241 ----- 

242 Each instance of `RawIngestTask` writes to the same Butler. Each 

243 invocation of `RawIngestTask.run` ingests a list of files. 
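
    Examples
    --------
    A minimal sketch of typical use; the repository path and file path
    below are placeholders:

    .. code-block:: python

        from lsst.daf.butler import Butler

        butler = Butler("/path/to/repo", writeable=True)
        task = RawIngestTask(config=RawIngestConfig(), butler=butler)
        refs = task.run(["/path/to/raws/exposure_001.fits"])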

244 """ 

245 

246 ConfigClass = RawIngestConfig 

247 

248 _DefaultName = "ingest" 

249 

250 def getDatasetType(self): 

251 """Return the DatasetType of the datasets ingested by this Task.""" 

252 return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure", 

253 universe=self.butler.registry.dimensions) 

254 

255 def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, 

256 on_success: Callable[[List[FileDataset]], Any] = _do_nothing, 

257 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing, 

258 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

259 **kwargs: Any): 

260 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

261 super().__init__(config, **kwargs) 

262 self.butler = butler 

263 self.universe = self.butler.registry.dimensions 

264 self.datasetType = self.getDatasetType() 

265 self._on_success = on_success 

266 self._on_metadata_failure = on_metadata_failure 

267 self._on_ingest_failure = on_ingest_failure 

268 

269 # Import all the instrument classes so that we ensure that we 

270 # have all the relevant metadata translators loaded. 

271 Instrument.importAll(self.butler.registry) 

272 

273 def _reduce_kwargs(self): 

274 # Add extra parameters to pickle. 

275 return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success, 

276 on_metadata_failure=self._on_metadata_failure, on_ingest_failure=self._on_ingest_failure) 

277 

278 def _determine_instrument_formatter(self, dataId, filename): 

279 """Determine the instrument and formatter class. 

280 

281 Parameters 

282 ---------- 

283 dataId : `lsst.daf.butler.DataCoordinate` 

284 The dataId associated with this dataset. 

285 filename : `ButlerURI` 

286 URI of file used for error reporting. 

287 

288 Returns 

289 ------- 

290 instrument : `Instrument` or `None` 

291 Instance of the `Instrument` associated with this dataset. `None` 

292 indicates that the instrument could not be determined. 

293 formatterClass : `type` 

294 Class to be used as the formatter for this dataset. 

295 """ 

296 # The data model currently assumes that whilst multiple datasets 

297 # can be associated with a single file, they must all share the 

298 # same formatter. 

299 try: 

300 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) 

301 except LookupError as e: 

302 self._on_metadata_failure(filename, e) 

303 self.log.warning("Instrument %s for file %s not known to registry", 

304 dataId["instrument"], filename) 

305 if self.config.failFast: 

306 raise RuntimeError(f"Instrument {dataId['instrument']} for" 

307 f" file {filename} not known to registry") from e 

308 FormatterClass = Formatter 

309 # Indicate that we could not work out the instrument. 

310 instrument = None 

311 else: 

312 FormatterClass = instrument.getRawFormatter(dataId) 

313 return instrument, FormatterClass 

314 

315 def extractMetadata(self, filename: ButlerURI) -> RawFileData: 

316 """Extract and process metadata from a single raw file. 

317 

318 Parameters 

319 ---------- 

320 filename : `ButlerURI` 

321 URI to the file. 

322 

323 Returns 

324 ------- 

325 data : `RawFileData` 

326 A structure containing the metadata extracted from the file, 

327 as well as the original filename. All fields will be populated, 

328 but the `RawFileData.dataId` attribute will be a minimal 

329 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The 

330 ``instrumentClass`` field will be `None` if there is a problem 

331 with metadata extraction. 

332 

333 Notes 

334 ----- 

335 Assumes that there is a single dataset associated with the given 

336 file. Instruments using a single file to store multiple datasets 

337 must implement their own version of this method. 

338 

339 By default the method will catch all exceptions unless the ``failFast`` 

340 configuration item is `True`. If an error is encountered the 

341 `_on_metadata_failure()` method will be called. If no exceptions 

342 result and an error was encountered the returned object will have 

343 a null-instrument class and no datasets. 

344 

345 This method supports sidecar JSON files which can be used to 

346 extract metadata without having to read the data file itself. 

347 The sidecar file is always used if found. 
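
        Examples
        --------
        A minimal sketch of checking the result for failure, assuming
        ``task`` is a constructed `RawIngestTask`; the file name is a
        placeholder:

        .. code-block:: python

            data = task.extractMetadata(ButlerURI("raw_0001.fits"))
            if not data.datasets:
                print("Metadata extraction failed for", data.filename)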

348 """ 

349 sidecar_fail_msg = "" # Requires prepended space when set. 

350 try: 

351 sidecar_file = filename.updatedExtension(".json") 

352 if sidecar_file.exists(): 

353 content = json.loads(sidecar_file.read()) 

354 header = process_sidecar_data(content) 

355 sidecar_fail_msg = " (via sidecar)" 

356 else: 

357 # Read the metadata from the data file itself. 

358 # Manually merge the primary and "first data" headers here 

359 # because we do not know in general if an input file has 

360 # set INHERIT=T. 

361 # For remote files download the entire file to get the 

362 # header. This is very inefficient and it would be better 

363 # to have some way of knowing where in the file the headers 

364 # are and to only download those parts of the file. 

365 with filename.as_local() as local_file: 

366 phdu = readMetadata(local_file.ospath, 0) 

367 header = merge_headers([phdu, readMetadata(local_file.ospath)], mode="overwrite") 

368 datasets = [self._calculate_dataset_info(header, filename)] 

369 except Exception as e: 

370 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

371 # Indicate to the caller that we failed to read. 

372 datasets = [] 

373 formatterClass = Formatter 

374 instrument = None 

375 self._on_metadata_failure(filename, e) 

376 if self.config.failFast: 

377 raise RuntimeError("Problem extracting metadata for file " 

378 f"{filename}{sidecar_fail_msg}") from e 

379 else: 

380 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

381 # The data model currently assumes that whilst multiple datasets 

382 # can be associated with a single file, they must all share the 

383 # same formatter. 

384 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

385 if instrument is None: 

386 datasets = [] 

387 

388 return RawFileData(datasets=datasets, filename=filename, 

389 FormatterClass=formatterClass, 

390 instrumentClass=instrument) 

391 

392 def _calculate_dataset_info(self, header, filename): 

393 """Calculate a RawFileDatasetInfo from the supplied information. 

394 

395 Parameters 

396 ---------- 

397 header : Mapping or `astro_metadata_translator.ObservationInfo` 

398 Header from the dataset or previously-translated content. 

399 filename : `ButlerURI` 

400 Filename to use for error messages. 

401 

402 Returns 

403 ------- 

404 dataset : `RawFileDatasetInfo` 

405 The dataId, and observation information associated with this 

406 dataset. 

407 """ 

408 # To ensure we aren't slowed down for no reason, explicitly 

409 # list here the properties we need for the schema. 

410 # Use a dict with values a boolean where True indicates 

411 # that it is required that we calculate this property. 

412 ingest_subset = { 

413 "altaz_begin": False, 

414 "boresight_rotation_coord": False, 

415 "boresight_rotation_angle": False, 

416 "dark_time": False, 

417 "datetime_begin": True, 

418 "datetime_end": True, 

419 "detector_num": True, 

420 "exposure_group": False, 

421 "exposure_id": True, 

422 "exposure_time": True, 

423 "instrument": True, 

424 "tracking_radec": False, 

425 "object": False, 

426 "observation_counter": False, 

427 "observation_id": True, 

428 "observation_reason": False, 

429 "observation_type": True, 

430 "observing_day": False, 

431 "physical_filter": True, 

432 "science_program": False, 

433 "visit_id": False, 

434 } 

435 

436 if isinstance(header, ObservationInfo): 

437 obsInfo = header 

438 missing = [] 

439 # Need to check the required properties are present. 

440 for property, required in ingest_subset.items(): 

441 if not required: 

442 continue 

443 # getattr does not need to be protected because it is using 

444 # the defined list above containing properties that must exist. 

445 value = getattr(obsInfo, property) 

446 if value is None: 

447 missing.append(property) 

448 if missing: 

449 raise ValueError(f"Requested required properties are missing from file {filename}:" 

450 f" {missing} (via JSON)") 

451 

452 else: 

453 obsInfo = ObservationInfo(header, pedantic=False, filename=str(filename), 

454 required={k for k in ingest_subset if ingest_subset[k]}, 

455 subset=set(ingest_subset)) 

456 

457 dataId = DataCoordinate.standardize(instrument=obsInfo.instrument, 

458 exposure=obsInfo.exposure_id, 

459 detector=obsInfo.detector_num, 

460 universe=self.universe) 

461 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

462 

463 def locateAndReadIndexFiles(self, files): 

464 """Given a list of files, look for index files and read them. 

465 

466 Index files can either be explicitly in the list of files to 

467 ingest, or else located in the same directory as a file to ingest. 

468 Index entries are always used if present. 

469 

470 Parameters 

471 ---------- 

472 files : iterable over `ButlerURI` 

473 URIs to the files to be ingested. 

474 

475 Returns 

476 ------- 

477 index : `dict` [`str`, Any] 

478 Merged contents of all relevant index files found. These can 

479 be explicitly specified index files or ones found in the 

480 directory alongside a data file to be ingested. 

481 updated_files : iterable of `str` 

482 Updated list of the input files with entries removed that were 

483 found listed in an index file. Order is not guaranteed to 

484 match the order of the files given to this routine. 

485 bad_index_files: `set[str]` 

486 Files that looked like index files but failed to read properly. 
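
        Examples
        --------
        A minimal sketch of the calling convention, assuming ``task`` is a
        constructed `RawIngestTask`; the paths are placeholders:

        .. code-block:: python

            uris = [ButlerURI("/data/night1/_index.json"),
                    ButlerURI("/data/night2/raw_0001.fits")]
            index, remaining, good, bad = task.locateAndReadIndexFiles(uris)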

487 """ 

488 # Convert the paths to absolute for easy comparison with index content. 

489 # Do not convert to real paths since we have to assume that index 

490 # files are in this location and not the location which it links to. 

491 files = tuple(f.abspath() for f in files) 

492 

493 # Index files must be named this. 

494 index_root_file = "_index.json" 

495 

496 # Group the files by directory. 

497 files_by_directory = defaultdict(set) 

498 

499 for path in files: 

500 directory, file_in_dir = path.split() 

501 files_by_directory[directory].add(file_in_dir) 

502 

503 # All the metadata read from index files with keys of full path. 

504 index_entries = {} 

505 

506 # Index files we failed to read. 

507 bad_index_files = set() 

508 

509 # Any good index files that were found and used. 

510 good_index_files = set() 

511 

512 # Look for index files in those directories. 

513 for directory, files_in_directory in files_by_directory.items(): 

514 possible_index_file = directory.join(index_root_file) 

515 if possible_index_file.exists(): 

516 # If we are explicitly requesting an index file the 

517 # messages should be different. 

518 index_msg = "inferred" 

519 is_implied = True 

520 if index_root_file in files_in_directory: 

521 index_msg = "explicit" 

522 is_implied = False 

523 

524 # Try to read the index file and catch and report any 

525 # problems. 

526 try: 

527 content = json.loads(possible_index_file.read()) 

528 index = process_index_data(content, force_dict=True) 

529 except Exception as e: 

530 # Only trigger the callback if the index file 

531 # was asked for explicitly. Triggering on implied file 

532 # might be surprising. 

533 if not is_implied: 

534 self._on_metadata_failure(possible_index_file, e) 

535 if self.config.failFast: 

536 raise RuntimeError(f"Problem reading index file from {index_msg} " 

537 f"location {possible_index_file}") from e 

538 bad_index_files.add(possible_index_file) 

539 continue 

540 

541 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

542 good_index_files.add(possible_index_file) 

543 

544 # Go through the index adding entries for files. 

545 # If we have non-index files in this directory marked for 

546 # ingest we should only get index information for those. 

547 # If the index file was explicit we use all entries. 

548 if is_implied: 

549 files_to_ingest = files_in_directory 

550 else: 

551 files_to_ingest = set(index) 

552 

553 # Copy relevant metadata into a single dict for all index 

554 # entries. 

555 for file_in_dir in files_to_ingest: 

556 # Skip an explicitly specified index file. 

557 # This should never happen because an explicit index 

558 # file will force ingest of all files in the index 

559 # and not use the explicit file list. If somehow 

560 # this is not true we continue. Raising an exception 

561 # seems like the wrong thing to do since this is harmless. 

562 if file_in_dir == index_root_file: 

563 self.log.info("Logic error found scanning directory %s. Please file ticket.", 

564 directory) 

565 continue 

566 if file_in_dir in index: 

567 file = directory.join(file_in_dir) 

568 if file in index_entries: 

569 # ObservationInfo overrides raw metadata 

570 if isinstance(index[file_in_dir], ObservationInfo) \ 

571 and not isinstance(index_entries[file], ObservationInfo): 

572 self.log.warning("File %s already specified in an index file but overriding" 

573 " with ObservationInfo content from %s", 

574 file, possible_index_file) 

575 else: 

576 self.log.warning("File %s already specified in an index file, " 

577 "ignoring content from %s", file, possible_index_file) 

578 # Do nothing in this case 

579 continue 

580 

581 index_entries[file] = index[file_in_dir] 

582 

583 # Remove files from list that have index entries and also 

584 # any files that we determined to be explicit index files 

585 # or any index files that we failed to read. 

586 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

587 

588 # The filtered list loses the initial order. Retaining the order 

589 # is good for testing but does have a cost if there are many 

590 # files when copying the good values out. A dict would have faster 

591 # lookups (using the files as keys) but use more memory. 

592 ordered = [f for f in filtered if f in files] 

593 

594 return index_entries, ordered, good_index_files, bad_index_files 

595 

596 def processIndexEntries(self, index_entries): 

597 """Convert index entries to RawFileData. 

598 

599 Parameters 

600 ---------- 

601 index_entries : `dict` [`str`, Any] 

602 Dict indexed by name of file to ingest and with keys either 

603 raw metadata or translated 

604 `~astro_metadata_translator.ObservationInfo`. 

605 

606 Returns 

607 ------- 

608 data : `RawFileData` 

609 A structure containing the metadata extracted from the file, 

610 as well as the original filename. All fields will be populated, 

611 but the `RawFileData.dataId` attribute will be a minimal 

612 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. 

613 """ 

614 fileData = [] 

615 for filename, metadata in index_entries.items(): 

616 try: 

617 datasets = [self._calculate_dataset_info(metadata, filename)] 

618 except Exception as e: 

619 self.log.debug("Problem extracting metadata for file %s found in index file: %s", 

620 filename, e) 

621 datasets = [] 

622 formatterClass = Formatter 

623 instrument = None 

624 self._on_metadata_failure(filename, e) 

625 if self.config.failFast: 

626 raise RuntimeError(f"Problem extracting metadata for file {filename} " 

627 "found in index file") from e 

628 else: 

629 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, 

630 filename) 

631 if instrument is None: 

632 datasets = [] 

633 fileData.append(RawFileData(datasets=datasets, filename=filename, 

634 FormatterClass=formatterClass, instrumentClass=instrument)) 

635 return fileData 

636 

637 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]: 

638 """Group an iterable of `RawFileData` by exposure. 

639 

640 Parameters 

641 ---------- 

642 files : iterable of `RawFileData` 

643 File-level information to group. 

644 

645 Returns 

646 ------- 

647 exposures : `list` of `RawExposureData` 

648 A list of structures that group the file-level information by 

649 exposure. All fields will be populated. The 

650 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

651 `~lsst.daf.butler.DataCoordinate` instances. 

652 """ 

653 exposureDimensions = self.universe["exposure"].graph 

654 byExposure = defaultdict(list) 

655 for f in files: 

656 # Assume that the first dataset is representative for the file. 

657 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

658 

659 return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe) 

660 for dataId, exposureFiles in byExposure.items()] 

661 

662 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

663 """Expand the data IDs associated with a raw exposure. 

664 

665 This adds the metadata records. 

666 

667 Parameters 

668 ---------- 

669 exposure : `RawExposureData` 

670 A structure containing information about the exposure to be 

671 ingested. Must have `RawExposureData.records` populated. Should 

672 be considered consumed upon return. 

673 

674 Returns 

675 ------- 

676 exposure : `RawExposureData` 

677 An updated version of the input structure, with 

678 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

679 updated to data IDs for which 

680 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

681 """ 

682 # We start by expanded the exposure-level data ID; we won't use that 

683 # directly in file ingest, but this lets us do some database lookups 

684 # once per exposure instead of once per file later. 

685 data.dataId = self.butler.registry.expandDataId( 

686 data.dataId, 

687 # We pass in the records we'll be inserting shortly so they aren't 

688 # looked up from the database. We do expect instrument and filter 

689 # records to be retrieved from the database here (though the 

690 # Registry may cache them so there isn't a lookup every time). 

691 records={ 

692 self.butler.registry.dimensions["exposure"]: data.record, 

693 } 

694 ) 

695 # Now we expand the per-file (exposure+detector) data IDs. This time 

696 # we pass in the records we just retrieved from the exposure data ID 

697 # expansion. 

698 for file in data.files: 

699 for dataset in file.datasets: 

700 dataset.dataId = self.butler.registry.expandDataId( 

701 dataset.dataId, 

702 records=dict(data.dataId.records) 

703 ) 

704 return data 

705 

706 def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1 

707 ) -> Tuple[Iterator[RawExposureData], List[str]]: 

708 """Perform all non-database-updating ingest preprocessing steps. 

709 

710 Parameters 

711 ---------- 

712 files : iterable over `str` or path-like objects 

713 Paths to the files to be ingested. Will be made absolute 

714 if they are not already. 

715 pool : `multiprocessing.Pool`, optional 

716 If not `None`, a process pool with which to parallelize some 

717 operations. 

718 processes : `int`, optional 

719 The number of processes to use. Ignored if ``pool`` is not `None`. 

720 

721 Returns 

722 ------- 

723 exposures : `Iterator` [ `RawExposureData` ] 

724 Data structures containing dimension records, filenames, and data 

725 IDs to be ingested (one structure for each exposure). 

726 bad_files : `list` of `str` 

727 List of all the files that could not have metadata extracted. 
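
        Examples
        --------
        A minimal sketch of driving the preprocessing step by hand, assuming
        ``task`` is a constructed `RawIngestTask` and ``uris`` is a list of
        `ButlerURI` objects pointing at raw files:

        .. code-block:: python

            exposures, bad = task.prep(uris, processes=1)
            for exposure in exposures:
                print(exposure.record.obs_id, len(exposure.files))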

728 """ 

729 if pool is None and processes > 1: 

730 pool = Pool(processes) 

731 mapFunc = map if pool is None else pool.imap_unordered 

732 

733 def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]: 

734 """Filter out bad files and return good with list of bad.""" 

735 good_files = [] 

736 bad_files = [] 

737 for fileDatum in file_data: 

738 if not fileDatum.datasets: 

739 bad_files.append(fileDatum.filename) 

740 else: 

741 good_files.append(fileDatum) 

742 return good_files, bad_files 

743 

744 # Look for index files and read them. 

745 # There should be far fewer index files than data files. 

746 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

747 if bad_index_files: 

748 self.log.info("Failed to read the following explicitly requested index files:"), 

749 for bad in sorted(bad_index_files): 

750 self.log.info("- %s", bad) 

751 

752 # Now convert all the index file entries to standard form for ingest. 

753 bad_index_file_data = [] 

754 indexFileData = self.processIndexEntries(index_entries) 

755 if indexFileData: 

756 indexFileData, bad_index_file_data = _partition_good_bad(indexFileData) 

757 self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s" 

758 " with %d failure%s", 

759 *_log_msg_counter(indexFileData), 

760 *_log_msg_counter(good_index_files), 

761 *_log_msg_counter(bad_index_file_data)) 

762 

763 # Extract metadata and build per-detector regions. 

764 # This could run in a subprocess so collect all output 

765 # before looking at failures. 

766 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

767 

768 # Filter out all the failed reads and store them for later 

769 # reporting. 

770 fileData, bad_files = _partition_good_bad(fileData) 

771 self.log.info("Successfully extracted metadata from %d file%s with %d failure%s", 

772 *_log_msg_counter(fileData), 

773 *_log_msg_counter(bad_files)) 

774 

775 # Combine with data from index files. 

776 fileData.extend(indexFileData) 

777 bad_files.extend(bad_index_file_data) 

778 bad_files.extend(bad_index_files) 

779 

780 # Use that metadata to group files (and extracted metadata) by 

781 # exposure. Never parallelized because it's intrinsically a gather 

782 # step. 

783 exposureData: List[RawExposureData] = self.groupByExposure(fileData) 

784 

785 # The next operation operates on RawExposureData instances (one at 

786 # a time) in-place and then returns the modified instance. We call it 

787 # as a pass-through instead of relying on the arguments we pass in to 

788 # have been modified because in the parallel case those arguments are 

789 # going to be pickled and unpickled, and I'm not certain 

790 # multiprocessing is careful enough with that for output arguments to 

791 # work. 

792 

793 # Expand the data IDs to include all dimension metadata; we need this 

794 # because we may need to generate path templates that rely on that 

795 # metadata. 

796 # This is the first step that involves actual database calls (but just 

797 # SELECTs), so if there's going to be a problem with connections vs. 

798 # multiple processes, or lock contention (in SQLite) slowing things 

799 # down, it'll happen here. 

800 return mapFunc(self.expandDataIds, exposureData), bad_files 

801 

802 def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None 

803 ) -> List[FileDataset]: 

804 """Ingest all raw files in one exposure. 

805 

806 Parameters 

807 ---------- 

808 exposure : `RawExposureData` 

809 A structure containing information about the exposure to be 

810 ingested. Must have `RawExposureData.records` populated and all 

811 data ID attributes expanded. 

812 run : `str`, optional 

813 Name of a RUN-type collection to write to, overriding 

814 ``self.butler.run``. 

815 

816 Returns 

817 ------- 

818 datasets : `list` of `lsst.daf.butler.FileDataset` 

819 Per-file structures identifying the files ingested and their 

820 dataset representation in the data repository. 

821 """ 

822 datasets = [FileDataset(path=file.filename.abspath(), 

823 refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets], 

824 formatter=file.FormatterClass) 

825 for file in exposure.files] 

826 self.butler.ingest(*datasets, transfer=self.config.transfer, run=run) 

827 return datasets 

828 

829 def ingestFiles(self, files, *, pool: Optional[Pool] = None, processes: int = 1, 

830 run: Optional[str] = None): 

831 """Ingest files into a Butler data repository. 

832 

833 This creates any new exposure or visit Dimension entries needed to 

834 identify the ingested files, creates new Dataset entries in the 

835 Registry and finally ingests the files themselves into the Datastore. 

836 Any needed instrument, detector, and physical_filter Dimension entries 

837 must exist in the Registry before `run` is called. 

838 

839 Parameters 

840 ---------- 

841 files : iterable over `ButlerURI` 

842 URIs to the files to be ingested. 

843 pool : `multiprocessing.Pool`, optional 

844 If not `None`, a process pool with which to parallelize some 

845 operations. 

846 processes : `int`, optional 

847 The number of processes to use. Ignored if ``pool`` is not `None`. 

848 run : `str`, optional 

849 Name of a RUN-type collection to write to, overriding 

850 the default derived from the instrument name. 

851 

852 Returns 

853 ------- 

854 refs : `list` of `lsst.daf.butler.DatasetRef` 

855 Dataset references for ingested raws. 

856 """ 

857 

858 exposureData, bad_files = self.prep(files, pool=pool, processes=processes) 

859 

860 # Up to this point, we haven't modified the data repository at all. 

861 # Now we finally do that, with one transaction per exposure. This is 

862 # not parallelized at present because the performance of this step is 

863 # limited by the database server. That may or may not change in the 

864 # future once we increase our usage of bulk inserts and reduce our 

865 # usage of savepoints; we've tried to get everything but the database 

866 # operations done in advance to reduce the time spent inside 

867 # transactions. 

868 self.butler.registry.registerDatasetType(self.datasetType) 

869 

870 refs = [] 

871 runs = set() 

872 n_exposures = 0 

873 n_exposures_failed = 0 

874 n_ingests_failed = 0 

875 for exposure in exposureData: 

876 

877 self.log.debug("Attempting to ingest %d file%s from exposure %s:%s", 

878 *_log_msg_counter(exposure.files), 

879 exposure.record.instrument, exposure.record.obs_id) 

880 

881 try: 

882 self.butler.registry.syncDimensionData("exposure", exposure.record) 

883 except Exception as e: 

884 self._on_ingest_failure(exposure, e) 

885 n_exposures_failed += 1 

886 self.log.warning("Exposure %s:%s could not be registered: %s", 

887 exposure.record.instrument, exposure.record.obs_id, e) 

888 if self.config.failFast: 

889 raise e 

890 continue 

891 

892 # Override default run if nothing specified explicitly. 

893 if run is None: 

894 instrumentClass = exposure.files[0].instrumentClass 

895 this_run = instrumentClass.makeDefaultRawIngestRunName() 

896 else: 

897 this_run = run 

898 if this_run not in runs: 

899 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

900 runs.add(this_run) 

901 try: 

902 with self.butler.transaction(): 

903 datasets_for_exposure = self.ingestExposureDatasets(exposure, run=this_run) 

904 except Exception as e: 

905 self._on_ingest_failure(exposure, e) 

906 n_ingests_failed += 1 

907 self.log.warning("Failed to ingest the following for reason: %s", e) 

908 for f in exposure.files: 

909 self.log.warning("- %s", f.filename) 

910 if self.config.failFast: 

911 raise e 

912 continue 

913 else: 

914 self._on_success(datasets_for_exposure) 

915 for dataset in datasets_for_exposure: 

916 refs.extend(dataset.refs) 

917 

918 # Success for this exposure. 

919 n_exposures += 1 

920 self.log.info("Exposure %s:%s ingested successfully", 

921 exposure.record.instrument, exposure.record.obs_id) 

922 

923 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

924 

925 @timeMethod 

926 def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None, 

927 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", group_files: bool = True): 

928 """Ingest files into a Butler data repository. 

929 

930 This creates any new exposure or visit Dimension entries needed to 

931 identify the ingested files, creates new Dataset entries in the 

932 Registry and finally ingests the files themselves into the Datastore. 

933 Any needed instrument, detector, and physical_filter Dimension entries 

934 must exist in the Registry before `run` is called. 

935 

936 Parameters 

937 ---------- 

938 files : iterable over `ButlerURI`, `str` or path-like objects 

939 Paths to the files to be ingested. Can refer to directories. 

940 Will be made absolute if they are not already. 

941 pool : `multiprocessing.Pool`, optional 

942 If not `None`, a process pool with which to parallelize some 

943 operations. 

944 processes : `int`, optional 

945 The number of processes to use. Ignored if ``pool`` is not `None`. 

946 run : `str`, optional 

947 Name of a RUN-type collection to write to, overriding 

948 the default derived from the instrument name. 

949 file_filter : `str` or `re.Pattern`, optional 

950 Pattern to use to discover files to ingest within directories. 

951 The default is to search for FITS files. The regex applies to 

952 files within the directory. 

953 group_files : `bool`, optional 

954 Group files by directory if they have been discovered in 

955 directories. Will not affect files explicitly provided. 

956 

957 Returns 

958 ------- 

959 refs : `list` of `lsst.daf.butler.DatasetRef` 

960 Dataset references for ingested raws. 

961 

962 Notes 

963 ----- 

964 This method inserts all datasets for an exposure within a transaction, 

965 guaranteeing that partial exposures are never ingested. The exposure 

966 dimension record is inserted with `Registry.syncDimensionData` first 

967 (in its own transaction), which inserts only if a record with the same 

968 primary key does not already exist. This allows different files within 

969 the same exposure to be incremented in different runs. 
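
        Examples
        --------
        A minimal sketch of a typical invocation, assuming ``task`` is a
        constructed `RawIngestTask`; the directory path is a placeholder:

        .. code-block:: python

            refs = task.run(["/data/raw/2021-01-01"], processes=4)
            print(f"Ingested {len(refs)} datasets")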

970 """ 

971 

972 refs = [] 

973 bad_files = [] 

974 n_exposures = 0 

975 n_exposures_failed = 0 

976 n_ingests_failed = 0 

977 if group_files: 

978 for group in ButlerURI.findFileResources(files, file_filter, group_files): 

979 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(group, pool=pool, 

980 processes=processes, 

981 run=run) 

982 refs.extend(new_refs) 

983 bad_files.extend(bad) 

984 n_exposures += n_exp 

985 n_exposures_failed += n_exp_fail 

986 n_ingests_failed += n_ingest_fail 

987 else: 

988 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

989 ButlerURI.findFileResources(files, file_filter, group_files), 

990 pool=pool, 

991 processes=processes, 

992 run=run, 

993 ) 

994 

995 had_failure = False 

996 

997 if bad_files: 

998 had_failure = True 

999 self.log.warning("Could not extract observation metadata from the following:") 

1000 for f in bad_files: 

1001 self.log.warning("- %s", f) 

1002 

1003 self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1004 " registration and %d failure%s from file ingest.", 

1005 *_log_msg_counter(n_exposures), 

1006 *_log_msg_counter(n_exposures_failed), 

1007 *_log_msg_counter(n_ingests_failed)) 

1008 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1009 had_failure = True 

1010 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1011 

1012 if had_failure: 

1013 raise RuntimeError("Some failures encountered during ingestion") 

1014 

1015 return refs