Coverage for python/lsst/obs/base/ingest.py: 16%

373 statements  

coverage.py v7.4.4, created at 2024-03-23 03:34 -0700

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import json 

26import re 

27from collections import defaultdict 

28from collections.abc import Callable, Iterable, Iterator, MutableMapping, Sized 

29from dataclasses import InitVar, dataclass 

30from multiprocessing import Pool 

31from typing import Any, ClassVar 

32 

33from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers 

34from astro_metadata_translator.indexing import process_index_data, process_sidecar_data 

35from lsst.afw.fits import readMetadata 

36from lsst.daf.butler import ( 

37 Butler, 

38 CollectionType, 

39 DataCoordinate, 

40 DatasetIdGenEnum, 

41 DatasetRef, 

42 DatasetType, 

43 DimensionRecord, 

44 DimensionUniverse, 

45 FileDataset, 

46 Formatter, 

47 Progress, 

48 Timespan, 

49) 

50from lsst.pex.config import ChoiceField, Config, Field 

51from lsst.pipe.base import Instrument, Task 

52from lsst.resources import ResourcePath, ResourcePathExpression 

53from lsst.utils.timer import timeMethod 

54 

55from ._instrument import makeExposureRecordFromObsInfo 

56 

57# multiprocessing.Pool is actually a function, not a type, and the real type 

58# isn't exposed, so we can't use it in annotations; we'll just punt on it via 

59# this alias instead. 

60PoolType = Any 

61 

62 

63def _do_nothing(*args: Any, **kwargs: Any) -> None: 

64 """Do nothing. 

65 

66 This is a function that accepts anything and does nothing. 

67 For use as a default in callback arguments. 

68 """ 

69 pass 

70 

71 

72def _log_msg_counter(noun: int | Sized) -> tuple[int, str]: 

73 """Count the iterable and return the count and plural modifier. 

74 

75 Parameters 

76 ---------- 

77 noun : `Sized` or `int` 

78 Thing to count. If given an integer it is assumed to be the count 

79 to use to calculate the modifier. 

80 

81 Returns 

82 ------- 

83 num : `int` 

84 Number of items found in ``noun``. 

85 modifier : `str` 

86 Character to add to the end of a string referring to these items 

87 to indicate whether it was a single item or not. Returns empty 

88 string if there is one item or "s" otherwise. 

89 

90 Examples 

91 -------- 

92 .. code-block:: python 

93 

94 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

95 """ 

96 if isinstance(noun, int): 

97 num = noun 

98 else: 

99 num = len(noun) 

100 return num, "" if num == 1 else "s" 

101 

102 

103@dataclass 

104class RawFileDatasetInfo: 

105 """Information about a single dataset within a raw file.""" 

106 

107 dataId: DataCoordinate 

108 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

109 

110 obsInfo: ObservationInfo 

111 """Standardized observation metadata extracted directly from the file 

112 headers (`astro_metadata_translator.ObservationInfo`). 

113 """ 

114 

115 

116@dataclass 

117class RawFileData: 

118 """Information about a single raw file, used during ingest.""" 

119 

120 datasets: list[RawFileDatasetInfo] 

121 """The information describing each dataset within this raw file. 

122 (`list` of `RawFileDatasetInfo`) 

123 """ 

124 

125 filename: ResourcePath 

126 """URI of the file this information was extracted from (`str`). 

127 

128 This is the path prior to ingest, not the path after ingest. 

129 """ 

130 

131 FormatterClass: type[Formatter] 

132 """Formatter class that should be used to ingest this file (`type`; as 

133 subclass of `~lsst.daf.butler.Formatter`). 

134 """ 

135 

136 instrument: Instrument | None 

137 """The `Instrument` instance associated with this file. Can be `None` 

138 if ``datasets`` is an empty list.""" 

139 

140 

141@dataclass 

142class RawExposureData: 

143 """Information about a complete raw exposure, used during ingest.""" 

144 

145 dataId: DataCoordinate 

146 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

147 """ 

148 

149 files: list[RawFileData] 

150 """List of structures containing file-level information. 

151 """ 

152 

153 universe: InitVar[DimensionUniverse] 

154 """Set of all known dimensions. 

155 """ 

156 

157 record: DimensionRecord 

158 """The exposure `DimensionRecord` that must be inserted into the 

159 `~lsst.daf.butler.Registry` prior to file-level ingest 

160 (`~lsst.daf.butler.DimensionRecord`). 

161 """ 

162 

163 dependencyRecords: dict[str, DimensionRecord] 

164 """Additional records that must be inserted into the 

165 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record`` 

166 (e.g., to satisfy foreign key constraints), indexed by the dimension name. 

167 """ 

168 

169 

170def makeTransferChoiceField( 

171 doc: str = "How to transfer files (None for no transfer).", default: str = "auto" 

172) -> ChoiceField: 

173 """Create a Config field with options for transferring data between repos. 

174 

175 The allowed options for the field are exactly those supported by 

176 `lsst.daf.butler.Datastore.ingest`. 

177 

178 Parameters 

179 ---------- 

180 doc : `str` 

181 Documentation for the configuration field. 

182 default : `str`, optional 

183 Default transfer mode for the field. 

184 

185 Returns 

186 ------- 

187 field : `lsst.pex.config.ChoiceField` 

188 Configuration field. 
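
Examples
--------
A minimal sketch of how this helper is typically used inside a config
class; ``ExampleIngestConfig`` is a hypothetical class shown only for
illustration (`RawIngestConfig` below uses the same pattern).

.. code-block:: python

    class ExampleIngestConfig(Config):
        transfer = makeTransferChoiceField(default="symlink")

    config = ExampleIngestConfig()
    assert config.transfer == "symlink"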

189 """ 

190 return ChoiceField( 

191 doc=doc, 

192 dtype=str, 

193 allowed={ 

194 "move": "move", 

195 "copy": "copy", 

196 "auto": "choice will depend on datastore", 

197 "direct": "use URI to ingested file directly in datastore", 

198 "link": "hard link falling back to symbolic link", 

199 "hardlink": "hard link", 

200 "symlink": "symbolic (soft) link", 

201 "relsymlink": "relative symbolic link", 

202 }, 

203 optional=True, 

204 default=default, 

205 ) 

206 

207 

208class RawIngestConfig(Config): 

209 """Configuration class for RawIngestTask.""" 

210 

211 transfer = makeTransferChoiceField() 

212 failFast: Field[bool] = Field( 

213 dtype=bool, 

214 default=False, 

215 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

216 "Otherwise problem files will be skipped and logged and a report issued at completion.", 

217 ) 

218 

219 

220class RawIngestTask(Task): 

221 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

222 

223 Parameters 

224 ---------- 

225 config : `RawIngestConfig` 

226 Configuration for the task. 

227 butler : `~lsst.daf.butler.Butler` 

228 Writeable butler instance, with ``butler.run`` set to the appropriate 

229 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

230 datasets. 

231 on_success : `Callable`, optional 

232 A callback invoked when all of the raws associated with an exposure 

233 are ingested. Will be passed a list of `FileDataset` objects, each 

234 containing one or more resolved `DatasetRef` objects. If this callback 

235 raises it will interrupt the entire ingest process, even if 

236 `RawIngestConfig.failFast` is `False`. 

237 on_metadata_failure : `Callable`, optional 

238 A callback invoked when a failure occurs trying to translate the 

239 metadata for a file. Will be passed the URI and the exception, in 

240 that order, as positional arguments. Guaranteed to be called in an 

241 ``except`` block, allowing the callback to re-raise or replace (with 

242 ``raise ... from``) to override the task's usual error handling (before 

243 `RawIngestConfig.failFast` logic occurs). 

244 on_ingest_failure : `Callable`, optional 

245 A callback invoked when dimension record or dataset insertion into the 

246 database fails for an exposure. Will be passed a `RawExposureData` 

247 instance and the exception, in that order, as positional arguments. 

248 Guaranteed to be called in an ``except`` block, allowing the callback 

249 to re-raise or replace (with ``raise ... from``) to override the task's 

250 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

251 **kwargs 

252 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

253 constructor. 

254 

255 Notes 

256 ----- 

257 Each instance of `RawIngestTask` writes to the same Butler. Each 

258 invocation of `RawIngestTask.run` ingests a list of files. 
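
Examples
--------
A minimal invocation sketch; the repository path and the raw data
location are placeholders and a writeable butler is assumed.

.. code-block:: python

    butler = Butler("/path/to/repo", writeable=True)
    task = RawIngestTask(config=RawIngestConfig(), butler=butler)
    refs = task.run(["/path/to/raw/data/"])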

259 """ 

260 

261 ConfigClass: ClassVar[type[Config]] = RawIngestConfig 

262 

263 _DefaultName: ClassVar[str] = "ingest" 

264 

265 def getDatasetType(self) -> DatasetType: 

266 """Return the default DatasetType of the datasets ingested by this 

267 Task. 

268 

269 Returns 

270 ------- 

271 datasetType : `DatasetType` 

272 The default dataset type to use for the data being ingested. This 

273 is only used if the relevant `~lsst.pipe.base.Instrument` does not 

274 define an override. 

275 """ 

276 return DatasetType( 

277 "raw", 

278 ("instrument", "detector", "exposure"), 

279 "Exposure", 

280 universe=self.butler.dimensions, 

281 ) 

282 

283 # Mypy can not determine that the config passed to super() is this type. 

284 config: RawIngestConfig 

285 

286 def __init__( 

287 self, 

288 config: RawIngestConfig, 

289 *, 

290 butler: Butler, 

291 on_success: Callable[[list[FileDataset]], Any] = _do_nothing, 

292 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing, 

293 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

294 **kwargs: Any, 

295 ): 

296 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

297 super().__init__(config, **kwargs) 

298 self.butler = butler 

299 self.universe = self.butler.dimensions 

300 self.datasetType = self.getDatasetType() 

301 self._on_success = on_success 

302 self._on_metadata_failure = on_metadata_failure 

303 self._on_ingest_failure = on_ingest_failure 

304 self.progress = Progress("obs.base.RawIngestTask") 

305 

306 # Import all the instrument classes so that we ensure that we 

307 # have all the relevant metadata translators loaded. 

308 Instrument.importAll(self.butler.registry) 

309 

310 # Read all the instrument records into a cache since they will be 

311 # needed later to calculate day_obs timespans, if appropriate. 

312 self._instrument_records = { 

313 rec.name: rec for rec in butler.registry.queryDimensionRecords("instrument") 

314 } 

315 

316 def _reduce_kwargs(self) -> dict[str, Any]: 

317 # Add extra parameters to pickle. 

318 return dict( 

319 **super()._reduce_kwargs(), 

320 butler=self.butler, 

321 on_success=self._on_success, 

322 on_metadata_failure=self._on_metadata_failure, 

323 on_ingest_failure=self._on_ingest_failure, 

324 ) 

325 

326 def _determine_instrument_formatter( 

327 self, dataId: DataCoordinate, filename: ResourcePath 

328 ) -> tuple[Instrument | None, type[Formatter]]: 

329 """Determine the instrument and formatter class. 

330 

331 Parameters 

332 ---------- 

333 dataId : `lsst.daf.butler.DataCoordinate` 

334 The dataId associated with this dataset. 

335 filename : `lsst.resources.ResourcePath` 

336 URI of file used for error reporting. 

337 

338 Returns 

339 ------- 

340 instrument : `Instrument` or `None` 

341 Instance of the `Instrument` associated with this dataset. `None` 

342 indicates that the instrument could not be determined. 

343 formatterClass : `type` 

344 Class to be used as the formatter for this dataset. 

345 """ 

346 # The data model currently assumes that whilst multiple datasets 

347 # can be associated with a single file, they must all share the 

348 # same formatter. 

349 try: 

350 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore 

351 except LookupError as e: 

352 self._on_metadata_failure(filename, e) 

353 self.log.warning( 

354 "Instrument %s for file %s not known to registry", dataId["instrument"], filename 

355 ) 

356 if self.config.failFast: 

357 raise RuntimeError( 

358 f"Instrument {dataId['instrument']} for file {filename} not known to registry" 

359 ) from e 

360 FormatterClass = Formatter 

361 # Indicate that we could not work out the instrument. 

362 instrument = None 

363 else: 

364 assert instrument is not None, "Should be guaranteed by fromName succeeding." 

365 FormatterClass = instrument.getRawFormatter(dataId) 

366 return instrument, FormatterClass 

367 

368 def extractMetadata(self, filename: ResourcePath) -> RawFileData: 

369 """Extract and process metadata from a single raw file. 

370 

371 Parameters 

372 ---------- 

373 filename : `lsst.resources.ResourcePath` 

374 URI to the file. 

375 

376 Returns 

377 ------- 

378 data : `RawFileData` 

379 A structure containing the metadata extracted from the file, 

380 as well as the original filename. All fields will be populated, 

381 but the data IDs in `RawFileData.datasets` will be minimal 

382 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. The 

383 ``instrument`` field will be `None` if there is a problem 

384 with metadata extraction. 

385 

386 Notes 

387 ----- 

388 Assumes that there is a single dataset associated with the given 

389 file. Instruments using a single file to store multiple datasets 

390 must implement their own version of this method. 

391 

392 By default the method will catch all exceptions unless the ``failFast`` 

393 configuration item is `True`. If an error is encountered the 

394 `_on_metadata_failure()` callback will be called. If no exception 

395 is raised but an error was encountered, the returned object will have 

396 a `None` instrument and no datasets. 

397 

398 This method supports sidecar JSON files which can be used to 

399 extract metadata without having to read the data file itself. 

400 The sidecar file is always used if found. 
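
Examples
--------
A sketch of reading metadata for a single file; the path is a
placeholder.

.. code-block:: python

    data = task.extractMetadata(ResourcePath("/path/to/raw.fits"))
    if data.datasets:
        print(data.datasets[0].dataId)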

401 """ 

402 sidecar_fail_msg = "" # Requires prepended space when set. 

403 try: 

404 sidecar_file = filename.updatedExtension(".json") 

405 if sidecar_file.exists(): 

406 content = json.loads(sidecar_file.read()) 

407 headers = [process_sidecar_data(content)] 

408 sidecar_fail_msg = " (via sidecar)" 

409 else: 

410 # Read the metadata from the data file itself. 

411 

412 # For remote files download the entire file to get the 

413 # header. This is very inefficient and it would be better 

414 # to have some way of knowing where in the file the headers 

415 # are and to only download those parts of the file. 

416 with filename.as_local() as local_file: 

417 # Read the primary. This might be sufficient. 

418 header = readMetadata(local_file.ospath, 0) 

419 translator_class = None 

420 

421 try: 

422 # Try to work out a translator class early. 

423 translator_class = MetadataTranslator.determine_translator( 

424 header, filename=str(filename) 

425 ) 

426 except ValueError: 

427 # Primary header was not sufficient (maybe this file 

428 # has been compressed or is a MEF with minimal 

429 # primary). Read second header and merge with primary. 

430 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite") 

431 

432 # Try again to work out a translator class, letting this 

433 # fail. 

434 if translator_class is None: 

435 translator_class = MetadataTranslator.determine_translator( 

436 header, filename=str(filename) 

437 ) 

438 

439 # Request the headers to use for ingest 

440 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header)) 

441 

442 # Add each header to the dataset list 

443 datasets = [self._calculate_dataset_info(h, filename) for h in headers] 

444 

445 except Exception as e: 

446 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

447 # Indicate to the caller that we failed to read. 

448 datasets = [] 

449 formatterClass = Formatter 

450 instrument = None 

451 self._on_metadata_failure(filename, e) 

452 if self.config.failFast: 

453 raise RuntimeError( 

454 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}" 

455 ) from e 

456 else: 

457 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

458 # The data model currently assumes that whilst multiple datasets 

459 # can be associated with a single file, they must all share the 

460 # same formatter. 

461 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

462 if instrument is None: 

463 datasets = [] 

464 

465 return RawFileData( 

466 datasets=datasets, 

467 filename=filename, 

468 # MyPy wants this to be a non-abstract class, which is not true 

469 # for the error case where instrument is None and datasets=[]. 

470 FormatterClass=formatterClass, # type: ignore 

471 instrument=instrument, 

472 ) 

473 

474 @classmethod 

475 def getObservationInfoSubsets(cls) -> tuple[set, set]: 

476 """Return subsets of fields in the `ObservationInfo` that we care 

477 about. 

478 

479 These fields will be used in constructing an exposure record. 

480 

481 Returns 

482 ------- 

483 required : `set` 

484 Set of `ObservationInfo` field names that are required. 

485 optional : `set` 

486 Set of `ObservationInfo` field names we will use if they are 

487 available. 
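
Examples
--------
A sketch of a subclass extending the optional subset; the subclass and
the extra property name are hypothetical.

.. code-block:: python

    class ExampleRawIngestTask(RawIngestTask):
        @classmethod
        def getObservationInfoSubsets(cls):
            required, optional = super().getObservationInfoSubsets()
            optional = optional | {"focus_z"}
            return required, optional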

488 """ 

489 # Marking the new properties "group_counter_*" and 

490 # "has_simulated_content" as required, assumes that we either 

491 # recreate any existing index/sidecar files that include translated 

492 # values, or else allow astro_metadata_translator to fill in 

493 # defaults. 

494 required = { 

495 "datetime_begin", 

496 "datetime_end", 

497 "detector_num", 

498 "exposure_group", 

499 "exposure_id", 

500 "exposure_time", 

501 "group_counter_end", 

502 "group_counter_start", 

503 "has_simulated_content", 

504 "instrument", 

505 "observation_id", 

506 "observation_type", 

507 "observing_day", 

508 "physical_filter", 

509 } 

510 optional = { 

511 "altaz_begin", 

512 "boresight_rotation_coord", 

513 "boresight_rotation_angle", 

514 "dark_time", 

515 "tracking_radec", 

516 "object", 

517 "observation_counter", 

518 "observation_reason", 

519 "observing_day_offset", 

520 "science_program", 

521 "visit_id", 

522 } 

523 return required, optional 

524 

525 def _calculate_dataset_info( 

526 self, header: MutableMapping[str, Any] | ObservationInfo, filename: ResourcePath 

527 ) -> RawFileDatasetInfo: 

528 """Calculate a RawFileDatasetInfo from the supplied information. 

529 

530 Parameters 

531 ---------- 

532 header : Mapping or `astro_metadata_translator.ObservationInfo` 

533 Header from the dataset or previously-translated content. 

534 filename : `lsst.resources.ResourcePath` 

535 Filename to use for error messages. 

536 

537 Returns 

538 ------- 

539 dataset : `RawFileDatasetInfo` 

540 The dataId, and observation information associated with this 

541 dataset. 

542 """ 

543 required, optional = self.getObservationInfoSubsets() 

544 if isinstance(header, ObservationInfo): 

545 obsInfo = header 

546 missing = [] 

547 # Need to check the required properties are present. 

548 for property in required: 

549 # getattr does not need to be protected because it is using 

550 # the defined list above containing properties that must exist. 

551 value = getattr(obsInfo, property) 

552 if value is None: 

553 missing.append(property) 

554 if missing: 

555 raise ValueError( 

556 f"Requested required properties are missing from file {filename}: {missing} (via JSON)" 

557 ) 

558 

559 else: 

560 obsInfo = ObservationInfo( 

561 header, 

562 pedantic=False, 

563 filename=str(filename), 

564 required=required, 

565 subset=required | optional, 

566 ) 

567 

568 dataId = DataCoordinate.standardize( 

569 instrument=obsInfo.instrument, 

570 exposure=obsInfo.exposure_id, 

571 detector=obsInfo.detector_num, 

572 universe=self.universe, 

573 ) 

574 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

575 

576 def locateAndReadIndexFiles( 

577 self, files: Iterable[ResourcePath] 

578 ) -> tuple[dict[ResourcePath, Any], list[ResourcePath], set[ResourcePath], set[ResourcePath]]: 

579 """Given a list of files, look for index files and read them. 

580 

581 Index files can either be explicitly in the list of files to 

582 ingest, or else located in the same directory as a file to ingest. 

583 Index entries are always used if present. 

584 

585 Parameters 

586 ---------- 

587 files : iterable over `lsst.resources.ResourcePath` 

588 URIs to the files to be ingested. 

589 

590 Returns 

591 ------- 

592 index : `dict` [`ResourcePath`, Any] 

593 Merged contents of all relevant index files found. These can 

594 be explicitly specified index files or ones found in the 

595 directory alongside a data file to be ingested. 

596 updated_files : `list` of `ResourcePath` 

597 Updated list of the input files with entries removed that were 

598 found listed in an index file. Order is not guaranteed to 

599 match the order of the files given to this routine. 

600 good_index_files : `set` [`ResourcePath`] 

601 Index files that were successfully read. 

602 bad_index_files : `set` [`ResourcePath`] 

603 Files that looked like index files but failed to read properly. 
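
Examples
--------
A sketch assuming a placeholder data directory containing an
``_index.json`` file alongside the raw files.

.. code-block:: python

    index, remaining, good, bad = task.locateAndReadIndexFiles(
        [ResourcePath("/path/to/data/raw_0001.fits")]
    )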

604 """ 

605 # Convert the paths to absolute for easy comparison with index content. 

606 # Do not convert to real paths since we have to assume that index 

607 # files are in this location and not the location which it links to. 

608 files = tuple(f.abspath() for f in files) 

609 

610 # Index files must be named this. 

611 index_root_file = "_index.json" 

612 

613 # Group the files by directory. 

614 files_by_directory = defaultdict(set) 

615 

616 for path in files: 

617 directory, file_in_dir = path.split() 

618 files_by_directory[directory].add(file_in_dir) 

619 

620 # All the metadata read from index files with keys of full path. 

621 index_entries: dict[ResourcePath, Any] = {} 

622 

623 # Index files we failed to read. 

624 bad_index_files = set() 

625 

626 # Any good index files that were found and used. 

627 good_index_files = set() 

628 

629 # Look for index files in those directories. 

630 for directory, files_in_directory in files_by_directory.items(): 

631 possible_index_file = directory.join(index_root_file) 

632 if possible_index_file.exists(): 

633 # If we are explicitly requesting an index file the 

634 # messages should be different. 

635 index_msg = "inferred" 

636 is_implied = True 

637 if index_root_file in files_in_directory: 

638 index_msg = "explicit" 

639 is_implied = False 

640 

641 # Try to read the index file and catch and report any 

642 # problems. 

643 try: 

644 content = json.loads(possible_index_file.read()) 

645 index = process_index_data(content, force_dict=True) 

646 # mypy should in theory know that this is a mapping 

647 # from the overload type annotation of process_index_data. 

648 assert isinstance(index, MutableMapping) 

649 except Exception as e: 

650 # Only trigger the callback if the index file 

651 # was asked for explicitly. Triggering on an implied file 

652 # might be surprising. 

653 if not is_implied: 

654 self._on_metadata_failure(possible_index_file, e) 

655 if self.config.failFast: 

656 raise RuntimeError( 

657 f"Problem reading index file from {index_msg} location {possible_index_file}" 

658 ) from e 

659 bad_index_files.add(possible_index_file) 

660 continue 

661 

662 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

663 good_index_files.add(possible_index_file) 

664 

665 # Go through the index adding entries for files. 

666 # If we have non-index files in this directory marked for 

667 # ingest we should only get index information for those. 

668 # If the index file was explicit we use all entries. 

669 if is_implied: 

670 files_to_ingest = files_in_directory 

671 else: 

672 files_to_ingest = set(index) 

673 

674 # Copy relevant metadata into a single dict for all index 

675 # entries. 

676 for file_in_dir in files_to_ingest: 

677 # Skip an explicitly specified index file. 

678 # This should never happen because an explicit index 

679 # file will force ingest of all files in the index 

680 # and not use the explicit file list. If somehow 

681 # this is not true we continue. Raising an exception 

682 # seems like the wrong thing to do since this is harmless. 

683 if file_in_dir == index_root_file: 

684 self.log.info( 

685 "Logic error found scanning directory %s. Please file ticket.", directory 

686 ) 

687 continue 

688 if file_in_dir in index: 

689 file = directory.join(file_in_dir) 

690 if file in index_entries: 

691 # ObservationInfo overrides raw metadata 

692 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance( 

693 index_entries[file], ObservationInfo 

694 ): 

695 self.log.warning( 

696 "File %s already specified in an index file but overriding" 

697 " with ObservationInfo content from %s", 

698 file, 

699 possible_index_file, 

700 ) 

701 else: 

702 self.log.warning( 

703 "File %s already specified in an index file, ignoring content from %s", 

704 file, 

705 possible_index_file, 

706 ) 

707 # Do nothing in this case 

708 continue 

709 

710 index_entries[file] = index[file_in_dir] 

711 

712 # Remove files from list that have index entries and also 

713 # any files that we determined to be explicit index files 

714 # or any index files that we failed to read. 

715 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

716 

717 # The filtered list loses the initial order. Retaining the order 

718 # is good for testing but does have a cost if there are many 

719 # files when copying the good values out. A dict would have faster 

720 # lookups (using the files as keys) but use more memory. 

721 ordered = [f for f in filtered if f in files] 

722 

723 return index_entries, ordered, good_index_files, bad_index_files 

724 

725 def processIndexEntries(self, index_entries: dict[ResourcePath, Any]) -> list[RawFileData]: 

726 """Convert index entries to RawFileData. 

727 

728 Parameters 

729 ---------- 

730 index_entries : `dict` [`ResourcePath`, Any] 

731 Dict indexed by name of file to ingest and with values either 

732 raw metadata or translated 

733 `~astro_metadata_translator.ObservationInfo`. 

734 

735 Returns 

736 ------- 

737 data : `list` [ `RawFileData` ] 

738 Structures containing the metadata extracted from the file, 

739 as well as the original filename. All fields will be populated, 

740 but the data IDs in `RawFileData.datasets` will be minimal 

741 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. 

742 """ 

743 fileData = [] 

744 for filename, metadata in index_entries.items(): 

745 try: 

746 datasets = [self._calculate_dataset_info(metadata, filename)] 

747 except Exception as e: 

748 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e) 

749 datasets = [] 

750 formatterClass = Formatter 

751 instrument = None 

752 self._on_metadata_failure(filename, e) 

753 if self.config.failFast: 

754 raise RuntimeError( 

755 f"Problem extracting metadata for file {filename} found in index file" 

756 ) from e 

757 else: 

758 instrument, formatterClass = self._determine_instrument_formatter( 

759 datasets[0].dataId, filename 

760 ) 

761 if instrument is None: 

762 datasets = [] 

763 fileData.append( 

764 RawFileData( 

765 datasets=datasets, 

766 filename=filename, 

767 # MyPy wants this to be a non-abstract class, which is not 

768 # true for the error case where instrument is None and 

769 # datasets=[]. 

770 FormatterClass=formatterClass, # type: ignore 

771 instrument=instrument, 

772 ) 

773 ) 

774 return fileData 

775 

776 def groupByExposure(self, files: Iterable[RawFileData]) -> list[RawExposureData]: 

777 """Group an iterable of `RawFileData` by exposure. 

778 

779 Parameters 

780 ---------- 

781 files : iterable of `RawFileData` 

782 File-level information to group. 

783 

784 Returns 

785 ------- 

786 exposures : `list` of `RawExposureData` 

787 A list of structures that group the file-level information by 

788 exposure. All fields will be populated. The 

789 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

790 `~lsst.daf.butler.DataCoordinate` instances. 

791 """ 

792 exposureDimensions = self.universe["exposure"].graph 

793 byExposure = defaultdict(list) 

794 for f in files: 

795 # Assume that the first dataset is representative for the file. 

796 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

797 

798 return [ 

799 RawExposureData( 

800 dataId=dataId, 

801 files=exposureFiles, 

802 universe=self.universe, 

803 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe), 

804 dependencyRecords=self.makeDependencyRecords( 

805 exposureFiles[0].datasets[0].obsInfo, self.universe 

806 ), 

807 ) 

808 for dataId, exposureFiles in byExposure.items() 

809 ] 

810 

811 def makeExposureRecord( 

812 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any 

813 ) -> DimensionRecord: 

814 """Construct a registry record for an exposure. 

815 

816 This is a method that subclasses will often want to customize. This can 

817 often be done by calling this base class implementation with additional 

818 ``kwargs``. 

819 

820 Parameters 

821 ---------- 

822 obsInfo : `ObservationInfo` 

823 Observation details for (one of the components of) the exposure. 

824 universe : `DimensionUniverse` 

825 Set of all known dimensions. 

826 **kwargs 

827 Additional field values for this record. 

828 

829 Returns 

830 ------- 

831 record : `DimensionRecord` 

832 The exposure record that must be inserted into the 

833 `~lsst.daf.butler.Registry` prior to file-level ingest. 
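
Examples
--------
A sketch of the customization pattern described above; the subclass
and the extra record field are hypothetical.

.. code-block:: python

    class ExampleRawIngestTask(RawIngestTask):
        def makeExposureRecord(self, obsInfo, universe, **kwargs):
            return super().makeExposureRecord(
                obsInfo, universe, extra_field="value", **kwargs
            )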

834 """ 

835 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs) 

836 

837 def makeDependencyRecords( 

838 self, obsInfo: ObservationInfo, universe: DimensionUniverse 

839 ) -> dict[str, DimensionRecord]: 

840 """Construct dependency records. 

841 

842 These dependency records will be inserted into the 

843 `~lsst.daf.butler.Registry` before the exposure records, because they 

844 are dependencies of the exposure. This allows an opportunity to satisfy 

845 foreign key constraints that exist because of dimensions related to the 

846 exposure. 

847 

848 This is a method that subclasses may want to customize, if they've 

849 added dimensions that relate to an exposure. 

850 

851 Parameters 

852 ---------- 

853 obsInfo : `ObservationInfo` 

854 Observation details for (one of the components of) the exposure. 

855 universe : `DimensionUniverse` 

856 Set of all known dimensions. 

857 

858 Returns 

859 ------- 

860 records : `dict` [`str`, `DimensionRecord`] 

861 The records to insert, indexed by dimension name. 
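
Examples
--------
A sketch of a subclass adding a record for a hypothetical dimension
(``example_dim``) that it has related to exposures.

.. code-block:: python

    class ExampleRawIngestTask(RawIngestTask):
        def makeDependencyRecords(self, obsInfo, universe):
            records = super().makeDependencyRecords(obsInfo, universe)
            if "example_dim" in universe:
                records["example_dim"] = universe["example_dim"].RecordClass(
                    instrument=obsInfo.instrument
                )
            return records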

862 """ 

863 records: dict[str, DimensionRecord] = {} 

864 if "exposure" not in universe: 

865 return records 

866 exposure = universe["exposure"] 

867 if "group" in exposure.implied: 

868 records["group"] = universe["group"].RecordClass( 

869 name=obsInfo.exposure_group, 

870 instrument=obsInfo.instrument, 

871 ) 

872 if "day_obs" in exposure.implied: 

873 if (offset := getattr(obsInfo, "observing_day_offset")) is not None: 

874 offset_int = round(offset.to_value("s")) 

875 timespan = Timespan.from_day_obs(obsInfo.observing_day, offset_int) 

876 else: 

877 timespan = None 

878 records["day_obs"] = universe["day_obs"].RecordClass( 

879 instrument=obsInfo.instrument, 

880 id=obsInfo.observing_day, 

881 timespan=timespan, 

882 ) 

883 return records 

884 

885 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

886 """Expand the data IDs associated with a raw exposure. 

887 

888 This adds the metadata records. 

889 

890 Parameters 

891 ---------- 

892 data : `RawExposureData` 

893 A structure containing information about the exposure to be 

894 ingested. Must have `RawExposureData.record` populated. Should 

895 be considered consumed upon return. 

896 

897 Returns 

898 ------- 

899 exposure : `RawExposureData` 

900 An updated version of the input structure, with 

901 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

902 updated to data IDs for which 

903 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

904 """ 

905 # We start by expanding the exposure-level data ID; we won't use that 

906 # directly in file ingest, but this lets us do some database lookups 

907 # once per exposure instead of once per file later. 

908 data.dataId = self.butler.registry.expandDataId( 

909 data.dataId, 

910 # We pass in the records we'll be inserting shortly so they aren't 

911 # looked up from the database. We do expect instrument and filter 

912 # records to be retrieved from the database here (though the 

913 # Registry may cache them so there isn't a lookup every time). 

914 records={"exposure": data.record}, 

915 ) 

916 # Now we expand the per-file (exposure+detector) data IDs. This time 

917 # we pass in the records we just retrieved from the exposure data ID 

918 # expansion. 

919 for file in data.files: 

920 for dataset in file.datasets: 

921 dataset.dataId = self.butler.registry.expandDataId( 

922 dataset.dataId, 

923 records={k: data.dataId.records[k] for k in data.dataId.dimensions.elements}, 

924 ) 

925 return data 

926 

927 def prep( 

928 self, files: Iterable[ResourcePath], *, pool: PoolType | None = None 

929 ) -> tuple[Iterator[RawExposureData], list[ResourcePath]]: 

930 """Perform all non-database-updating ingest preprocessing steps. 

931 

932 Parameters 

933 ---------- 

934 files : iterable over `lsst.resources.ResourcePath` 

935 Paths to the files to be ingested. Will be made absolute 

936 if they are not already. 

937 pool : `multiprocessing.Pool`, optional 

938 If not `None`, a process pool with which to parallelize some 

939 operations. 

940 

941 Returns 

942 ------- 

943 exposures : `Iterator` [ `RawExposureData` ] 

944 Data structures containing dimension records, filenames, and data 

945 IDs to be ingested (one structure for each exposure). 

946 bad_files : `list` of `lsst.resources.ResourcePath` 

947 List of all the files from which metadata could not be extracted. 

948 """ 

949 mapFunc = map if pool is None else pool.imap_unordered 

950 

951 def _partition_good_bad( 

952 file_data: Iterable[RawFileData], 

953 ) -> tuple[list[RawFileData], list[ResourcePath]]: 

954 """Filter out bad files and return good with list of bad.""" 

955 good_files = [] 

956 bad_files = [] 

957 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"): 

958 if not fileDatum.datasets: 

959 bad_files.append(fileDatum.filename) 

960 else: 

961 good_files.append(fileDatum) 

962 return good_files, bad_files 

963 

964 # Look for index files and read them. 

965 # There should be far fewer index files than data files. 

966 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

967 if bad_index_files: 

968 self.log.info("Failed to read the following explicitly requested index files:") 

969 for bad in sorted(bad_index_files): 

970 self.log.info("- %s", bad) 

971 

972 # Now convert all the index file entries to standard form for ingest. 

973 processed_bad_index_files: list[ResourcePath] = [] 

974 indexFileData = self.processIndexEntries(index_entries) 

975 if indexFileData: 

976 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData) 

977 self.log.info( 

978 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s", 

979 *_log_msg_counter(indexFileData), 

980 *_log_msg_counter(good_index_files), 

981 *_log_msg_counter(processed_bad_index_files), 

982 ) 

983 

984 # Extract metadata and build per-detector regions. 

985 # This could run in a subprocess so collect all output 

986 # before looking at failures. 

987 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

988 

989 # Filter out all the failed reads and store them for later 

990 # reporting. 

991 good_file_data, bad_files = _partition_good_bad(fileData) 

992 self.log.info( 

993 "Successfully extracted metadata from %d file%s with %d failure%s", 

994 *_log_msg_counter(good_file_data), 

995 *_log_msg_counter(bad_files), 

996 ) 

997 

998 # Combine with data from index files. 

999 good_file_data.extend(indexFileData) 

1000 bad_files.extend(processed_bad_index_files) 

1001 bad_files.extend(bad_index_files) 

1002 

1003 # Use that metadata to group files (and extracted metadata) by 

1004 # exposure. Never parallelized because it's intrinsically a gather 

1005 # step. 

1006 exposureData: list[RawExposureData] = self.groupByExposure(good_file_data) 

1007 

1008 # The next operation operates on RawExposureData instances (one at 

1009 # a time) in-place and then returns the modified instance. We call it 

1010 # as a pass-through instead of relying on the arguments we pass in to 

1011 # have been modified because in the parallel case those arguments are 

1012 # going to be pickled and unpickled, and I'm not certain 

1013 # multiprocessing is careful enough with that for output arguments to 

1014 # work. 

1015 

1016 # Expand the data IDs to include all dimension metadata; we need this 

1017 # because we may need to generate path templates that rely on that 

1018 # metadata. 

1019 # This is the first step that involves actual database calls (but just 

1020 # SELECTs), so if there's going to be a problem with connections vs. 

1021 # multiple processes, or lock contention (in SQLite) slowing things 

1022 # down, it'll happen here. 

1023 return mapFunc(self.expandDataIds, exposureData), bad_files 

1024 

1025 def ingestExposureDatasets( 

1026 self, 

1027 exposure: RawExposureData, 

1028 datasetType: DatasetType, 

1029 *, 

1030 run: str, 

1031 skip_existing_exposures: bool = False, 

1032 track_file_attrs: bool = True, 

1033 ) -> list[FileDataset]: 

1034 """Ingest all raw files in one exposure. 

1035 

1036 Parameters 

1037 ---------- 

1038 exposure : `RawExposureData` 

1039 A structure containing information about the exposure to be 

1040 ingested. Must have `RawExposureData.record` populated and all 

1041 data ID attributes expanded. 

1042 datasetType : `DatasetType` 

1043 The dataset type associated with this exposure. 

1044 run : `str` 

1045 Name of a RUN-type collection to write to. 

1046 skip_existing_exposures : `bool`, optional 

1047 If `True` (`False` is default), skip raws that have already been 

1048 ingested (i.e. raws for which we already have a dataset with the 

1049 same data ID in the target collection, even if from another file). 

1050 Note that this is much slower than just not passing 

1051 already-ingested files as inputs, because we still need to read and 

1052 process metadata to identify which exposures to search for. It 

1053 also will not work reliably if multiple processes are attempting to 

1054 ingest raws from the same exposure concurrently, in that different 

1055 processes may still attempt to ingest the same raw and conflict, 

1056 causing a failure that prevents other raws from the same exposure 

1057 from being ingested. 

1058 track_file_attrs : `bool`, optional 

1059 Control whether file attributes such as the size or checksum should 

1060 be tracked by the datastore. Whether this parameter is honored 

1061 depends on the specific datastore implementation. 

1062 

1063 Returns 

1064 ------- 

1065 datasets : `list` of `lsst.daf.butler.FileDataset` 

1066 Per-file structures identifying the files ingested and their 

1067 dataset representation in the data repository. 

1068 """ 

1069 if skip_existing_exposures: 

1070 existing = { 

1071 ref.dataId 

1072 for ref in self.butler.registry.queryDatasets( 

1073 datasetType, 

1074 collections=[run], 

1075 dataId=exposure.dataId, 

1076 ) 

1077 } 

1078 else: 

1079 existing = set() 

1080 

1081 # Raw files are preferentially ingested using a UUID derived from 

1082 # the collection name and dataId. 

1083 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN): 

1084 mode = DatasetIdGenEnum.DATAID_TYPE_RUN 

1085 else: 

1086 mode = DatasetIdGenEnum.UNIQUE 

1087 

1088 datasets = [] 

1089 for file in exposure.files: 

1090 refs = [ 

1091 DatasetRef(datasetType, d.dataId, run=run, id_generation_mode=mode) 

1092 for d in file.datasets 

1093 if d.dataId not in existing 

1094 ] 

1095 if refs: 

1096 datasets.append( 

1097 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass) 

1098 ) 

1099 

1100 self.butler.ingest( 

1101 *datasets, 

1102 transfer=self.config.transfer, 

1103 record_validation_info=track_file_attrs, 

1104 ) 

1105 return datasets 

1106 

1107 def ingestFiles( 

1108 self, 

1109 files: Iterable[ResourcePath], 

1110 *, 

1111 pool: PoolType | None = None, 

1112 processes: int = 1, 

1113 run: str | None = None, 

1114 skip_existing_exposures: bool = False, 

1115 update_exposure_records: bool = False, 

1116 track_file_attrs: bool = True, 

1117 ) -> tuple[list[DatasetRef], list[ResourcePath], int, int, int]: 

1118 """Ingest files into a Butler data repository. 

1119 

1120 This creates any new exposure or visit Dimension entries needed to 

1121 identify the ingested files, creates new Dataset entries in the 

1122 Registry and finally ingests the files themselves into the Datastore. 

1123 Any needed instrument, detector, and physical_filter Dimension entries 

1124 must exist in the Registry before `run` is called. 

1125 

1126 Parameters 

1127 ---------- 

1128 files : iterable over `lsst.resources.ResourcePath` 

1129 URIs to the files to be ingested. 

1130 pool : `multiprocessing.Pool`, optional 

1131 If not `None`, a process pool with which to parallelize some 

1132 operations. 

1133 processes : `int`, optional 

1134 The number of processes to use. Ignored if ``pool`` is not `None`. 

1135 run : `str`, optional 

1136 Name of a RUN-type collection to write to, overriding 

1137 the default derived from the instrument name. 

1138 skip_existing_exposures : `bool`, optional 

1139 If `True` (`False` is default), skip raws that have already been 

1140 ingested (i.e. raws for which we already have a dataset with the 

1141 same data ID in the target collection, even if from another file). 

1142 Note that this is much slower than just not passing 

1143 already-ingested files as inputs, because we still need to read and 

1144 process metadata to identify which exposures to search for. It 

1145 also will not work reliably if multiple processes are attempting to 

1146 ingest raws from the same exposure concurrently, in that different 

1147 processes may still attempt to ingest the same raw and conflict, 

1148 causing a failure that prevents other raws from the same exposure 

1149 from being ingested. 

1150 update_exposure_records : `bool`, optional 

1151 If `True` (`False` is default), update existing exposure records 

1152 that conflict with the new ones instead of rejecting them. THIS IS 

1153 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1154 KNOWN TO BE BAD. This should usually be combined with 

1155 ``skip_existing_exposures=True``. 

1156 track_file_attrs : `bool`, optional 

1157 Control whether file attributes such as the size or checksum should 

1158 be tracked by the datastore. Whether this parameter is honored 

1159 depends on the specific datastore implementation. 

1160 

1161 Returns 

1162 ------- 

1163 refs : `list` of `lsst.daf.butler.DatasetRef` 

1164 Dataset references for ingested raws. 

1165 bad_files : `list` of `ResourcePath` 

1166 Given paths that could not be ingested. 

1167 n_exposures : `int` 

1168 Number of exposures successfully ingested. 

1169 n_exposures_failed : `int` 

1170 Number of exposures that failed when inserting dimension data. 

1171 n_ingests_failed : `int` 

1172 Number of exposures that failed when ingesting raw datasets. 

1173 """ 

1174 created_pool = False 

1175 if pool is None and processes > 1: 

1176 pool = Pool(processes) 

1177 created_pool = True 

1178 

1179 try: 

1180 exposureData, bad_files = self.prep(files, pool=pool) 

1181 finally: 

1182 if created_pool and pool: 

1183 # The pool is not needed any more so close it if we created 

1184 # it to ensure we clean up resources. 

1185 pool.close() 

1186 pool.join() 

1187 

1188 # Up to this point, we haven't modified the data repository at all. 

1189 # Now we finally do that, with one transaction per exposure. This is 

1190 # not parallelized at present because the performance of this step is 

1191 # limited by the database server. That may or may not change in the 

1192 # future once we increase our usage of bulk inserts and reduce our 

1193 # usage of savepoints; we've tried to get everything but the database 

1194 # operations done in advance to reduce the time spent inside 

1195 # transactions. 

1196 refs = [] 

1197 runs = set() 

1198 datasetTypes: dict[str, DatasetType] = {} 

1199 n_exposures = 0 

1200 n_exposures_failed = 0 

1201 n_ingests_failed = 0 

1202 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"): 

1203 assert exposure.record is not None, "Should be guaranteed by prep()" 

1204 self.log.debug( 

1205 "Attempting to ingest %d file%s from exposure %s:%s", 

1206 *_log_msg_counter(exposure.files), 

1207 exposure.record.instrument, 

1208 exposure.record.obs_id, 

1209 ) 

1210 

1211 try: 

1212 for name, record in exposure.dependencyRecords.items(): 

1213 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records) 

1214 inserted_or_updated = self.butler.registry.syncDimensionData( 

1215 "exposure", 

1216 exposure.record, 

1217 update=update_exposure_records, 

1218 ) 

1219 except Exception as e: 

1220 self._on_ingest_failure(exposure, e) 

1221 n_exposures_failed += 1 

1222 self.log.warning( 

1223 "Exposure %s:%s could not be registered: %s", 

1224 exposure.record.instrument, 

1225 exposure.record.obs_id, 

1226 e, 

1227 ) 

1228 if self.config.failFast: 

1229 raise e 

1230 continue 

1231 

1232 if isinstance(inserted_or_updated, dict): 

1233 # Exposure is in the registry and we updated it, so 

1234 # syncDimensionData returned a dict. 

1235 self.log.info( 

1236 "Exposure %s:%s was already present, but columns %s were updated.", 

1237 exposure.record.instrument, 

1238 exposure.record.obs_id, 

1239 str(list(inserted_or_updated.keys())), 

1240 ) 

1241 

1242 # Determine the instrument so we can work out the dataset type. 

1243 instrument = exposure.files[0].instrument 

1244 assert ( 

1245 instrument is not None 

1246 ), "file should have been removed from this list by prep if instrument could not be found" 

1247 

1248 if raw_definition := getattr(instrument, "raw_definition", None): 

1249 datasetTypeName, dimensions, storageClass = raw_definition 

1250 if not (datasetType := datasetTypes.get(datasetTypeName)): 

1251 datasetType = DatasetType( 

1252 datasetTypeName, dimensions, storageClass, universe=self.butler.dimensions 

1253 ) 

1254 else: 

1255 datasetType = self.datasetType 

1256 if datasetType.name not in datasetTypes: 

1257 self.butler.registry.registerDatasetType(datasetType) 

1258 datasetTypes[datasetType.name] = datasetType 

1259 

1260 # Override default run if nothing specified explicitly. 

1261 if run is None: 

1262 this_run = instrument.makeDefaultRawIngestRunName() 

1263 else: 

1264 this_run = run 

1265 if this_run not in runs: 

1266 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

1267 runs.add(this_run) 

1268 try: 

1269 datasets_for_exposure = self.ingestExposureDatasets( 

1270 exposure, 

1271 datasetType=datasetType, 

1272 run=this_run, 

1273 skip_existing_exposures=skip_existing_exposures, 

1274 track_file_attrs=track_file_attrs, 

1275 ) 

1276 except Exception as e: 

1277 self._on_ingest_failure(exposure, e) 

1278 n_ingests_failed += 1 

1279 self.log.warning("Failed to ingest the following for reason: %s", e) 

1280 for f in exposure.files: 

1281 self.log.warning("- %s", f.filename) 

1282 if self.config.failFast: 

1283 raise e 

1284 continue 

1285 else: 

1286 self._on_success(datasets_for_exposure) 

1287 for dataset in datasets_for_exposure: 

1288 refs.extend(dataset.refs) 

1289 

1290 # Success for this exposure. 

1291 n_exposures += 1 

1292 self.log.info( 

1293 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id 

1294 ) 

1295 

1296 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

1297 

1298 @timeMethod 

1299 def run( 

1300 self, 

1301 files: Iterable[ResourcePathExpression], 

1302 *, 

1303 pool: PoolType | None = None, 

1304 processes: int = 1, 

1305 run: str | None = None, 

1306 file_filter: str | re.Pattern = r"\.fit[s]?\b", 

1307 group_files: bool = True, 

1308 skip_existing_exposures: bool = False, 

1309 update_exposure_records: bool = False, 

1310 track_file_attrs: bool = True, 

1311 ) -> list[DatasetRef]: 

1312 """Ingest files into a Butler data repository. 

1313 

1314 This creates any new exposure or visit Dimension entries needed to 

1315 identify the ingested files, creates new Dataset entries in the 

1316 Registry and finally ingests the files themselves into the Datastore. 

1317 Any needed instrument, detector, and physical_filter Dimension entries 

1318 must exist in the Registry before `run` is called. 

1319 

1320 Parameters 

1321 ---------- 

1322 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like 

1323 Paths to the files to be ingested. Can refer to directories. 

1324 Will be made absolute if they are not already. 

1325 pool : `multiprocessing.Pool`, optional 

1326 If not `None`, a process pool with which to parallelize some 

1327 operations. 

1328 processes : `int`, optional 

1329 The number of processes to use. Ignored if ``pool`` is not `None`. 

1330 run : `str`, optional 

1331 Name of a RUN-type collection to write to, overriding 

1332 the default derived from the instrument name. 

1333 file_filter : `str` or `re.Pattern`, optional 

1334 Pattern to use to discover files to ingest within directories. 

1335 The default is to search for FITS files. The regex applies to 

1336 files within the directory. 

1337 group_files : `bool`, optional 

1338 Group files by directory if they have been discovered in 

1339 directories. Will not affect files explicitly provided. 

1340 skip_existing_exposures : `bool`, optional 

1341 If `True` (`False` is default), skip raws that have already been 

1342 ingested (i.e. raws for which we already have a dataset with the 

1343 same data ID in the target collection, even if from another file). 

1344 Note that this is much slower than just not passing 

1345 already-ingested files as inputs, because we still need to read and 

1346 process metadata to identify which exposures to search for. It 

1347 also will not work reliably if multiple processes are attempting to 

1348 ingest raws from the same exposure concurrently, in that different 

1349 processes may still attempt to ingest the same raw and conflict, 

1350 causing a failure that prevents other raws from the same exposure 

1351 from being ingested. 

1352 update_exposure_records : `bool`, optional 

1353 If `True` (`False` is default), update existing exposure records 

1354 that conflict with the new ones instead of rejecting them. THIS IS 

1355 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1356 KNOWN TO BE BAD. This should usually be combined with 

1357 ``skip_existing_exposures=True``. 

1358 track_file_attrs : `bool`, optional 

1359 Control whether file attributes such as the size or checksum should 

1360 be tracked by the datastore. Whether this parameter is honored 

1361 depends on the specific datastore implementation. 

1362 

1363 Returns 

1364 ------- 

1365 refs : `list` of `lsst.daf.butler.DatasetRef` 

1366 Dataset references for ingested raws. 

1367 

1368 Notes 

1369 ----- 

1370 This method inserts all datasets for an exposure within a transaction, 

1371 guaranteeing that partial exposures are never ingested. The exposure 

1372 dimension record is inserted with `Registry.syncDimensionData` first 

1373 (in its own transaction), which inserts only if a record with the same 

1374 primary key does not already exist. This allows different files within 

1375 the same exposure to be ingested in different runs. 
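
Examples
--------
A sketch of ingesting every FITS file found under a directory into an
explicit run collection; the path and collection name are
placeholders.

.. code-block:: python

    refs = task.run(["/path/to/raw/data/"], run="Example/raw/all")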

1376 """ 

1377 refs = [] 

1378 bad_files = [] 

1379 n_exposures = 0 

1380 n_exposures_failed = 0 

1381 n_ingests_failed = 0 

1382 if group_files: 

1383 for group in ResourcePath.findFileResources(files, file_filter, group_files): 

1384 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles( 

1385 group, 

1386 pool=pool, 

1387 processes=processes, 

1388 run=run, 

1389 skip_existing_exposures=skip_existing_exposures, 

1390 update_exposure_records=update_exposure_records, 

1391 track_file_attrs=track_file_attrs, 

1392 ) 

1393 refs.extend(new_refs) 

1394 bad_files.extend(bad) 

1395 n_exposures += n_exp 

1396 n_exposures_failed += n_exp_fail 

1397 n_ingests_failed += n_ingest_fail 

1398 else: 

1399 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

1400 ResourcePath.findFileResources(files, file_filter, group_files), 

1401 pool=pool, 

1402 processes=processes, 

1403 run=run, 

1404 skip_existing_exposures=skip_existing_exposures, 

1405 update_exposure_records=update_exposure_records, 

1406 ) 

1407 

1408 had_failure = False 

1409 

1410 if bad_files: 

1411 had_failure = True 

1412 self.log.warning("Could not extract observation metadata from the following:") 

1413 for f in bad_files: 

1414 self.log.warning("- %s", f) 

1415 

1416 self.log.info( 

1417 "Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1418 " registration and %d failure%s from file ingest.", 

1419 *_log_msg_counter(n_exposures), 

1420 *_log_msg_counter(n_exposures_failed), 

1421 *_log_msg_counter(n_ingests_failed), 

1422 ) 

1423 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1424 had_failure = True 

1425 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1426 

1427 if had_failure: 

1428 raise RuntimeError("Some failures encountered during ingestion") 

1429 

1430 return refs