Coverage for python/lsst/obs/base/ingest.py: 17%

358 statements  

coverage.py v7.4.0, created at 2024-01-10 12:00 +0000

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import json 

26import re 

27from collections import defaultdict 

28from collections.abc import Callable, Iterable, Iterator, MutableMapping, Sized 

29from dataclasses import InitVar, dataclass 

30from multiprocessing import Pool 

31from typing import Any, ClassVar 

32 

33from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers 

34from astro_metadata_translator.indexing import process_index_data, process_sidecar_data 

35from lsst.afw.fits import readMetadata 

36from lsst.daf.butler import ( 

37 Butler, 

38 CollectionType, 

39 DataCoordinate, 

40 DatasetIdGenEnum, 

41 DatasetRef, 

42 DatasetType, 

43 DimensionRecord, 

44 DimensionUniverse, 

45 FileDataset, 

46 Formatter, 

47 Progress, 

48) 

49from lsst.pex.config import ChoiceField, Config, Field 

50from lsst.pipe.base import Instrument, Task 

51from lsst.resources import ResourcePath, ResourcePathExpression 

52from lsst.utils.timer import timeMethod 

53 

54from ._instrument import makeExposureRecordFromObsInfo 

55 

56# multiprocessing.Pool is actually a function, not a type, and the real type 

57 # isn't exposed, so we can't use it in annotations, so we'll just punt on it via 

58# this alias instead. 

59PoolType = Any 

60 

61 

62def _do_nothing(*args: Any, **kwargs: Any) -> None: 

63 """Do nothing. 

64 

65 This is a function that accepts anything and does nothing. 

66 For use as a default in callback arguments. 

67 """ 

68 pass 

69 

70 

71def _log_msg_counter(noun: int | Sized) -> tuple[int, str]: 

72 """Count the iterable and return the count and plural modifier. 

73 

74 Parameters 

75 ---------- 

76 noun : `Sized` or `int` 

77 Thing to count. If given an integer it is assumed to be the count 

78 to use to calculate modifier. 

79 

80 Returns 

81 ------- 

82 num : `int` 

83 Number of items found in ``noun``. 

84 modifier : `str` 

85 Character to add to the end of a string referring to these items 

86 to indicate whether it was a single item or not. Returns empty 

87 string if there is one item or "s" otherwise. 

88 

89 Examples 

90 -------- 

91 .. code-block:: python 

92 

93 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

94 """ 

95 if isinstance(noun, int): 

96 num = noun 

97 else: 

98 num = len(noun) 

99 return num, "" if num == 1 else "s" 

100 

101 

102@dataclass 

103class RawFileDatasetInfo: 

104 """Information about a single dataset within a raw file.""" 

105 

106 dataId: DataCoordinate 

107 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

108 

109 obsInfo: ObservationInfo 

110 """Standardized observation metadata extracted directly from the file 

111 headers (`astro_metadata_translator.ObservationInfo`). 

112 """ 

113 

114 

115@dataclass 

116class RawFileData: 

117 """Information about a single raw file, used during ingest.""" 

118 

119 datasets: list[RawFileDatasetInfo] 

120 """The information describing each dataset within this raw file. 

121 (`list` of `RawFileDatasetInfo`) 

122 """ 

123 

124 filename: ResourcePath 

125 """URI of the file this information was extracted from (`str`). 

126 

127 This is the path prior to ingest, not the path after ingest. 

128 """ 

129 

130 FormatterClass: type[Formatter] 

131 """Formatter class that should be used to ingest this file (`type`; as 

132 subclass of `~lsst.daf.butler.Formatter`). 

133 """ 

134 

135 instrument: Instrument | None 

136 """The `Instrument` instance associated with this file. Can be `None` 

137 if ``datasets`` is an empty list.""" 

138 

139 

140@dataclass 

141class RawExposureData: 

142 """Information about a complete raw exposure, used during ingest.""" 

143 

144 dataId: DataCoordinate 

145 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

146 """ 

147 

148 files: list[RawFileData] 

149 """List of structures containing file-level information. 

150 """ 

151 

152 universe: InitVar[DimensionUniverse] 

153 """Set of all known dimensions. 

154 """ 

155 

156 record: DimensionRecord 

157 """The exposure `DimensionRecord` that must be inserted into the 

158 `~lsst.daf.butler.Registry` prior to file-level ingest 

159 (`~lsst.daf.butler.DimensionRecord`). 

160 """ 

161 

162 dependencyRecords: dict[str, DimensionRecord] 

163 """Additional records that must be inserted into the 

164 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record`` 

165 (e.g., to satisfy foreign key constraints), indexed by the dimension name. 

166 """ 

167 

168 

169def makeTransferChoiceField( 

170 doc: str = "How to transfer files (None for no transfer).", default: str = "auto" 

171) -> ChoiceField: 

172 """Create a Config field with options for transferring data between repos. 

173 

174 The allowed options for the field are exactly those supported by 

175 `lsst.daf.butler.Datastore.ingest`. 

176 

177 Parameters 

178 ---------- 

179 doc : `str` 

180 Documentation for the configuration field. 

181 default : `str`, optional 

182 Default transfer mode for the field. 

183 

184 Returns 

185 ------- 

186 field : `lsst.pex.config.ChoiceField` 

187 Configuration field. 
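
Examples
--------
A minimal usage sketch; the ``MyIngestConfig`` class name is an
illustrative assumption, not part of this module:

.. code-block:: python

    from lsst.pex.config import Config
    from lsst.obs.base.ingest import makeTransferChoiceField

    class MyIngestConfig(Config):
        """Hypothetical config using the transfer choice field."""

        transfer = makeTransferChoiceField(default="symlink")

    config = MyIngestConfig()
    config.transfer = "copy"  # Any allowed choice, or None for no transfer.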

188 """ 

189 return ChoiceField( 

190 doc=doc, 

191 dtype=str, 

192 allowed={ 

193 "move": "move", 

194 "copy": "copy", 

195 "auto": "choice will depend on datastore", 

196 "direct": "use URI to ingested file directly in datastore", 

197 "link": "hard link falling back to symbolic link", 

198 "hardlink": "hard link", 

199 "symlink": "symbolic (soft) link", 

200 "relsymlink": "relative symbolic link", 

201 }, 

202 optional=True, 

203 default=default, 

204 ) 

205 

206 

207class RawIngestConfig(Config): 

208 """Configuration class for RawIngestTask.""" 

209 

210 transfer = makeTransferChoiceField() 

211 failFast: Field[bool] = Field( 

212 dtype=bool, 

213 default=False, 

214 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

215 "Otherwise problem files will be skipped and logged and a report issued at completion.", 

216 ) 

217 

218 

219class RawIngestTask(Task): 

220 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

221 

222 Parameters 

223 ---------- 

224 config : `RawIngestConfig` 

225 Configuration for the task. 

226 butler : `~lsst.daf.butler.Butler` 

227 Writeable butler instance, with ``butler.run`` set to the appropriate 

228 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

229 datasets. 

230 on_success : `Callable`, optional 

231 A callback invoked when all of the raws associated with an exposure 

232 are ingested. Will be passed a list of `FileDataset` objects, each 

233 containing one or more resolved `DatasetRef` objects. If this callback 

234 raises it will interrupt the entire ingest process, even if 

235 `RawIngestConfig.failFast` is `False`. 

236 on_metadata_failure : `Callable`, optional 

237 A callback invoked when a failure occurs trying to translate the 

238 metadata for a file. Will be passed the URI and the exception, in 

239 that order, as positional arguments. Guaranteed to be called in an 

240 ``except`` block, allowing the callback to re-raise or replace (with 

241 ``raise ... from``) to override the task's usual error handling (before 

242 `RawIngestConfig.failFast` logic occurs). 

243 on_ingest_failure : `Callable`, optional 

244 A callback invoked when dimension record or dataset insertion into the 

245 database fails for an exposure. Will be passed a `RawExposureData` 

246 instance and the exception, in that order, as positional arguments. 

247 Guaranteed to be called in an ``except`` block, allowing the callback 

248 to re-raise or replace (with ``raise ... from``) to override the task's 

249 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

250 **kwargs 

251 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

252 constructor. 

253 

254 Notes 

255 ----- 

256 Each instance of `RawIngestTask` writes to the same Butler. Each 

257 invocation of `RawIngestTask.run` ingests a list of files. 
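
A minimal driver sketch; the repository path and the raw-data locations
are placeholders:

.. code-block:: python

    from lsst.daf.butler import Butler
    from lsst.obs.base.ingest import RawIngestConfig, RawIngestTask

    butler = Butler("/path/to/repo", writeable=True)
    config = RawIngestConfig()
    config.transfer = "symlink"
    task = RawIngestTask(config=config, butler=butler)
    refs = task.run(["/path/to/raw_data/"])  # Returns ingested DatasetRefs.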

258 """ 

259 

260 ConfigClass: ClassVar[type[Config]] = RawIngestConfig 

261 

262 _DefaultName: ClassVar[str] = "ingest" 

263 

264 def getDatasetType(self) -> DatasetType: 

265 """Return the default DatasetType of the datasets ingested by this 

266 Task. 

267 

268 Returns 

269 ------- 

270 datasetType : `DatasetType` 

271 The default dataset type to use for the data being ingested. This 

272 is only used if the relevant `~lsst.pipe.base.Instrument` does not 

273 define an override. 

274 """ 

275 return DatasetType( 

276 "raw", 

277 ("instrument", "detector", "exposure"), 

278 "Exposure", 

279 universe=self.butler.dimensions, 

280 ) 

281 

282 # Mypy can not determine that the config passed to super() is this type. 

283 config: RawIngestConfig 

284 

285 def __init__( 

286 self, 

287 config: RawIngestConfig, 

288 *, 

289 butler: Butler, 

290 on_success: Callable[[list[FileDataset]], Any] = _do_nothing, 

291 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing, 

292 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

293 **kwargs: Any, 

294 ): 

295 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

296 super().__init__(config, **kwargs) 

297 self.butler = butler 

298 self.universe = self.butler.dimensions 

299 self.datasetType = self.getDatasetType() 

300 self._on_success = on_success 

301 self._on_metadata_failure = on_metadata_failure 

302 self._on_ingest_failure = on_ingest_failure 

303 self.progress = Progress("obs.base.RawIngestTask") 

304 

305 # Import all the instrument classes so that we ensure that we 

306 # have all the relevant metadata translators loaded. 

307 Instrument.importAll(self.butler.registry) 

308 

309 def _reduce_kwargs(self) -> dict[str, Any]: 

310 # Add extra parameters to pickle. 

311 return dict( 

312 **super()._reduce_kwargs(), 

313 butler=self.butler, 

314 on_success=self._on_success, 

315 on_metadata_failure=self._on_metadata_failure, 

316 on_ingest_failure=self._on_ingest_failure, 

317 ) 

318 

319 def _determine_instrument_formatter( 

320 self, dataId: DataCoordinate, filename: ResourcePath 

321 ) -> tuple[Instrument | None, type[Formatter]]: 

322 """Determine the instrument and formatter class. 

323 

324 Parameters 

325 ---------- 

326 dataId : `lsst.daf.butler.DataCoordinate` 

327 The dataId associated with this dataset. 

328 filename : `lsst.resources.ResourcePath` 

329 URI of file used for error reporting. 

330 

331 Returns 

332 ------- 

333 instrument : `Instrument` or `None` 

334 Instance of the `Instrument` associated with this dataset. `None` 

335 indicates that the instrument could not be determined. 

336 formatterClass : `type` 

337 Class to be used as the formatter for this dataset. 

338 """ 

339 # The data model currently assumes that whilst multiple datasets 

340 # can be associated with a single file, they must all share the 

341 # same formatter. 

342 try: 

343 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore 

344 except LookupError as e: 

345 self._on_metadata_failure(filename, e) 

346 self.log.warning( 

347 "Instrument %s for file %s not known to registry", dataId["instrument"], filename 

348 ) 

349 if self.config.failFast: 

350 raise RuntimeError( 

351 f"Instrument {dataId['instrument']} for file {filename} not known to registry" 

352 ) from e 

353 FormatterClass = Formatter 

354 # Indicate that we could not work out the instrument. 

355 instrument = None 

356 else: 

357 assert instrument is not None, "Should be guaranteed by fromName succeeding." 

358 FormatterClass = instrument.getRawFormatter(dataId) 

359 return instrument, FormatterClass 

360 

361 def extractMetadata(self, filename: ResourcePath) -> RawFileData: 

362 """Extract and process metadata from a single raw file. 

363 

364 Parameters 

365 ---------- 

366 filename : `lsst.resources.ResourcePath` 

367 URI to the file. 

368 

369 Returns 

370 ------- 

371 data : `RawFileData` 

372 A structure containing the metadata extracted from the file, 

373 as well as the original filename. All fields will be populated, 

374 but the data ID of each entry in `RawFileData.datasets` will be a minimal 

375 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The 

376 ``instrument`` field will be `None` if there is a problem 

377 with metadata extraction. 

378 

379 Notes 

380 ----- 

381 Assumes that there is a single dataset associated with the given 

382 file. Instruments using a single file to store multiple datasets 

383 must implement their own version of this method. 

384 

385 By default the method will catch all exceptions unless the ``failFast`` 

386 configuration item is `True`. If an error is encountered the 

387 `_on_metadata_failure()` method will be called. If the exception is 

388 not propagated, the returned object will have a `None` instrument 

389 and no datasets. 

390 

391 This method supports sidecar JSON files which can be used to 

392 extract metadata without having to read the data file itself. 

393 The sidecar file is always used if found. 
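
The sidecar file is located by replacing the data file's extension with
``.json``. A brief sketch of that convention (the path is a placeholder):

.. code-block:: python

    from lsst.resources import ResourcePath

    data_file = ResourcePath("/data/raws/exposure_001.fits")
    sidecar = data_file.updatedExtension(".json")
    # sidecar now refers to /data/raws/exposure_001.json; it is used
    # instead of reading the FITS headers if it exists.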

394 """ 

395 sidecar_fail_msg = "" # Requires prepended space when set. 

396 try: 

397 sidecar_file = filename.updatedExtension(".json") 

398 if sidecar_file.exists(): 

399 content = json.loads(sidecar_file.read()) 

400 headers = [process_sidecar_data(content)] 

401 sidecar_fail_msg = " (via sidecar)" 

402 else: 

403 # Read the metadata from the data file itself. 

404 

405 # For remote files download the entire file to get the 

406 # header. This is very inefficient and it would be better 

407 # to have some way of knowing where in the file the headers 

408 # are and to only download those parts of the file. 

409 with filename.as_local() as local_file: 

410 # Read the primary. This might be sufficient. 

411 header = readMetadata(local_file.ospath, 0) 

412 

413 try: 

414 # Try to work out a translator class early. 

415 translator_class = MetadataTranslator.determine_translator( 

416 header, filename=str(filename) 

417 ) 

418 except ValueError: 

419 # Primary header was not sufficient (maybe this file 

420 # has been compressed or is a MEF with minimal 

421 # primary). Read second header and merge with primary. 

422 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite") 

423 

424 # Try again to work out a translator class, letting this 

425 # fail. 

426 translator_class = MetadataTranslator.determine_translator(header, filename=str(filename)) 

427 

428 # Request the headers to use for ingest 

429 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header)) 

430 

431 # Add each header to the dataset list 

432 datasets = [self._calculate_dataset_info(h, filename) for h in headers] 

433 

434 except Exception as e: 

435 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

436 # Indicate to the caller that we failed to read. 

437 datasets = [] 

438 formatterClass = Formatter 

439 instrument = None 

440 self._on_metadata_failure(filename, e) 

441 if self.config.failFast: 

442 raise RuntimeError( 

443 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}" 

444 ) from e 

445 else: 

446 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

447 # The data model currently assumes that whilst multiple datasets 

448 # can be associated with a single file, they must all share the 

449 # same formatter. 

450 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

451 if instrument is None: 

452 datasets = [] 

453 

454 return RawFileData( 

455 datasets=datasets, 

456 filename=filename, 

457 # MyPy wants this to be a non-abstract class, which is not true 

458 # for the error case where instrument is None and datasets=[]. 

459 FormatterClass=formatterClass, # type: ignore 

460 instrument=instrument, 

461 ) 

462 

463 @classmethod 

464 def getObservationInfoSubsets(cls) -> tuple[set, set]: 

465 """Return subsets of fields in the `ObservationInfo` that we care 

466 about. 

467 

468 These fields will be used in constructing an exposure record. 

469 

470 Returns 

471 ------- 

472 required : `set` 

473 Set of `ObservationInfo` field names that are required. 

474 optional : `set` 

475 Set of `ObservationInfo` field names we will use if they are 

476 available. 
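
Examples
--------
A sketch of a subclass extending these sets; the ``MyCameraRawIngestTask``
name and the extra ``focus_z`` property are illustrative assumptions:

.. code-block:: python

    class MyCameraRawIngestTask(RawIngestTask):
        @classmethod
        def getObservationInfoSubsets(cls):
            required, optional = super().getObservationInfoSubsets()
            optional.add("focus_z")
            return required, optional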

477 """ 

478 # Marking the new properties "group_counter_*" and 

479 # "has_simulated_content" as required, assumes that we either 

480 # recreate any existing index/sidecar files that include translated 

481 # values, or else allow astro_metadata_translator to fill in 

482 # defaults. 

483 required = { 

484 "datetime_begin", 

485 "datetime_end", 

486 "detector_num", 

487 "exposure_id", 

488 "exposure_time", 

489 "group_counter_end", 

490 "group_counter_start", 

491 "has_simulated_content", 

492 "instrument", 

493 "observation_id", 

494 "observation_type", 

495 "physical_filter", 

496 } 

497 optional = { 

498 "altaz_begin", 

499 "boresight_rotation_coord", 

500 "boresight_rotation_angle", 

501 "dark_time", 

502 "exposure_group", 

503 "tracking_radec", 

504 "object", 

505 "observation_counter", 

506 "observation_reason", 

507 "observing_day", 

508 "science_program", 

509 "visit_id", 

510 } 

511 return required, optional 

512 

513 def _calculate_dataset_info( 

514 self, header: MutableMapping[str, Any] | ObservationInfo, filename: ResourcePath 

515 ) -> RawFileDatasetInfo: 

516 """Calculate a RawFileDatasetInfo from the supplied information. 

517 

518 Parameters 

519 ---------- 

520 header : Mapping or `astro_metadata_translator.ObservationInfo` 

521 Header from the dataset or previously-translated content. 

522 filename : `lsst.resources.ResourcePath` 

523 Filename to use for error messages. 

524 

525 Returns 

526 ------- 

527 dataset : `RawFileDatasetInfo` 

528 The dataId, and observation information associated with this 

529 dataset. 

530 """ 

531 required, optional = self.getObservationInfoSubsets() 

532 if isinstance(header, ObservationInfo): 

533 obsInfo = header 

534 missing = [] 

535 # Need to check the required properties are present. 

536 for property in required: 

537 # getattr does not need to be protected because it is using 

538 # the defined list above containing properties that must exist. 

539 value = getattr(obsInfo, property) 

540 if value is None: 

541 missing.append(property) 

542 if missing: 

543 raise ValueError( 

544 f"Requested required properties are missing from file {filename}: {missing} (via JSON)" 

545 ) 

546 

547 else: 

548 obsInfo = ObservationInfo( 

549 header, 

550 pedantic=False, 

551 filename=str(filename), 

552 required=required, 

553 subset=required | optional, 

554 ) 

555 

556 dataId = DataCoordinate.standardize( 

557 instrument=obsInfo.instrument, 

558 exposure=obsInfo.exposure_id, 

559 detector=obsInfo.detector_num, 

560 universe=self.universe, 

561 ) 

562 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

563 

564 def locateAndReadIndexFiles( 

565 self, files: Iterable[ResourcePath] 

566 ) -> tuple[dict[ResourcePath, Any], list[ResourcePath], set[ResourcePath], set[ResourcePath]]: 

567 """Given a list of files, look for index files and read them. 

568 

569 Index files can either be explicitly in the list of files to 

570 ingest, or else located in the same directory as a file to ingest. 

571 Index entries are always used if present. 

572 

573 Parameters 

574 ---------- 

575 files : iterable over `lsst.resources.ResourcePath` 

576 URIs to the files to be ingested. 

577 

578 Returns 

579 ------- 

580 index : `dict` [`ResourcePath`, Any] 

581 Merged contents of all relevant index files found. These can 

582 be explicitly specified index files or ones found in the 

583 directory alongside a data file to be ingested. 

584 updated_files : `list` of `ResourcePath` 

585 Updated list of the input files with entries removed that were 

586 found listed in an index file. Order is not guaranteed to 

587 match the order of the files given to this routine. 

588 good_index_files : `set` [ `ResourcePath` ] 

589 Index files that were successfully read. 

590 bad_index_files : `set` [ `ResourcePath` ] 

591 Files that looked like index files but failed to read properly. 

592 """ 

593 # Convert the paths to absolute for easy comparison with index content. 

594 # Do not convert to real paths since we have to assume that index 

595 # files are in this location and not the location which it links to. 

596 files = tuple(f.abspath() for f in files) 

597 

598 # Index files must be named this. 

599 index_root_file = "_index.json" 

600 

601 # Group the files by directory. 

602 files_by_directory = defaultdict(set) 

603 

604 for path in files: 

605 directory, file_in_dir = path.split() 

606 files_by_directory[directory].add(file_in_dir) 

607 

608 # All the metadata read from index files with keys of full path. 

609 index_entries: dict[ResourcePath, Any] = {} 

610 

611 # Index files we failed to read. 

612 bad_index_files = set() 

613 

614 # Any good index files that were found and used. 

615 good_index_files = set() 

616 

617 # Look for index files in those directories. 

618 for directory, files_in_directory in files_by_directory.items(): 

619 possible_index_file = directory.join(index_root_file) 

620 if possible_index_file.exists(): 

621 # If we are explicitly requesting an index file the 

622 # messages should be different. 

623 index_msg = "inferred" 

624 is_implied = True 

625 if index_root_file in files_in_directory: 

626 index_msg = "explicit" 

627 is_implied = False 

628 

629 # Try to read the index file and catch and report any 

630 # problems. 

631 try: 

632 content = json.loads(possible_index_file.read()) 

633 index = process_index_data(content, force_dict=True) 

634 # mypy should in theory know that this is a mapping 

635 # from the overload type annotation of process_index_data. 

636 assert isinstance(index, MutableMapping) 

637 except Exception as e: 

638 # Only trigger the callback if the index file 

639 # was asked for explicitly. Triggering on implied file 

640 # might be surprising. 

641 if not is_implied: 

642 self._on_metadata_failure(possible_index_file, e) 

643 if self.config.failFast: 

644 raise RuntimeError( 

645 f"Problem reading index file from {index_msg} location {possible_index_file}" 

646 ) from e 

647 bad_index_files.add(possible_index_file) 

648 continue 

649 

650 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

651 good_index_files.add(possible_index_file) 

652 

653 # Go through the index adding entries for files. 

654 # If we have non-index files in this directory marked for 

655 # ingest we should only get index information for those. 

656 # If the index file was explicit we use all entries. 

657 if is_implied: 

658 files_to_ingest = files_in_directory 

659 else: 

660 files_to_ingest = set(index) 

661 

662 # Copy relevant metadata into a single dict for all index 

663 # entries. 

664 for file_in_dir in files_to_ingest: 

665 # Skip an explicitly specified index file. 

666 # This should never happen because an explicit index 

667 # file will force ingest of all files in the index 

668 # and not use the explicit file list. If somehow 

669 # this is not true we continue. Raising an exception 

670 # seems like the wrong thing to do since this is harmless. 

671 if file_in_dir == index_root_file: 

672 self.log.info( 

673 "Logic error found scanning directory %s. Please file ticket.", directory 

674 ) 

675 continue 

676 if file_in_dir in index: 

677 file = directory.join(file_in_dir) 

678 if file in index_entries: 

679 # ObservationInfo overrides raw metadata 

680 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance( 

681 index_entries[file], ObservationInfo 

682 ): 

683 self.log.warning( 

684 "File %s already specified in an index file but overriding" 

685 " with ObservationInfo content from %s", 

686 file, 

687 possible_index_file, 

688 ) 

689 else: 

690 self.log.warning( 

691 "File %s already specified in an index file, ignoring content from %s", 

692 file, 

693 possible_index_file, 

694 ) 

695 # Do nothing in this case 

696 continue 

697 

698 index_entries[file] = index[file_in_dir] 

699 

700 # Remove files from list that have index entries and also 

701 # any files that we determined to be explicit index files 

702 # or any index files that we failed to read. 

703 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

704 

705 # The filtered list loses the initial order. Retaining the order 

706 # is good for testing but does have a cost if there are many 

707 # files when copying the good values out. A dict would have faster 

708 # lookups (using the files as keys) but use more memory. 

709 ordered = [f for f in filtered if f in files] 

710 

711 return index_entries, ordered, good_index_files, bad_index_files 

712 

713 def processIndexEntries(self, index_entries: dict[ResourcePath, Any]) -> list[RawFileData]: 

714 """Convert index entries to RawFileData. 

715 

716 Parameters 

717 ---------- 

718 index_entries : `dict` [`ResourcePath`, Any] 

719 Dict indexed by name of file to ingest and with values either 

720 raw metadata or translated 

721 `~astro_metadata_translator.ObservationInfo`. 

722 

723 Returns 

724 ------- 

725 data : `list` [ `RawFileData` ] 

726 Structures containing the metadata extracted from the file, 

727 as well as the original filename. All fields will be populated, 

728 but the data IDs of the datasets in each `RawFileData` will be minimal 

729 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. 

730 """ 

731 fileData = [] 

732 for filename, metadata in index_entries.items(): 

733 try: 

734 datasets = [self._calculate_dataset_info(metadata, filename)] 

735 except Exception as e: 

736 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e) 

737 datasets = [] 

738 formatterClass = Formatter 

739 instrument = None 

740 self._on_metadata_failure(filename, e) 

741 if self.config.failFast: 

742 raise RuntimeError( 

743 f"Problem extracting metadata for file {filename} found in index file" 

744 ) from e 

745 else: 

746 instrument, formatterClass = self._determine_instrument_formatter( 

747 datasets[0].dataId, filename 

748 ) 

749 if instrument is None: 

750 datasets = [] 

751 fileData.append( 

752 RawFileData( 

753 datasets=datasets, 

754 filename=filename, 

755 # MyPy wants this to be a non-abstract class, which is not 

756 # true for the error case where instrument is None and 

757 # datasets=[]. 

758 FormatterClass=formatterClass, # type: ignore 

759 instrument=instrument, 

760 ) 

761 ) 

762 return fileData 

763 

764 def groupByExposure(self, files: Iterable[RawFileData]) -> list[RawExposureData]: 

765 """Group an iterable of `RawFileData` by exposure. 

766 

767 Parameters 

768 ---------- 

769 files : iterable of `RawFileData` 

770 File-level information to group. 

771 

772 Returns 

773 ------- 

774 exposures : `list` of `RawExposureData` 

775 A list of structures that group the file-level information by 

776 exposure. All fields will be populated. The 

777 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

778 `~lsst.daf.butler.DataCoordinate` instances. 

779 """ 

780 exposureDimensions = self.universe["exposure"].graph 

781 byExposure = defaultdict(list) 

782 for f in files: 

783 # Assume that the first dataset is representative for the file. 

784 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

785 

786 return [ 

787 RawExposureData( 

788 dataId=dataId, 

789 files=exposureFiles, 

790 universe=self.universe, 

791 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe), 

792 dependencyRecords=self.makeDependencyRecords( 

793 exposureFiles[0].datasets[0].obsInfo, self.universe 

794 ), 

795 ) 

796 for dataId, exposureFiles in byExposure.items() 

797 ] 

798 

799 def makeExposureRecord( 

800 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any 

801 ) -> DimensionRecord: 

802 """Construct a registry record for an exposure. 

803 

804 This is a method that subclasses will often want to customize. This can 

805 often be done by calling this base class implementation with additional 

806 ``kwargs``. 

807 

808 Parameters 

809 ---------- 

810 obsInfo : `ObservationInfo` 

811 Observation details for (one of the components of) the exposure. 

812 universe : `DimensionUniverse` 

813 Set of all known dimensions. 

814 **kwargs 

815 Additional field values for this record. 

816 

817 Returns 

818 ------- 

819 record : `DimensionRecord` 

820 The exposure record that must be inserted into the 

821 `~lsst.daf.butler.Registry` prior to file-level ingest. 
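
Examples
--------
A sketch of the customization pattern described above; the
``MyCameraRawIngestTask`` name and the ``can_see_sky`` field value are
illustrative assumptions:

.. code-block:: python

    class MyCameraRawIngestTask(RawIngestTask):
        def makeExposureRecord(self, obsInfo, universe, **kwargs):
            # Forward an extra field value to the base implementation.
            return super().makeExposureRecord(
                obsInfo, universe, can_see_sky=True, **kwargs
            )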

822 """ 

823 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs) 

824 

825 def makeDependencyRecords( 

826 self, obsInfo: ObservationInfo, universe: DimensionUniverse 

827 ) -> dict[str, DimensionRecord]: 

828 """Construct dependency records. 

829 

830 These dependency records will be inserted into the 

831 `~lsst.daf.butler.Registry` before the exposure records, because they 

832 are dependencies of the exposure. This allows an opportunity to satisfy 

833 foreign key constraints that exist because of dimensions related to the 

834 exposure. 

835 

836 This is a method that subclasses may want to customize, if they've 

837 added dimensions that relate to an exposure. 

838 

839 Parameters 

840 ---------- 

841 obsInfo : `ObservationInfo` 

842 Observation details for (one of the components of) the exposure. 

843 universe : `DimensionUniverse` 

844 Set of all known dimensions. 

845 

846 Returns 

847 ------- 

848 records : `dict` [`str`, `DimensionRecord`] 

849 The records to insert, indexed by dimension name. 
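
Examples
--------
A sketch for a subclass whose exposure records depend on an extra
dimension; the ``my_dimension`` name and its field values are illustrative
assumptions and would need to exist in the dimension universe:

.. code-block:: python

    class MyCameraRawIngestTask(RawIngestTask):
        def makeDependencyRecords(self, obsInfo, universe):
            element = universe["my_dimension"]
            record = element.RecordClass(
                instrument=obsInfo.instrument, name=obsInfo.exposure_group
            )
            return {"my_dimension": record}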

850 """ 

851 return {} 

852 

853 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

854 """Expand the data IDs associated with a raw exposure. 

855 

856 This adds the metadata records. 

857 

858 Parameters 

859 ---------- 

860 data : `RawExposureData` 

861 A structure containing information about the exposure to be 

862 ingested. Must have `RawExposureData.record` populated. Should 

863 be considered consumed upon return. 

864 

865 Returns 

866 ------- 

867 exposure : `RawExposureData` 

868 An updated version of the input structure, with 

869 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

870 updated to data IDs for which 

871 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

872 """ 

873 # We start by expanding the exposure-level data ID; we won't use that 

874 # directly in file ingest, but this lets us do some database lookups 

875 # once per exposure instead of once per file later. 

876 data.dataId = self.butler.registry.expandDataId( 

877 data.dataId, 

878 # We pass in the records we'll be inserting shortly so they aren't 

879 # looked up from the database. We do expect instrument and filter 

880 # records to be retrieved from the database here (though the 

881 # Registry may cache them so there isn't a lookup every time). 

882 records={"exposure": data.record}, 

883 ) 

884 # Now we expand the per-file (exposure+detector) data IDs. This time 

885 # we pass in the records we just retrieved from the exposure data ID 

886 # expansion. 

887 for file in data.files: 

888 for dataset in file.datasets: 

889 dataset.dataId = self.butler.registry.expandDataId( 

890 dataset.dataId, 

891 records={k: data.dataId.records[k] for k in data.dataId.dimensions.elements}, 

892 ) 

893 return data 

894 

895 def prep( 

896 self, files: Iterable[ResourcePath], *, pool: PoolType | None = None 

897 ) -> tuple[Iterator[RawExposureData], list[ResourcePath]]: 

898 """Perform all non-database-updating ingest preprocessing steps. 

899 

900 Parameters 

901 ---------- 

902 files : iterable over `lsst.resources.ResourcePath` 

903 Paths to the files to be ingested. Will be made absolute 

904 if they are not already. 

905 pool : `multiprocessing.Pool`, optional 

906 If not `None`, a process pool with which to parallelize some 

907 operations. 

908 

909 Returns 

910 ------- 

911 exposures : `Iterator` [ `RawExposureData` ] 

912 Data structures containing dimension records, filenames, and data 

913 IDs to be ingested (one structure for each exposure). 

914 bad_files : `list` of `lsst.resources.ResourcePath` 

915 List of all the files that could not have metadata extracted. 

916 """ 

917 mapFunc = map if pool is None else pool.imap_unordered 

918 

919 def _partition_good_bad( 

920 file_data: Iterable[RawFileData], 

921 ) -> tuple[list[RawFileData], list[ResourcePath]]: 

922 """Filter out bad files and return good with list of bad.""" 

923 good_files = [] 

924 bad_files = [] 

925 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"): 

926 if not fileDatum.datasets: 

927 bad_files.append(fileDatum.filename) 

928 else: 

929 good_files.append(fileDatum) 

930 return good_files, bad_files 

931 

932 # Look for index files and read them. 

933 # There should be far fewer index files than data files. 

934 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

935 if bad_index_files: 

936 self.log.info("Failed to read the following explicitly requested index files:") 

937 for bad in sorted(bad_index_files): 

938 self.log.info("- %s", bad) 

939 

940 # Now convert all the index file entries to standard form for ingest. 

941 processed_bad_index_files: list[ResourcePath] = [] 

942 indexFileData = self.processIndexEntries(index_entries) 

943 if indexFileData: 

944 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData) 

945 self.log.info( 

946 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s", 

947 *_log_msg_counter(indexFileData), 

948 *_log_msg_counter(good_index_files), 

949 *_log_msg_counter(processed_bad_index_files), 

950 ) 

951 

952 # Extract metadata and build per-detector regions. 

953 # This could run in a subprocess so collect all output 

954 # before looking at failures. 

955 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

956 

957 # Filter out all the failed reads and store them for later 

958 # reporting. 

959 good_file_data, bad_files = _partition_good_bad(fileData) 

960 self.log.info( 

961 "Successfully extracted metadata from %d file%s with %d failure%s", 

962 *_log_msg_counter(good_file_data), 

963 *_log_msg_counter(bad_files), 

964 ) 

965 

966 # Combine with data from index files. 

967 good_file_data.extend(indexFileData) 

968 bad_files.extend(processed_bad_index_files) 

969 bad_files.extend(bad_index_files) 

970 

971 # Use that metadata to group files (and extracted metadata) by 

972 # exposure. Never parallelized because it's intrinsically a gather 

973 # step. 

974 exposureData: list[RawExposureData] = self.groupByExposure(good_file_data) 

975 

976 # The next operation operates on RawExposureData instances (one at 

977 # a time) in-place and then returns the modified instance. We call it 

978 # as a pass-through instead of relying on the arguments we pass in to 

979 # have been modified because in the parallel case those arguments are 

980 # going to be pickled and unpickled, and I'm not certain 

981 # multiprocessing is careful enough with that for output arguments to 

982 # work. 

983 

984 # Expand the data IDs to include all dimension metadata; we need this 

985 # because we may need to generate path templates that rely on that 

986 # metadata. 

987 # This is the first step that involves actual database calls (but just 

988 # SELECTs), so if there's going to be a problem with connections vs. 

989 # multiple processes, or lock contention (in SQLite) slowing things 

990 # down, it'll happen here. 

991 return mapFunc(self.expandDataIds, exposureData), bad_files 

992 

993 def ingestExposureDatasets( 

994 self, 

995 exposure: RawExposureData, 

996 datasetType: DatasetType, 

997 *, 

998 run: str, 

999 skip_existing_exposures: bool = False, 

1000 track_file_attrs: bool = True, 

1001 ) -> list[FileDataset]: 

1002 """Ingest all raw files in one exposure. 

1003 

1004 Parameters 

1005 ---------- 

1006 exposure : `RawExposureData` 

1007 A structure containing information about the exposure to be 

1008 ingested. Must have `RawExposureData.record` populated and all 

1009 data ID attributes expanded. 

1010 datasetType : `DatasetType` 

1011 The dataset type associated with this exposure. 

1012 run : `str` 

1013 Name of a RUN-type collection to write to. 

1014 skip_existing_exposures : `bool`, optional 

1015 If `True` (`False` is default), skip raws that have already been 

1016 ingested (i.e. raws for which we already have a dataset with the 

1017 same data ID in the target collection, even if from another file). 

1018 Note that this is much slower than just not passing 

1019 already-ingested files as inputs, because we still need to read and 

1020 process metadata to identify which exposures to search for. It 

1021 also will not work reliably if multiple processes are attempting to 

1022 ingest raws from the same exposure concurrently, in that different 

1023 processes may still attempt to ingest the same raw and conflict, 

1024 causing a failure that prevents other raws from the same exposure 

1025 from being ingested. 

1026 track_file_attrs : `bool`, optional 

1027 Control whether file attributes such as the size or checksum should 

1028 be tracked by the datastore. Whether this parameter is honored 

1029 depends on the specific datastore implementation. 

1030 

1031 Returns 

1032 ------- 

1033 datasets : `list` of `lsst.daf.butler.FileDataset` 

1034 Per-file structures identifying the files ingested and their 

1035 dataset representation in the data repository. 

1036 """ 

1037 if skip_existing_exposures: 

1038 existing = { 

1039 ref.dataId 

1040 for ref in self.butler.registry.queryDatasets( 

1041 datasetType, 

1042 collections=[run], 

1043 dataId=exposure.dataId, 

1044 ) 

1045 } 

1046 else: 

1047 existing = set() 

1048 

1049 # Raw files are preferentially ingested using a UUID derived from 

1050 # the collection name and dataId. 

1051 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN): 

1052 mode = DatasetIdGenEnum.DATAID_TYPE_RUN 

1053 else: 

1054 mode = DatasetIdGenEnum.UNIQUE 

1055 

1056 datasets = [] 

1057 for file in exposure.files: 

1058 refs = [ 

1059 DatasetRef(datasetType, d.dataId, run=run, id_generation_mode=mode) 

1060 for d in file.datasets 

1061 if d.dataId not in existing 

1062 ] 

1063 if refs: 

1064 datasets.append( 

1065 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass) 

1066 ) 

1067 

1068 self.butler.ingest( 

1069 *datasets, 

1070 transfer=self.config.transfer, 

1071 record_validation_info=track_file_attrs, 

1072 ) 

1073 return datasets 

1074 

1075 def ingestFiles( 

1076 self, 

1077 files: Iterable[ResourcePath], 

1078 *, 

1079 pool: PoolType | None = None, 

1080 processes: int = 1, 

1081 run: str | None = None, 

1082 skip_existing_exposures: bool = False, 

1083 update_exposure_records: bool = False, 

1084 track_file_attrs: bool = True, 

1085 ) -> tuple[list[DatasetRef], list[ResourcePath], int, int, int]: 

1086 """Ingest files into a Butler data repository. 

1087 

1088 This creates any new exposure or visit Dimension entries needed to 

1089 identify the ingested files, creates new Dataset entries in the 

1090 Registry and finally ingests the files themselves into the Datastore. 

1091 Any needed instrument, detector, and physical_filter Dimension entries 

1092 must exist in the Registry before `run` is called. 

1093 

1094 Parameters 

1095 ---------- 

1096 files : iterable over `lsst.resources.ResourcePath` 

1097 URIs to the files to be ingested. 

1098 pool : `multiprocessing.Pool`, optional 

1099 If not `None`, a process pool with which to parallelize some 

1100 operations. 

1101 processes : `int`, optional 

1102 The number of processes to use. Ignored if ``pool`` is not `None`. 

1103 run : `str`, optional 

1104 Name of a RUN-type collection to write to, overriding 

1105 the default derived from the instrument name. 

1106 skip_existing_exposures : `bool`, optional 

1107 If `True` (`False` is default), skip raws that have already been 

1108 ingested (i.e. raws for which we already have a dataset with the 

1109 same data ID in the target collection, even if from another file). 

1110 Note that this is much slower than just not passing 

1111 already-ingested files as inputs, because we still need to read and 

1112 process metadata to identify which exposures to search for. It 

1113 also will not work reliably if multiple processes are attempting to 

1114 ingest raws from the same exposure concurrently, in that different 

1115 processes may still attempt to ingest the same raw and conflict, 

1116 causing a failure that prevents other raws from the same exposure 

1117 from being ingested. 

1118 update_exposure_records : `bool`, optional 

1119 If `True` (`False` is default), update existing exposure records 

1120 that conflict with the new ones instead of rejecting them. THIS IS 

1121 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1122 KNOWN TO BE BAD. This should usually be combined with 

1123 ``skip_existing_exposures=True``. 

1124 track_file_attrs : `bool`, optional 

1125 Control whether file attributes such as the size or checksum should 

1126 be tracked by the datastore. Whether this parameter is honored 

1127 depends on the specific datastore implementation. 

1128 

1129 Returns 

1130 ------- 

1131 refs : `list` of `lsst.daf.butler.DatasetRef` 

1132 Dataset references for ingested raws. 

1133 bad_files : `list` of `ResourcePath` 

1134 Given paths that could not be ingested. 

1135 n_exposures : `int` 

1136 Number of exposures successfully ingested. 

1137 n_exposures_failed : `int` 

1138 Number of exposures that failed when inserting dimension data. 

1139 n_ingests_failed : `int` 

1140 Number of exposures that failed when ingesting raw datasets. 

1141 """ 

1142 created_pool = False 

1143 if pool is None and processes > 1: 

1144 pool = Pool(processes) 

1145 created_pool = True 

1146 

1147 try: 

1148 exposureData, bad_files = self.prep(files, pool=pool) 

1149 finally: 

1150 if created_pool and pool: 

1151 # The pool is not needed any more so close it if we created 

1152 # it to ensure we clean up resources. 

1153 pool.close() 

1154 pool.join() 

1155 

1156 # Up to this point, we haven't modified the data repository at all. 

1157 # Now we finally do that, with one transaction per exposure. This is 

1158 # not parallelized at present because the performance of this step is 

1159 # limited by the database server. That may or may not change in the 

1160 # future once we increase our usage of bulk inserts and reduce our 

1161 # usage of savepoints; we've tried to get everything but the database 

1162 # operations done in advance to reduce the time spent inside 

1163 # transactions. 

1164 refs = [] 

1165 runs = set() 

1166 datasetTypes: dict[str, DatasetType] = {} 

1167 n_exposures = 0 

1168 n_exposures_failed = 0 

1169 n_ingests_failed = 0 

1170 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"): 

1171 assert exposure.record is not None, "Should be guaranteed by prep()" 

1172 self.log.debug( 

1173 "Attempting to ingest %d file%s from exposure %s:%s", 

1174 *_log_msg_counter(exposure.files), 

1175 exposure.record.instrument, 

1176 exposure.record.obs_id, 

1177 ) 

1178 

1179 try: 

1180 for name, record in exposure.dependencyRecords.items(): 

1181 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records) 

1182 inserted_or_updated = self.butler.registry.syncDimensionData( 

1183 "exposure", 

1184 exposure.record, 

1185 update=update_exposure_records, 

1186 ) 

1187 except Exception as e: 

1188 self._on_ingest_failure(exposure, e) 

1189 n_exposures_failed += 1 

1190 self.log.warning( 

1191 "Exposure %s:%s could not be registered: %s", 

1192 exposure.record.instrument, 

1193 exposure.record.obs_id, 

1194 e, 

1195 ) 

1196 if self.config.failFast: 

1197 raise e 

1198 continue 

1199 

1200 if isinstance(inserted_or_updated, dict): 

1201 # Exposure is in the registry and we updated it, so 

1202 # syncDimensionData returned a dict. 

1203 self.log.info( 

1204 "Exposure %s:%s was already present, but columns %s were updated.", 

1205 exposure.record.instrument, 

1206 exposure.record.obs_id, 

1207 str(list(inserted_or_updated.keys())), 

1208 ) 

1209 

1210 # Determine the instrument so we can work out the dataset type. 

1211 instrument = exposure.files[0].instrument 

1212 assert ( 

1213 instrument is not None 

1214 ), "file should have been removed from this list by prep if instrument could not be found" 

1215 

1216 if raw_definition := getattr(instrument, "raw_definition", None): 

1217 datasetTypeName, dimensions, storageClass = raw_definition 

1218 if not (datasetType := datasetTypes.get(datasetTypeName)): 

1219 datasetType = DatasetType( 

1220 datasetTypeName, dimensions, storageClass, universe=self.butler.dimensions 

1221 ) 

1222 else: 

1223 datasetType = self.datasetType 

1224 if datasetType.name not in datasetTypes: 

1225 self.butler.registry.registerDatasetType(datasetType) 

1226 datasetTypes[datasetType.name] = datasetType 

1227 

1228 # Override default run if nothing specified explicitly. 

1229 if run is None: 

1230 this_run = instrument.makeDefaultRawIngestRunName() 

1231 else: 

1232 this_run = run 

1233 if this_run not in runs: 

1234 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

1235 runs.add(this_run) 

1236 try: 

1237 datasets_for_exposure = self.ingestExposureDatasets( 

1238 exposure, 

1239 datasetType=datasetType, 

1240 run=this_run, 

1241 skip_existing_exposures=skip_existing_exposures, 

1242 track_file_attrs=track_file_attrs, 

1243 ) 

1244 except Exception as e: 

1245 self._on_ingest_failure(exposure, e) 

1246 n_ingests_failed += 1 

1247 self.log.warning("Failed to ingest the following for reason: %s", e) 

1248 for f in exposure.files: 

1249 self.log.warning("- %s", f.filename) 

1250 if self.config.failFast: 

1251 raise e 

1252 continue 

1253 else: 

1254 self._on_success(datasets_for_exposure) 

1255 for dataset in datasets_for_exposure: 

1256 refs.extend(dataset.refs) 

1257 

1258 # Success for this exposure. 

1259 n_exposures += 1 

1260 self.log.info( 

1261 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id 

1262 ) 

1263 

1264 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

1265 

1266 @timeMethod 

1267 def run( 

1268 self, 

1269 files: Iterable[ResourcePathExpression], 

1270 *, 

1271 pool: PoolType | None = None, 

1272 processes: int = 1, 

1273 run: str | None = None, 

1274 file_filter: str | re.Pattern = r"\.fit[s]?\b", 

1275 group_files: bool = True, 

1276 skip_existing_exposures: bool = False, 

1277 update_exposure_records: bool = False, 

1278 track_file_attrs: bool = True, 

1279 ) -> list[DatasetRef]: 

1280 """Ingest files into a Butler data repository. 

1281 

1282 This creates any new exposure or visit Dimension entries needed to 

1283 identify the ingested files, creates new Dataset entries in the 

1284 Registry and finally ingests the files themselves into the Datastore. 

1285 Any needed instrument, detector, and physical_filter Dimension entries 

1286 must exist in the Registry before `run` is called. 

1287 

1288 Parameters 

1289 ---------- 

1290 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like 

1291 Paths to the files to be ingested. Can refer to directories. 

1292 Will be made absolute if they are not already. 

1293 pool : `multiprocessing.Pool`, optional 

1294 If not `None`, a process pool with which to parallelize some 

1295 operations. 

1296 processes : `int`, optional 

1297 The number of processes to use. Ignored if ``pool`` is not `None`. 

1298 run : `str`, optional 

1299 Name of a RUN-type collection to write to, overriding 

1300 the default derived from the instrument name. 

1301 file_filter : `str` or `re.Pattern`, optional 

1302 Pattern to use to discover files to ingest within directories. 

1303 The default is to search for FITS files. The regex applies to 

1304 files within the directory. 

1305 group_files : `bool`, optional 

1306 Group files by directory if they have been discovered in 

1307 directories. Will not affect files explicitly provided. 

1308 skip_existing_exposures : `bool`, optional 

1309 If `True` (`False` is default), skip raws that have already been 

1310 ingested (i.e. raws for which we already have a dataset with the 

1311 same data ID in the target collection, even if from another file). 

1312 Note that this is much slower than just not passing 

1313 already-ingested files as inputs, because we still need to read and 

1314 process metadata to identify which exposures to search for. It 

1315 also will not work reliably if multiple processes are attempting to 

1316 ingest raws from the same exposure concurrently, in that different 

1317 processes may still attempt to ingest the same raw and conflict, 

1318 causing a failure that prevents other raws from the same exposure 

1319 from being ingested. 

1320 update_exposure_records : `bool`, optional 

1321 If `True` (`False` is default), update existing exposure records 

1322 that conflict with the new ones instead of rejecting them. THIS IS 

1323 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1324 KNOWN TO BE BAD. This should usually be combined with 

1325 ``skip_existing_exposures=True``. 

1326 track_file_attrs : `bool`, optional 

1327 Control whether file attributes such as the size or checksum should 

1328 be tracked by the datastore. Whether this parameter is honored 

1329 depends on the specific datastore implementation. 

1330 

1331 Returns 

1332 ------- 

1333 refs : `list` of `lsst.daf.butler.DatasetRef` 

1334 Dataset references for ingested raws. 

1335 

1336 Notes 

1337 ----- 

1338 This method inserts all datasets for an exposure within a transaction, 

1339 guaranteeing that partial exposures are never ingested. The exposure 

1340 dimension record is inserted with `Registry.syncDimensionData` first 

1341 (in its own transaction), which inserts only if a record with the same 

1342 primary key does not already exist. This allows different files within 

1343 the same exposure to be ingested in different runs. 
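
Examples
--------
A brief invocation sketch, assuming ``task`` was constructed as shown in
the class documentation; paths and the collection name are placeholders:

.. code-block:: python

    refs = task.run(
        ["/data/raws/night1/", "/data/raws/extra_file.fits"],
        processes=4,
        run="MyCam/raw/all",
    )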

1344 """ 

1345 refs = [] 

1346 bad_files = [] 

1347 n_exposures = 0 

1348 n_exposures_failed = 0 

1349 n_ingests_failed = 0 

1350 if group_files: 

1351 for group in ResourcePath.findFileResources(files, file_filter, group_files): 

1352 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles( 

1353 group, 

1354 pool=pool, 

1355 processes=processes, 

1356 run=run, 

1357 skip_existing_exposures=skip_existing_exposures, 

1358 update_exposure_records=update_exposure_records, 

1359 track_file_attrs=track_file_attrs, 

1360 ) 

1361 refs.extend(new_refs) 

1362 bad_files.extend(bad) 

1363 n_exposures += n_exp 

1364 n_exposures_failed += n_exp_fail 

1365 n_ingests_failed += n_ingest_fail 

1366 else: 

1367 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

1368 ResourcePath.findFileResources(files, file_filter, group_files), 

1369 pool=pool, 

1370 processes=processes, 

1371 run=run, 

1372 skip_existing_exposures=skip_existing_exposures, 

1373 update_exposure_records=update_exposure_records, 

track_file_attrs=track_file_attrs, 

1374 ) 

1375 

1376 had_failure = False 

1377 

1378 if bad_files: 

1379 had_failure = True 

1380 self.log.warning("Could not extract observation metadata from the following:") 

1381 for f in bad_files: 

1382 self.log.warning("- %s", f) 

1383 

1384 self.log.info( 

1385 "Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1386 " registration and %d failure%s from file ingest.", 

1387 *_log_msg_counter(n_exposures), 

1388 *_log_msg_counter(n_exposures_failed), 

1389 *_log_msg_counter(n_ingests_failed), 

1390 ) 

1391 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1392 had_failure = True 

1393 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1394 

1395 if had_failure: 

1396 raise RuntimeError("Some failures encountered during ingestion") 

1397 

1398 return refs