Coverage for python/lsst/obs/base/ingest.py: 17%

357 statements  

coverage.py v7.2.5, created at 2023-05-17 02:54 -0700

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import json 

26import re 

27from collections import defaultdict 

28from dataclasses import InitVar, dataclass 

29from multiprocessing import Pool 

30from typing import ( 

31 Any, 

32 Callable, 

33 ClassVar, 

34 Dict, 

35 Iterable, 

36 Iterator, 

37 List, 

38 MutableMapping, 

39 Optional, 

40 Set, 

41 Sized, 

42 Tuple, 

43 Type, 

44 Union, 

45) 

46 

47from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers 

48from astro_metadata_translator.indexing import process_index_data, process_sidecar_data 

49from lsst.afw.fits import readMetadata 

50from lsst.daf.butler import ( 

51 Butler, 

52 CollectionType, 

53 DataCoordinate, 

54 DatasetIdGenEnum, 

55 DatasetRef, 

56 DatasetType, 

57 DimensionRecord, 

58 DimensionUniverse, 

59 FileDataset, 

60 Formatter, 

61 Progress, 

62) 

63from lsst.pex.config import ChoiceField, Config, Field 

64from lsst.pipe.base import Instrument, Task 

65from lsst.resources import ResourcePath, ResourcePathExpression 

66from lsst.utils.timer import timeMethod 

67 

68from ._instrument import makeExposureRecordFromObsInfo 

69 

70# multiprocessing.Pool is actually a function, not a type, and the real type 

71# isn't exposed, so we can't use it in annotations; we'll just punt on it via 

72# this alias instead. 

73PoolType = Any 

74 

75 

76def _do_nothing(*args: Any, **kwargs: Any) -> None: 

77 """Do nothing. 

78 

79 This is a function that accepts anything and does nothing. 

80 For use as a default in callback arguments. 

81 """ 

82 pass 

83 

84 

85def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]: 

86 """Count the iterable and return the count and plural modifier. 

87 

88 Parameters 

89 ---------- 

90 noun : `Sized` or `int` 

91 Thing to count. If given an integer it is assumed to be the count 

92 to use to calculate the modifier. 

93 

94 Returns 

95 ------- 

96 num : `int` 

97 Number of items found in ``noun``. 

98 modifier : `str` 

99 Character to add to the end of a string referring to these items 

100 to indicate whether it was a single item or not. Returns empty 

101 string if there is one item or "s" otherwise. 

102 

103 Examples 

104 -------- 

105 

106 .. code-block:: python 

107 

108 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

109 """ 

110 if isinstance(noun, int): 

111 num = noun 

112 else: 

113 num = len(noun) 

114 return num, "" if num == 1 else "s" 

115 

116 

117@dataclass 

118class RawFileDatasetInfo: 

119 """Information about a single dataset within a raw file.""" 

120 

121 dataId: DataCoordinate 

122 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

123 

124 obsInfo: ObservationInfo 

125 """Standardized observation metadata extracted directly from the file 

126 headers (`astro_metadata_translator.ObservationInfo`). 

127 """ 

128 

129 

130@dataclass 

131class RawFileData: 

132 """Information about a single raw file, used during ingest.""" 

133 

134 datasets: List[RawFileDatasetInfo] 

135 """The information describing each dataset within this raw file. 

136 (`list` of `RawFileDatasetInfo`) 

137 """ 

138 

139 filename: ResourcePath 

140 """URI of the file this information was extracted from (`str`). 

141 

142 This is the path prior to ingest, not the path after ingest. 

143 """ 

144 

145 FormatterClass: Type[Formatter] 

146 """Formatter class that should be used to ingest this file (`type`; as 

147 subclass of `Formatter`). 

148 """ 

149 

150 instrument: Optional[Instrument] 

151 """The `Instrument` instance associated with this file. Can be `None` 

152 if ``datasets`` is an empty list.""" 

153 

154 

155@dataclass 

156class RawExposureData: 

157 """Information about a complete raw exposure, used during ingest.""" 

158 

159 dataId: DataCoordinate 

160 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

161 """ 

162 

163 files: List[RawFileData] 

164 """List of structures containing file-level information. 

165 """ 

166 

167 universe: InitVar[DimensionUniverse] 

168 """Set of all known dimensions. 

169 """ 

170 

171 record: DimensionRecord 

172 """The exposure `DimensionRecord` that must be inserted into the 

173 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`). 

174 """ 

175 

176 dependencyRecords: Dict[str, DimensionRecord] 

177 """Additional records that must be inserted into the 

178 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record`` 

179 (e.g., to satisfy foreign key constraints), indexed by the dimension name. 

180 """ 

181 

182 

183def makeTransferChoiceField( 

184 doc: str = "How to transfer files (None for no transfer).", default: str = "auto" 

185) -> ChoiceField: 

186 """Create a Config field with options for transferring data between repos. 

187 

188 The allowed options for the field are exactly those supported by 

189 `lsst.daf.butler.Datastore.ingest`. 

190 

191 Parameters 

192 ---------- 

193 doc : `str` 

194 Documentation for the configuration field. 

195 default : `str`, optional 

196 Default transfer mode for the field. 

197 

198 Returns 

199 ------- 

200 field : `lsst.pex.config.ChoiceField` 

201 Configuration field. 
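
Examples
--------
A minimal sketch of using the field in a config class; ``MyIngestConfig``
is hypothetical and not defined in this module.

.. code-block:: python

    class MyIngestConfig(Config):
        # Same allowed choices as RawIngestConfig.transfer, but with a
        # different default.
        transfer = makeTransferChoiceField(default="symlink")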

202 """ 

203 return ChoiceField( 

204 doc=doc, 

205 dtype=str, 

206 allowed={ 

207 "move": "move", 

208 "copy": "copy", 

209 "auto": "choice will depend on datastore", 

210 "direct": "use URI to ingested file directly in datastore", 

211 "link": "hard link falling back to symbolic link", 

212 "hardlink": "hard link", 

213 "symlink": "symbolic (soft) link", 

214 "relsymlink": "relative symbolic link", 

215 }, 

216 optional=True, 

217 default=default, 

218 ) 

219 

220 

221class RawIngestConfig(Config): 

222 """Configuration class for RawIngestTask.""" 

223 

224 transfer = makeTransferChoiceField() 

225 failFast: Field[bool] = Field( 

226 dtype=bool, 

227 default=False, 

228 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

229 "Otherwise problem files will be skipped and logged and a report issued at completion.", 

230 ) 

231 

232 

233class RawIngestTask(Task): 

234 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

235 

236 Parameters 

237 ---------- 

238 config : `RawIngestConfig` 

239 Configuration for the task. 

240 butler : `~lsst.daf.butler.Butler` 

241 Writeable butler instance, with ``butler.run`` set to the appropriate 

242 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

243 datasets. 

244 on_success : `Callable`, optional 

245 A callback invoked when all of the raws associated with an exposure 

246 are ingested. Will be passed a list of `FileDataset` objects, each 

247 containing one or more resolved `DatasetRef` objects. If this callback 

248 raises it will interrupt the entire ingest process, even if 

249 `RawIngestConfig.failFast` is `False`. 

250 on_metadata_failure : `Callable`, optional 

251 A callback invoked when a failure occurs trying to translate the 

252 metadata for a file. Will be passed the URI and the exception, in 

253 that order, as positional arguments. Guaranteed to be called in an 

254 ``except`` block, allowing the callback to re-raise or replace (with 

255 ``raise ... from``) to override the task's usual error handling (before 

256 `RawIngestConfig.failFast` logic occurs). 

257 on_ingest_failure : `Callable`, optional 

258 A callback invoked when dimension record or dataset insertion into the 

259 database fails for an exposure. Will be passed a `RawExposureData` 

260 instance and the exception, in that order, as positional arguments. 

261 Guaranteed to be called in an ``except`` block, allowing the callback 

262 to re-raise or replace (with ``raise ... from``) to override the task's 

263 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

264 **kwargs 

265 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

266 constructor. 

267 

268 Notes 

269 ----- 

270 Each instance of `RawIngestTask` writes to the same Butler. Each 

271 invocation of `RawIngestTask.run` ingests a list of files. 
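
Examples
--------
A minimal usage sketch; the repository path and file names below are
placeholders, not values defined by this package.

.. code-block:: python

    # Open a writeable butler and ingest two raw files.
    butler = Butler("/path/to/repo", writeable=True)
    task = RawIngestTask(config=RawIngestConfig(), butler=butler)
    refs = task.run(["/data/raw_0001.fits", "/data/raw_0002.fits"])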

272 """ 

273 

274 ConfigClass: ClassVar[Type[Config]] = RawIngestConfig 

275 

276 _DefaultName: ClassVar[str] = "ingest" 

277 

278 def getDatasetType(self) -> DatasetType: 

279 """Return the default DatasetType of the datasets ingested by this 

280 Task. 

281 

282 Returns 

283 ------- 

284 datasetType : `DatasetType` 

285 The default dataset type to use for the data being ingested. This 

286 is only used if the relevant `~lsst.pipe.base.Instrument` does not 

287 define an override. 

288 """ 

289 return DatasetType( 

290 "raw", 

291 ("instrument", "detector", "exposure"), 

292 "Exposure", 

293 universe=self.butler.registry.dimensions, 

294 ) 

295 

296 # Mypy cannot determine that the config passed to super() is this type. 

297 config: RawIngestConfig 

298 

299 def __init__( 

300 self, 

301 config: RawIngestConfig, 

302 *, 

303 butler: Butler, 

304 on_success: Callable[[List[FileDataset]], Any] = _do_nothing, 

305 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing, 

306 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

307 **kwargs: Any, 

308 ): 

309 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

310 super().__init__(config, **kwargs) 

311 self.butler = butler 

312 self.universe = self.butler.registry.dimensions 

313 self.datasetType = self.getDatasetType() 

314 self._on_success = on_success 

315 self._on_metadata_failure = on_metadata_failure 

316 self._on_ingest_failure = on_ingest_failure 

317 self.progress = Progress("obs.base.RawIngestTask") 

318 

319 # Import all the instrument classes so that we ensure that we 

320 # have all the relevant metadata translators loaded. 

321 Instrument.importAll(self.butler.registry) 

322 

323 def _reduce_kwargs(self) -> Dict[str, Any]: 

324 # Add extra parameters to pickle. 

325 return dict( 

326 **super()._reduce_kwargs(), 

327 butler=self.butler, 

328 on_success=self._on_success, 

329 on_metadata_failure=self._on_metadata_failure, 

330 on_ingest_failure=self._on_ingest_failure, 

331 ) 

332 

333 def _determine_instrument_formatter( 

334 self, dataId: DataCoordinate, filename: ResourcePath 

335 ) -> Tuple[Optional[Instrument], Type[Formatter]]: 

336 """Determine the instrument and formatter class. 

337 

338 Parameters 

339 ---------- 

340 dataId : `lsst.daf.butler.DataCoordinate` 

341 The dataId associated with this dataset. 

342 filename : `lsst.resources.ResourcePath` 

343 URI of file used for error reporting. 

344 

345 Returns 

346 ------- 

347 instrument : `Instrument` or `None` 

348 Instance of the `Instrument` associated with this dataset. `None` 

349 indicates that the instrument could not be determined. 

350 formatterClass : `type` 

351 Class to be used as the formatter for this dataset. 

352 """ 

353 # The data model currently assumes that whilst multiple datasets 

354 # can be associated with a single file, they must all share the 

355 # same formatter. 

356 try: 

357 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore 

358 except LookupError as e: 

359 self._on_metadata_failure(filename, e) 

360 self.log.warning( 

361 "Instrument %s for file %s not known to registry", dataId["instrument"], filename 

362 ) 

363 if self.config.failFast: 

364 raise RuntimeError( 

365 f"Instrument {dataId['instrument']} for file {filename} not known to registry" 

366 ) from e 

367 FormatterClass = Formatter 

368 # Indicate that we could not work out the instrument. 

369 instrument = None 

370 else: 

371 assert instrument is not None, "Should be guaranteed by fromName succeeding." 

372 FormatterClass = instrument.getRawFormatter(dataId) 

373 return instrument, FormatterClass 

374 

375 def extractMetadata(self, filename: ResourcePath) -> RawFileData: 

376 """Extract and process metadata from a single raw file. 

377 

378 Parameters 

379 ---------- 

380 filename : `lsst.resources.ResourcePath` 

381 URI to the file. 

382 

383 Returns 

384 ------- 

385 data : `RawFileData` 

386 A structure containing the metadata extracted from the file, 

387 as well as the original filename. All fields will be populated, 

388 but the data IDs in `RawFileData.datasets` will be minimal 

389 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. The 

390 ``instrument`` field will be `None` if there is a problem 

391 with metadata extraction. 

392 

393 Notes 

394 ----- 

395 Assumes that there is a single dataset associated with the given 

396 file. Instruments using a single file to store multiple datasets 

397 must implement their own version of this method. 

398 

399 By default the method will catch all exceptions unless the ``failFast`` 

400 configuration item is `True`. If an error is encountered the 

401 `_on_metadata_failure()` method will be called. If the error is 

402 not re-raised there, the returned object will have a `None` 

403 instrument and no datasets. 

404 

405 This method supports sidecar JSON files which can be used to 

406 extract metadata without having to read the data file itself. 

407 The sidecar file is always used if found. 

408 """ 

409 sidecar_fail_msg = "" # Requires prepended space when set. 

410 try: 

411 sidecar_file = filename.updatedExtension(".json") 

412 if sidecar_file.exists(): 

413 content = json.loads(sidecar_file.read()) 

414 headers = [process_sidecar_data(content)] 

415 sidecar_fail_msg = " (via sidecar)" 

416 else: 

417 # Read the metadata from the data file itself. 

418 

419 # For remote files download the entire file to get the 

420 # header. This is very inefficient and it would be better 

421 # to have some way of knowing where in the file the headers 

422 # are and to only download those parts of the file. 

423 with filename.as_local() as local_file: 

424 # Read the primary. This might be sufficient. 

425 header = readMetadata(local_file.ospath, 0) 

426 

427 try: 

428 # Try to work out a translator class early. 

429 translator_class = MetadataTranslator.determine_translator( 

430 header, filename=str(filename) 

431 ) 

432 except ValueError: 

433 # Primary header was not sufficient (maybe this file 

434 # has been compressed or is a MEF with minimal 

435 # primary). Read second header and merge with primary. 

436 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite") 

437 

438 # Try again to work out a translator class, letting this 

439 # fail. 

440 translator_class = MetadataTranslator.determine_translator(header, filename=str(filename)) 

441 

442 # Request the headers to use for ingest 

443 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header)) 

444 

445 # Add each header to the dataset list 

446 datasets = [self._calculate_dataset_info(h, filename) for h in headers] 

447 

448 except Exception as e: 

449 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

450 # Indicate to the caller that we failed to read. 

451 datasets = [] 

452 formatterClass = Formatter 

453 instrument = None 

454 self._on_metadata_failure(filename, e) 

455 if self.config.failFast: 

456 raise RuntimeError( 

457 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}" 

458 ) from e 

459 else: 

460 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

461 # The data model currently assumes that whilst multiple datasets 

462 # can be associated with a single file, they must all share the 

463 # same formatter. 

464 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

465 if instrument is None: 

466 datasets = [] 

467 

468 return RawFileData( 

469 datasets=datasets, 

470 filename=filename, 

471 # MyPy wants this to be a non-abstract class, which is not true 

472 # for the error case where instrument is None and datasets=[]. 

473 FormatterClass=formatterClass, # type: ignore 

474 instrument=instrument, 

475 ) 

476 

477 @classmethod 

478 def getObservationInfoSubsets(cls) -> Tuple[Set, Set]: 

479 """Return subsets of fields in the `ObservationInfo` that we care about 

480 

481 These fields will be used in constructing an exposure record. 

482 

483 Returns 

484 ------- 

485 required : `set` 

486 Set of `ObservationInfo` field names that are required. 

487 optional : `set` 

488 Set of `ObservationInfo` field names we will use if they are 

489 available. 
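
Examples
--------
A hypothetical subclass could extend these sets, for example to make an
additional `ObservationInfo` property available (sketch only; ``focus_z``
is just an illustration).

.. code-block:: python

    class MyIngestTask(RawIngestTask):
        @classmethod
        def getObservationInfoSubsets(cls):
            # Reuse the base sets and add one optional property.
            required, optional = super().getObservationInfoSubsets()
            optional |= {"focus_z"}
            return required, optional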

490 """ 

491 # Marking the new properties "group_counter_*" and 

492 # "has_simulated_content" as required, assumes that we either 

493 # recreate any existing index/sidecar files that include translated 

494 # values, or else allow astro_metadata_translator to fill in 

495 # defaults. 

496 required = { 

497 "datetime_begin", 

498 "datetime_end", 

499 "detector_num", 

500 "exposure_id", 

501 "exposure_time", 

502 "group_counter_end", 

503 "group_counter_start", 

504 "has_simulated_content", 

505 "instrument", 

506 "observation_id", 

507 "observation_type", 

508 "physical_filter", 

509 } 

510 optional = { 

511 "altaz_begin", 

512 "boresight_rotation_coord", 

513 "boresight_rotation_angle", 

514 "dark_time", 

515 "exposure_group", 

516 "tracking_radec", 

517 "object", 

518 "observation_counter", 

519 "observation_reason", 

520 "observing_day", 

521 "science_program", 

522 "visit_id", 

523 } 

524 return required, optional 

525 

526 def _calculate_dataset_info( 

527 self, header: Union[MutableMapping[str, Any], ObservationInfo], filename: ResourcePath 

528 ) -> RawFileDatasetInfo: 

529 """Calculate a RawFileDatasetInfo from the supplied information. 

530 

531 Parameters 

532 ---------- 

533 header : Mapping or `astro_metadata_translator.ObservationInfo` 

534 Header from the dataset or previously-translated content. 

535 filename : `lsst.resources.ResourcePath` 

536 Filename to use for error messages. 

537 

538 Returns 

539 ------- 

540 dataset : `RawFileDatasetInfo` 

541 The dataId, and observation information associated with this 

542 dataset. 

543 """ 

544 required, optional = self.getObservationInfoSubsets() 

545 if isinstance(header, ObservationInfo): 

546 obsInfo = header 

547 missing = [] 

548 # Need to check the required properties are present. 

549 for property in required: 

550 # getattr does not need to be protected because it is using 

551 # the defined list above containing properties that must exist. 

552 value = getattr(obsInfo, property) 

553 if value is None: 

554 missing.append(property) 

555 if missing: 

556 raise ValueError( 

557 f"Requested required properties are missing from file {filename}: {missing} (via JSON)" 

558 ) 

559 

560 else: 

561 obsInfo = ObservationInfo( 

562 header, 

563 pedantic=False, 

564 filename=str(filename), 

565 required=required, 

566 subset=required | optional, 

567 ) 

568 

569 dataId = DataCoordinate.standardize( 

570 instrument=obsInfo.instrument, 

571 exposure=obsInfo.exposure_id, 

572 detector=obsInfo.detector_num, 

573 universe=self.universe, 

574 ) 

575 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

576 

577 def locateAndReadIndexFiles( 

578 self, files: Iterable[ResourcePath] 

579 ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]: 

580 """Given a list of files, look for index files and read them. 

581 

582 Index files can either be explicitly in the list of files to 

583 ingest, or else located in the same directory as a file to ingest. 

584 Index entries are always used if present. 

585 

586 Parameters 

587 ---------- 

588 files : iterable over `lsst.resources.ResourcePath` 

589 URIs to the files to be ingested. 

590 

591 Returns 

592 ------- 

593 index : `dict` [`ResourcePath`, Any] 

594 Merged contents of all relevant index files found. These can 

595 be explicitly specified index files or ones found in the 

596 directory alongside a data file to be ingested. 

597 updated_files : `list` of `ResourcePath` 

598 Updated list of the input files with entries removed that were 

599 found listed in an index file. Order is not guaranteed to 

600 match the order of the files given to this routine. 

601 good_index_files: `set` [ `ResourcePath` ] 

602 Index files that were successfully read. 

603 bad_index_files: `set` [ `ResourcePath` ] 

604 Files that looked like index files but failed to read properly. 
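
Examples
--------
A sketch of how the return values are typically unpacked, assuming
``task`` is a constructed `RawIngestTask`; the paths are placeholders.

.. code-block:: python

    files = [ResourcePath("raws/_index.json"), ResourcePath("raws/a.fits")]
    index_entries, remaining, good, bad = task.locateAndReadIndexFiles(files)
    # Index entries are later converted to RawFileData structures.
    file_data = task.processIndexEntries(index_entries)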

605 """ 

606 # Convert the paths to absolute for easy comparison with index content. 

607 # Do not convert to real paths since we have to assume that index 

608 # files are in this location and not in the location a symlink resolves to. 

609 files = tuple(f.abspath() for f in files) 

610 

611 # Index files must be named this. 

612 index_root_file = "_index.json" 

613 

614 # Group the files by directory. 

615 files_by_directory = defaultdict(set) 

616 

617 for path in files: 

618 directory, file_in_dir = path.split() 

619 files_by_directory[directory].add(file_in_dir) 

620 

621 # All the metadata read from index files with keys of full path. 

622 index_entries: Dict[ResourcePath, Any] = {} 

623 

624 # Index files we failed to read. 

625 bad_index_files = set() 

626 

627 # Any good index files that were found and used. 

628 good_index_files = set() 

629 

630 # Look for index files in those directories. 

631 for directory, files_in_directory in files_by_directory.items(): 

632 possible_index_file = directory.join(index_root_file) 

633 if possible_index_file.exists(): 

634 # If we are explicitly requesting an index file the 

635 # messages should be different. 

636 index_msg = "inferred" 

637 is_implied = True 

638 if index_root_file in files_in_directory: 

639 index_msg = "explicit" 

640 is_implied = False 

641 

642 # Try to read the index file and catch and report any 

643 # problems. 

644 try: 

645 content = json.loads(possible_index_file.read()) 

646 index = process_index_data(content, force_dict=True) 

647 # mypy should in theory know that this is a mapping 

648 # from the overload type annotation of process_index_data. 

649 assert isinstance(index, MutableMapping) 

650 except Exception as e: 

651 # Only trigger the callback if the index file 

652 # was asked for explicitly. Triggering on implied file 

653 # might be surprising. 

654 if not is_implied: 

655 self._on_metadata_failure(possible_index_file, e) 

656 if self.config.failFast: 

657 raise RuntimeError( 

658 f"Problem reading index file from {index_msg} location {possible_index_file}" 

659 ) from e 

660 bad_index_files.add(possible_index_file) 

661 continue 

662 

663 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

664 good_index_files.add(possible_index_file) 

665 

666 # Go through the index adding entries for files. 

667 # If we have non-index files in this directory marked for 

668 # ingest we should only get index information for those. 

669 # If the index file was explicit we use all entries. 

670 if is_implied: 

671 files_to_ingest = files_in_directory 

672 else: 

673 files_to_ingest = set(index) 

674 

675 # Copy relevant metadata into a single dict for all index 

676 # entries. 

677 for file_in_dir in files_to_ingest: 

678 # Skip an explicitly specified index file. 

679 # This should never happen because an explicit index 

680 # file will force ingest of all files in the index 

681 # and not use the explicit file list. If somehow 

682 # this is not true we continue. Raising an exception 

683 # seems like the wrong thing to do since this is harmless. 

684 if file_in_dir == index_root_file: 

685 self.log.info( 

686 "Logic error found scanning directory %s. Please file ticket.", directory 

687 ) 

688 continue 

689 if file_in_dir in index: 

690 file = directory.join(file_in_dir) 

691 if file in index_entries: 

692 # ObservationInfo overrides raw metadata 

693 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance( 

694 index_entries[file], ObservationInfo 

695 ): 

696 self.log.warning( 

697 "File %s already specified in an index file but overriding" 

698 " with ObservationInfo content from %s", 

699 file, 

700 possible_index_file, 

701 ) 

702 else: 

703 self.log.warning( 

704 "File %s already specified in an index file, ignoring content from %s", 

705 file, 

706 possible_index_file, 

707 ) 

708 # Do nothing in this case 

709 continue 

710 

711 index_entries[file] = index[file_in_dir] 

712 

713 # Remove files from list that have index entries and also 

714 # any files that we determined to be explicit index files 

715 # or any index files that we failed to read. 

716 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

717 

718 # The filtered set loses the initial order. Retaining the order 

719 # is good for testing but has a cost when there are many files, 

720 # since the good values are copied out in order. A dict would have 

721 # faster lookups (using the files as keys) but use more memory. 

722 ordered = [f for f in filtered if f in files] 

723 

724 return index_entries, ordered, good_index_files, bad_index_files 

725 

726 def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]: 

727 """Convert index entries to RawFileData. 

728 

729 Parameters 

730 ---------- 

731 index_entries : `dict` [`ResourcePath`, Any] 

732 Dict indexed by name of file to ingest and with keys either 

733 raw metadata or translated 

734 `~astro_metadata_translator.ObservationInfo`. 

735 

736 Returns 

737 ------- 

738 data : `list` [ `RawFileData` ] 

739 Structures containing the metadata extracted from the file, 

740 as well as the original filename. All fields will be populated, 

741 but the data IDs in each `RawFileData.datasets` entry will be 

742 minimal (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. 

743 """ 

744 fileData = [] 

745 for filename, metadata in index_entries.items(): 

746 try: 

747 datasets = [self._calculate_dataset_info(metadata, filename)] 

748 except Exception as e: 

749 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e) 

750 datasets = [] 

751 formatterClass = Formatter 

752 instrument = None 

753 self._on_metadata_failure(filename, e) 

754 if self.config.failFast: 

755 raise RuntimeError( 

756 f"Problem extracting metadata for file {filename} found in index file" 

757 ) from e 

758 else: 

759 instrument, formatterClass = self._determine_instrument_formatter( 

760 datasets[0].dataId, filename 

761 ) 

762 if instrument is None: 

763 datasets = [] 

764 fileData.append( 

765 RawFileData( 

766 datasets=datasets, 

767 filename=filename, 

768 # MyPy wants this to be a non-abstract class, which is not 

769 # true for the error case where instrument is None and 

770 # datasets=[]. 

771 FormatterClass=formatterClass, # type: ignore 

772 instrument=instrument, 

773 ) 

774 ) 

775 return fileData 

776 

777 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]: 

778 """Group an iterable of `RawFileData` by exposure. 

779 

780 Parameters 

781 ---------- 

782 files : iterable of `RawFileData` 

783 File-level information to group. 

784 

785 Returns 

786 ------- 

787 exposures : `list` of `RawExposureData` 

788 A list of structures that group the file-level information by 

789 exposure. All fields will be populated. The 

790 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

791 `~lsst.daf.butler.DataCoordinate` instances. 

792 """ 

793 exposureDimensions = self.universe["exposure"].graph 

794 byExposure = defaultdict(list) 

795 for f in files: 

796 # Assume that the first dataset is representative for the file. 

797 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

798 

799 return [ 

800 RawExposureData( 

801 dataId=dataId, 

802 files=exposureFiles, 

803 universe=self.universe, 

804 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe), 

805 dependencyRecords=self.makeDependencyRecords( 

806 exposureFiles[0].datasets[0].obsInfo, self.universe 

807 ), 

808 ) 

809 for dataId, exposureFiles in byExposure.items() 

810 ] 

811 

812 def makeExposureRecord( 

813 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any 

814 ) -> DimensionRecord: 

815 """Construct a registry record for an exposure 

816 

817 This is a method that subclasses will often want to customize. This can 

818 often be done by calling this base class implementation with additional 

819 ``kwargs``. 

820 

821 Parameters 

822 ---------- 

823 obsInfo : `ObservationInfo` 

824 Observation details for (one of the components of) the exposure. 

825 universe : `DimensionUniverse` 

826 Set of all known dimensions. 

827 **kwargs 

828 Additional field values for this record. 

829 

830 Returns 

831 ------- 

832 record : `DimensionRecord` 

833 The exposure record that must be inserted into the 

834 `~lsst.daf.butler.Registry` prior to file-level ingest. 
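
Examples
--------
A hypothetical subclass override that forwards an extra field to the base
implementation (sketch only; ``my_field`` must correspond to a field of
the exposure record in the data model being used).

.. code-block:: python

    class MyIngestTask(RawIngestTask):
        def makeExposureRecord(self, obsInfo, universe, **kwargs):
            # ``my_field`` is illustrative; fill it from the observation
            # metadata and let the base class build the record.
            return super().makeExposureRecord(
                obsInfo, universe, my_field=obsInfo.observation_reason, **kwargs
            )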

835 """ 

836 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs) 

837 

838 def makeDependencyRecords( 

839 self, obsInfo: ObservationInfo, universe: DimensionUniverse 

840 ) -> Dict[str, DimensionRecord]: 

841 """Construct dependency records 

842 

843 These dependency records will be inserted into the 

844 `~lsst.daf.butler.Registry` before the exposure records, because they 

845 are dependencies of the exposure. This allows an opportunity to satisfy 

846 foreign key constraints that exist because of dimensions related to the 

847 exposure. 

848 

849 This is a method that subclasses may want to customize, if they've 

850 added dimensions that relate to an exposure. 

851 

852 Parameters 

853 ---------- 

854 obsInfo : `ObservationInfo` 

855 Observation details for (one of the components of) the exposure. 

856 universe : `DimensionUniverse` 

857 Set of all known dimensions. 

858 

859 Returns 

860 ------- 

861 records : `dict` [`str`, `DimensionRecord`] 

862 The records to insert, indexed by dimension name. 

863 """ 

864 return {} 

865 

866 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

867 """Expand the data IDs associated with a raw exposure. 

868 

869 This attaches the dimension records needed to expand the data IDs. 

870 

871 Parameters 

872 ---------- 

873 data : `RawExposureData` 

874 A structure containing information about the exposure to be 

875 ingested. Must have `RawExposureData.record` populated. Should 

876 be considered consumed upon return. 

877 

878 Returns 

879 ------- 

880 exposure : `RawExposureData` 

881 An updated version of the input structure, with 

882 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

883 updated to data IDs for which 

884 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

885 """ 

886 # We start by expanding the exposure-level data ID; we won't use that 

887 # directly in file ingest, but this lets us do some database lookups 

888 # once per exposure instead of once per file later. 

889 data.dataId = self.butler.registry.expandDataId( 

890 data.dataId, 

891 # We pass in the records we'll be inserting shortly so they aren't 

892 # looked up from the database. We do expect instrument and filter 

893 # records to be retrieved from the database here (though the 

894 # Registry may cache them so there isn't a lookup every time). 

895 records={"exposure": data.record}, 

896 ) 

897 # Now we expand the per-file (exposure+detector) data IDs. This time 

898 # we pass in the records we just retrieved from the exposure data ID 

899 # expansion. 

900 for file in data.files: 

901 for dataset in file.datasets: 

902 dataset.dataId = self.butler.registry.expandDataId( 

903 dataset.dataId, records=data.dataId.records 

904 ) 

905 return data 

906 

907 def prep( 

908 self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None 

909 ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]: 

910 """Perform all non-database-updating ingest preprocessing steps. 

911 

912 Parameters 

913 ---------- 

914 files : iterable over `lsst.resources.ResourcePath` 

915 Paths to the files to be ingested. Will be made absolute 

916 if they are not already. 

917 pool : `multiprocessing.Pool`, optional 

918 If not `None`, a process pool with which to parallelize some 

919 operations. 

920 

921 Returns 

922 ------- 

923 exposures : `Iterator` [ `RawExposureData` ] 

924 Data structures containing dimension records, filenames, and data 

925 IDs to be ingested (one structure for each exposure). 

926 bad_files : `list` of `lsst.resources.ResourcePath` 

927 List of all the files that could not have metadata extracted. 

928 """ 

929 mapFunc = map if pool is None else pool.imap_unordered 

930 

931 def _partition_good_bad( 

932 file_data: Iterable[RawFileData], 

933 ) -> Tuple[List[RawFileData], List[ResourcePath]]: 

934 """Filter out bad files and return good with list of bad.""" 

935 good_files = [] 

936 bad_files = [] 

937 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"): 

938 if not fileDatum.datasets: 

939 bad_files.append(fileDatum.filename) 

940 else: 

941 good_files.append(fileDatum) 

942 return good_files, bad_files 

943 

944 # Look for index files and read them. 

945 # There should be far fewer index files than data files. 

946 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

947 if bad_index_files: 

948 self.log.info("Failed to read the following explicitly requested index files:") 

949 for bad in sorted(bad_index_files): 

950 self.log.info("- %s", bad) 

951 

952 # Now convert all the index file entries to standard form for ingest. 

953 processed_bad_index_files: List[ResourcePath] = [] 

954 indexFileData = self.processIndexEntries(index_entries) 

955 if indexFileData: 

956 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData) 

957 self.log.info( 

958 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s", 

959 *_log_msg_counter(indexFileData), 

960 *_log_msg_counter(good_index_files), 

961 *_log_msg_counter(processed_bad_index_files), 

962 ) 

963 

964 # Extract metadata and build per-detector regions. 

965 # This could run in a subprocess so collect all output 

966 # before looking at failures. 

967 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

968 

969 # Filter out all the failed reads and store them for later 

970 # reporting. 

971 good_file_data, bad_files = _partition_good_bad(fileData) 

972 self.log.info( 

973 "Successfully extracted metadata from %d file%s with %d failure%s", 

974 *_log_msg_counter(good_file_data), 

975 *_log_msg_counter(bad_files), 

976 ) 

977 

978 # Combine with data from index files. 

979 good_file_data.extend(indexFileData) 

980 bad_files.extend(processed_bad_index_files) 

981 bad_files.extend(bad_index_files) 

982 

983 # Use that metadata to group files (and extracted metadata) by 

984 # exposure. Never parallelized because it's intrinsically a gather 

985 # step. 

986 exposureData: List[RawExposureData] = self.groupByExposure(good_file_data) 

987 

988 # The next operation operates on RawExposureData instances (one at 

989 # a time) in-place and then returns the modified instance. We call it 

990 # as a pass-through instead of relying on the arguments we pass in to 

991 # have been modified because in the parallel case those arguments are 

992 # going to be pickled and unpickled, and I'm not certain 

993 # multiprocessing is careful enough with that for output arguments to 

994 # work. 

995 

996 # Expand the data IDs to include all dimension metadata; we need this 

997 # because we may need to generate path templates that rely on that 

998 # metadata. 

999 # This is the first step that involves actual database calls (but just 

1000 # SELECTs), so if there's going to be a problem with connections vs. 

1001 # multiple processes, or lock contention (in SQLite) slowing things 

1002 # down, it'll happen here. 

1003 return mapFunc(self.expandDataIds, exposureData), bad_files 

1004 

1005 def ingestExposureDatasets( 

1006 self, 

1007 exposure: RawExposureData, 

1008 datasetType: DatasetType, 

1009 *, 

1010 run: str, 

1011 skip_existing_exposures: bool = False, 

1012 track_file_attrs: bool = True, 

1013 ) -> List[FileDataset]: 

1014 """Ingest all raw files in one exposure. 

1015 

1016 Parameters 

1017 ---------- 

1018 exposure : `RawExposureData` 

1019 A structure containing information about the exposure to be 

1020 ingested. Must have `RawExposureData.record` populated and all 

1021 data ID attributes expanded. 

1022 datasetType : `DatasetType` 

1023 The dataset type associated with this exposure. 

1024 run : `str` 

1025 Name of a RUN-type collection to write to. 

1026 skip_existing_exposures : `bool`, optional 

1027 If `True` (`False` is default), skip raws that have already been 

1028 ingested (i.e. raws for which we already have a dataset with the 

1029 same data ID in the target collection, even if from another file). 

1030 Note that this is much slower than just not passing 

1031 already-ingested files as inputs, because we still need to read and 

1032 process metadata to identify which exposures to search for. It 

1033 also will not work reliably if multiple processes are attempting to 

1034 ingest raws from the same exposure concurrently, in that different 

1035 processes may still attempt to ingest the same raw and conflict, 

1036 causing a failure that prevents other raws from the same exposure 

1037 from being ingested. 

1038 track_file_attrs : `bool`, optional 

1039 Control whether file attributes such as the size or checksum should 

1040 be tracked by the datastore. Whether this parameter is honored 

1041 depends on the specific datastore implementation. 

1042 

1043 Returns 

1044 ------- 

1045 datasets : `list` of `lsst.daf.butler.FileDataset` 

1046 Per-file structures identifying the files ingested and their 

1047 dataset representation in the data repository. 

1048 """ 

1049 if skip_existing_exposures: 

1050 existing = { 

1051 ref.dataId 

1052 for ref in self.butler.registry.queryDatasets( 

1053 datasetType, 

1054 collections=[run], 

1055 dataId=exposure.dataId, 

1056 ) 

1057 } 

1058 else: 

1059 existing = set() 

1060 

1061 # Raw files are preferentially ingested using a UUID derived from 

1062 # the collection name and dataId. 

1063 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN): 

1064 mode = DatasetIdGenEnum.DATAID_TYPE_RUN 

1065 else: 

1066 mode = DatasetIdGenEnum.UNIQUE 

1067 

1068 datasets = [] 

1069 for file in exposure.files: 

1070 refs = [ 

1071 DatasetRef(datasetType, d.dataId, run=run, id_generation_mode=mode) 

1072 for d in file.datasets 

1073 if d.dataId not in existing 

1074 ] 

1075 if refs: 

1076 datasets.append( 

1077 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass) 

1078 ) 

1079 

1080 self.butler.ingest( 

1081 *datasets, 

1082 transfer=self.config.transfer, 

1083 record_validation_info=track_file_attrs, 

1084 ) 

1085 return datasets 

1086 

1087 def ingestFiles( 

1088 self, 

1089 files: Iterable[ResourcePath], 

1090 *, 

1091 pool: Optional[PoolType] = None, 

1092 processes: int = 1, 

1093 run: Optional[str] = None, 

1094 skip_existing_exposures: bool = False, 

1095 update_exposure_records: bool = False, 

1096 track_file_attrs: bool = True, 

1097 ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]: 

1098 """Ingest files into a Butler data repository. 

1099 

1100 This creates any new exposure or visit Dimension entries needed to 

1101 identify the ingested files, creates new Dataset entries in the 

1102 Registry and finally ingests the files themselves into the Datastore. 

1103 Any needed instrument, detector, and physical_filter Dimension entries 

1104 must exist in the Registry before `run` is called. 

1105 

1106 Parameters 

1107 ---------- 

1108 files : iterable over `lsst.resources.ResourcePath` 

1109 URIs to the files to be ingested. 

1110 pool : `multiprocessing.Pool`, optional 

1111 If not `None`, a process pool with which to parallelize some 

1112 operations. 

1113 processes : `int`, optional 

1114 The number of processes to use. Ignored if ``pool`` is not `None`. 

1115 run : `str`, optional 

1116 Name of a RUN-type collection to write to, overriding 

1117 the default derived from the instrument name. 

1118 skip_existing_exposures : `bool`, optional 

1119 If `True` (`False` is default), skip raws that have already been 

1120 ingested (i.e. raws for which we already have a dataset with the 

1121 same data ID in the target collection, even if from another file). 

1122 Note that this is much slower than just not passing 

1123 already-ingested files as inputs, because we still need to read and 

1124 process metadata to identify which exposures to search for. It 

1125 also will not work reliably if multiple processes are attempting to 

1126 ingest raws from the same exposure concurrently, in that different 

1127 processes may still attempt to ingest the same raw and conflict, 

1128 causing a failure that prevents other raws from the same exposure 

1129 from being ingested. 

1130 update_exposure_records : `bool`, optional 

1131 If `True` (`False` is default), update existing exposure records 

1132 that conflict with the new ones instead of rejecting them. THIS IS 

1133 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1134 KNOWN TO BE BAD. This should usually be combined with 

1135 ``skip_existing_exposures=True``. 

1136 track_file_attrs : `bool`, optional 

1137 Control whether file attributes such as the size or checksum should 

1138 be tracked by the datastore. Whether this parameter is honored 

1139 depends on the specific datastore implementation. 

1140 

1141 Returns 

1142 ------- 

1143 refs : `list` of `lsst.daf.butler.DatasetRef` 

1144 Dataset references for ingested raws. 

1145 bad_files : `list` of `ResourcePath` 

1146 Given paths that could not be ingested. 

1147 n_exposures : `int` 

1148 Number of exposures successfully ingested. 

1149 n_exposures_failed : `int` 

1150 Number of exposures that failed when inserting dimension data. 

1151 n_ingests_failed : `int` 

1152 Number of exposures that failed when ingesting raw datasets. 

1153 """ 

1154 

1155 created_pool = False 

1156 if pool is None and processes > 1: 

1157 pool = Pool(processes) 

1158 created_pool = True 

1159 

1160 try: 

1161 exposureData, bad_files = self.prep(files, pool=pool) 

1162 finally: 

1163 if created_pool and pool: 

1164 # The pool is not needed any more so close it if we created 

1165 # it to ensure we clean up resources. 

1166 pool.close() 

1167 pool.join() 

1168 

1169 # Up to this point, we haven't modified the data repository at all. 

1170 # Now we finally do that, with one transaction per exposure. This is 

1171 # not parallelized at present because the performance of this step is 

1172 # limited by the database server. That may or may not change in the 

1173 # future once we increase our usage of bulk inserts and reduce our 

1174 # usage of savepoints; we've tried to get everything but the database 

1175 # operations done in advance to reduce the time spent inside 

1176 # transactions. 

1177 refs = [] 

1178 runs = set() 

1179 datasetTypes: dict[str, DatasetType] = {} 

1180 n_exposures = 0 

1181 n_exposures_failed = 0 

1182 n_ingests_failed = 0 

1183 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"): 

1184 assert exposure.record is not None, "Should be guaranteed by prep()" 

1185 self.log.debug( 

1186 "Attempting to ingest %d file%s from exposure %s:%s", 

1187 *_log_msg_counter(exposure.files), 

1188 exposure.record.instrument, 

1189 exposure.record.obs_id, 

1190 ) 

1191 

1192 try: 

1193 for name, record in exposure.dependencyRecords.items(): 

1194 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records) 

1195 inserted_or_updated = self.butler.registry.syncDimensionData( 

1196 "exposure", 

1197 exposure.record, 

1198 update=update_exposure_records, 

1199 ) 

1200 except Exception as e: 

1201 self._on_ingest_failure(exposure, e) 

1202 n_exposures_failed += 1 

1203 self.log.warning( 

1204 "Exposure %s:%s could not be registered: %s", 

1205 exposure.record.instrument, 

1206 exposure.record.obs_id, 

1207 e, 

1208 ) 

1209 if self.config.failFast: 

1210 raise e 

1211 continue 

1212 

1213 if isinstance(inserted_or_updated, dict): 

1214 # Exposure is in the registry and we updated it, so 

1215 # syncDimensionData returned a dict. 

1216 self.log.info( 

1217 "Exposure %s:%s was already present, but columns %s were updated.", 

1218 exposure.record.instrument, 

1219 exposure.record.obs_id, 

1220 str(list(inserted_or_updated.keys())), 

1221 ) 

1222 

1223 # Determine the instrument so we can work out the dataset type. 

1224 instrument = exposure.files[0].instrument 

1225 assert ( 

1226 instrument is not None 

1227 ), "file should have been removed from this list by prep if instrument could not be found" 

1228 

1229 if raw_definition := getattr(instrument, "raw_definition", None): 

1230 datasetTypeName, dimensions, storageClass = raw_definition 

1231 if not (datasetType := datasetTypes.get(datasetTypeName)): 

1232 datasetType = DatasetType( 

1233 datasetTypeName, dimensions, storageClass, universe=self.butler.registry.dimensions 

1234 ) 

1235 else: 

1236 datasetType = self.datasetType 

1237 if datasetType.name not in datasetTypes: 

1238 self.butler.registry.registerDatasetType(datasetType) 

1239 datasetTypes[datasetType.name] = datasetType 

1240 

1241 # Override default run if nothing specified explicitly. 

1242 if run is None: 

1243 this_run = instrument.makeDefaultRawIngestRunName() 

1244 else: 

1245 this_run = run 

1246 if this_run not in runs: 

1247 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

1248 runs.add(this_run) 

1249 try: 

1250 datasets_for_exposure = self.ingestExposureDatasets( 

1251 exposure, 

1252 datasetType=datasetType, 

1253 run=this_run, 

1254 skip_existing_exposures=skip_existing_exposures, 

1255 track_file_attrs=track_file_attrs, 

1256 ) 

1257 except Exception as e: 

1258 self._on_ingest_failure(exposure, e) 

1259 n_ingests_failed += 1 

1260 self.log.warning("Failed to ingest the following for reason: %s", e) 

1261 for f in exposure.files: 

1262 self.log.warning("- %s", f.filename) 

1263 if self.config.failFast: 

1264 raise e 

1265 continue 

1266 else: 

1267 self._on_success(datasets_for_exposure) 

1268 for dataset in datasets_for_exposure: 

1269 refs.extend(dataset.refs) 

1270 

1271 # Success for this exposure. 

1272 n_exposures += 1 

1273 self.log.info( 

1274 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id 

1275 ) 

1276 

1277 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

1278 

1279 @timeMethod 

1280 def run( 

1281 self, 

1282 files: Iterable[ResourcePathExpression], 

1283 *, 

1284 pool: Optional[PoolType] = None, 

1285 processes: int = 1, 

1286 run: Optional[str] = None, 

1287 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", 

1288 group_files: bool = True, 

1289 skip_existing_exposures: bool = False, 

1290 update_exposure_records: bool = False, 

1291 track_file_attrs: bool = True, 

1292 ) -> List[DatasetRef]: 

1293 """Ingest files into a Butler data repository. 

1294 

1295 This creates any new exposure or visit Dimension entries needed to 

1296 identify the ingested files, creates new Dataset entries in the 

1297 Registry and finally ingests the files themselves into the Datastore. 

1298 Any needed instrument, detector, and physical_filter Dimension entries 

1299 must exist in the Registry before `run` is called. 

1300 

1301 Parameters 

1302 ---------- 

1303 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like 

1304 Paths to the files to be ingested. Can refer to directories. 

1305 Will be made absolute if they are not already. 

1306 pool : `multiprocessing.Pool`, optional 

1307 If not `None`, a process pool with which to parallelize some 

1308 operations. 

1309 processes : `int`, optional 

1310 The number of processes to use. Ignored if ``pool`` is not `None`. 

1311 run : `str`, optional 

1312 Name of a RUN-type collection to write to, overriding 

1313 the default derived from the instrument name. 

1314 file_filter : `str` or `re.Pattern`, optional 

1315 Pattern to use to discover files to ingest within directories. 

1316 The default is to search for FITS files. The regex applies to 

1317 files within the directory. 

1318 group_files : `bool`, optional 

1319 Group files by directory if they have been discovered in 

1320 directories. Will not affect files explicitly provided. 

1321 skip_existing_exposures : `bool`, optional 

1322 If `True` (`False` is default), skip raws that have already been 

1323 ingested (i.e. raws for which we already have a dataset with the 

1324 same data ID in the target collection, even if from another file). 

1325 Note that this is much slower than just not passing 

1326 already-ingested files as inputs, because we still need to read and 

1327 process metadata to identify which exposures to search for. It 

1328 also will not work reliably if multiple processes are attempting to 

1329 ingest raws from the same exposure concurrently, in that different 

1330 processes may still attempt to ingest the same raw and conflict, 

1331 causing a failure that prevents other raws from the same exposure 

1332 from being ingested. 

1333 update_exposure_records : `bool`, optional 

1334 If `True` (`False` is default), update existing exposure records 

1335 that conflict with the new ones instead of rejecting them. THIS IS 

1336 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1337 KNOWN TO BE BAD. This should usually be combined with 

1338 ``skip_existing_exposures=True``. 

1339 track_file_attrs : `bool`, optional 

1340 Control whether file attributes such as the size or checksum should 

1341 be tracked by the datastore. Whether this parameter is honored 

1342 depends on the specific datastore implementation. 

1343 

1344 Returns 

1345 ------- 

1346 refs : `list` of `lsst.daf.butler.DatasetRef` 

1347 Dataset references for ingested raws. 

1348 

1349 Notes 

1350 ----- 

1351 This method inserts all datasets for an exposure within a transaction, 

1352 guaranteeing that partial exposures are never ingested. The exposure 

1353 dimension record is inserted with `Registry.syncDimensionData` first 

1354 (in its own transaction), which inserts only if a record with the same 

1355 primary key does not already exist. This allows different files within 

1356 the same exposure to be ingested in different runs. 
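
Examples
--------
A sketch of ingesting all FITS files discovered under a directory,
assuming ``task`` is a constructed `RawIngestTask`; the path is a
placeholder.

.. code-block:: python

    refs = task.run(["/data/raws/night1/"], file_filter=r"\.fits$")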

1357 """ 

1358 

1359 refs = [] 

1360 bad_files = [] 

1361 n_exposures = 0 

1362 n_exposures_failed = 0 

1363 n_ingests_failed = 0 

1364 if group_files: 

1365 for group in ResourcePath.findFileResources(files, file_filter, group_files): 

1366 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles( 

1367 group, 

1368 pool=pool, 

1369 processes=processes, 

1370 run=run, 

1371 skip_existing_exposures=skip_existing_exposures, 

1372 update_exposure_records=update_exposure_records, 

1373 track_file_attrs=track_file_attrs, 

1374 ) 

1375 refs.extend(new_refs) 

1376 bad_files.extend(bad) 

1377 n_exposures += n_exp 

1378 n_exposures_failed += n_exp_fail 

1379 n_ingests_failed += n_ingest_fail 

1380 else: 

1381 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

1382 ResourcePath.findFileResources(files, file_filter, group_files), 

1383 pool=pool, 

1384 processes=processes, 

1385 run=run, 

1386 skip_existing_exposures=skip_existing_exposures, 

1387 update_exposure_records=update_exposure_records, 

track_file_attrs=track_file_attrs, 

1388 ) 

1389 

1390 had_failure = False 

1391 

1392 if bad_files: 

1393 had_failure = True 

1394 self.log.warning("Could not extract observation metadata from the following:") 

1395 for f in bad_files: 

1396 self.log.warning("- %s", f) 

1397 

1398 self.log.info( 

1399 "Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1400 " registration and %d failure%s from file ingest.", 

1401 *_log_msg_counter(n_exposures), 

1402 *_log_msg_counter(n_exposures_failed), 

1403 *_log_msg_counter(n_ingests_failed), 

1404 ) 

1405 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1406 had_failure = True 

1407 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1408 

1409 if had_failure: 

1410 raise RuntimeError("Some failures encountered during ingestion") 

1411 

1412 return refs