Coverage for python/lsst/obs/base/ingest.py: 17% of 343 statements (coverage.py v6.5.0, created at 2023-01-31 02:35 -0800)

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import json 

26import re 

27from collections import defaultdict 

28from dataclasses import InitVar, dataclass 

29from multiprocessing import Pool 

30from typing import ( 

31 Any, 

32 Callable, 

33 ClassVar, 

34 Dict, 

35 Iterable, 

36 Iterator, 

37 List, 

38 MutableMapping, 

39 Optional, 

40 Set, 

41 Sized, 

42 Tuple, 

43 Type, 

44 Union, 

45) 

46 

47from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers 

48from astro_metadata_translator.indexing import process_index_data, process_sidecar_data 

49from lsst.afw.fits import readMetadata 

50from lsst.daf.butler import ( 

51 Butler, 

52 CollectionType, 

53 DataCoordinate, 

54 DatasetIdGenEnum, 

55 DatasetRef, 

56 DatasetType, 

57 DimensionRecord, 

58 DimensionUniverse, 

59 FileDataset, 

60 Formatter, 

61 Progress, 

62) 

63from lsst.pex.config import ChoiceField, Config, Field 

64from lsst.pipe.base import Instrument, Task 

65from lsst.resources import ResourcePath, ResourcePathExpression 

66from lsst.utils.timer import timeMethod 

67 

68from ._instrument import makeExposureRecordFromObsInfo 

69 

70# multiprocessing.Pool is actually a function, not a type, and the real type

71# isn't exposed, so we can't use it in annotations; we'll just punt on it via

72# this alias instead.

73PoolType = Any 

74 

75 

76def _do_nothing(*args: Any, **kwargs: Any) -> None: 

77 """Do nothing. 

78 

79 This is a function that accepts anything and does nothing. 

80 For use as a default in callback arguments. 

81 """ 

82 pass 

83 

84 

85def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]: 

86 """Count the iterable and return the count and plural modifier. 

87 

88 Parameters 

89 ---------- 

90 noun : `Sized` or `int` 

91 Thing to count. If given an integer it is assumed to be the count 

92 to use to calculate modifier. 

93 

94 Returns 

95 ------- 

96 num : `int` 

97 Number of items found in ``noun``. 

98 modifier : `str` 

99 Character to add to the end of a string referring to these items 

100 to indicate whether it was a single item or not. Returns empty 

101 string if there is one item or "s" otherwise. 

102 

103 Examples 

104 -------- 

105 

106 .. code-block:: python 

107 

108 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

109 """ 

110 if isinstance(noun, int): 

111 num = noun 

112 else: 

113 num = len(noun) 

114 return num, "" if num == 1 else "s" 

115 

116 

117@dataclass 

118class RawFileDatasetInfo: 

119 """Information about a single dataset within a raw file.""" 

120 

121 dataId: DataCoordinate 

122 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

123 

124 obsInfo: ObservationInfo 

125 """Standardized observation metadata extracted directly from the file 

126 headers (`astro_metadata_translator.ObservationInfo`). 

127 """ 

128 

129 

130@dataclass 

131class RawFileData: 

132 """Information about a single raw file, used during ingest.""" 

133 

134 datasets: List[RawFileDatasetInfo] 

135 """The information describing each dataset within this raw file. 

136 (`list` of `RawFileDatasetInfo`) 

137 """ 

138 

139 filename: ResourcePath 

140 """URI of the file this information was extracted from (`str`). 

141 

142 This is the path prior to ingest, not the path after ingest. 

143 """ 

144 

145 FormatterClass: Type[Formatter] 

146 """Formatter class that should be used to ingest this file (`type`; as 

147 subclass of `Formatter`). 

148 """ 

149 

150 instrument: Optional[Instrument] 

151 """The `Instrument` instance associated with this file. Can be `None` 

152 if ``datasets`` is an empty list.""" 

153 

154 

155@dataclass 

156class RawExposureData: 

157 """Information about a complete raw exposure, used during ingest.""" 

158 

159 dataId: DataCoordinate 

160 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

161 """ 

162 

163 files: List[RawFileData] 

164 """List of structures containing file-level information. 

165 """ 

166 

167 universe: InitVar[DimensionUniverse] 

168 """Set of all known dimensions. 

169 """ 

170 

171 record: DimensionRecord 

172 """The exposure `DimensionRecord` that must be inserted into the 

173 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`). 

174 """ 

175 

176 dependencyRecords: Dict[str, DimensionRecord] 

177 """Additional records that must be inserted into the 

178 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record`` 

179 (e.g., to satisfy foreign key constraints), indexed by the dimension name. 

180 """ 

181 

182 

183def makeTransferChoiceField( 

184 doc: str = "How to transfer files (None for no transfer).", default: str = "auto" 

185) -> ChoiceField: 

186 """Create a Config field with options for transferring data between repos. 

187 

188 The allowed options for the field are exactly those supported by 

189 `lsst.daf.butler.Datastore.ingest`. 

190 

191 Parameters 

192 ---------- 

193 doc : `str` 

194 Documentation for the configuration field. 

195 default : `str`, optional 

196 Default transfer mode for the field. 

197 

198 Returns 

199 ------- 

200 field : `lsst.pex.config.ChoiceField` 

201 Configuration field. 
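
Examples
--------
A minimal sketch of using this helper inside a config class (the
``ExampleConfig`` name is illustrative only):

.. code-block:: python

    from lsst.obs.base import makeTransferChoiceField
    from lsst.pex.config import Config

    class ExampleConfig(Config):
        transfer = makeTransferChoiceField(default="symlink")

    config = ExampleConfig()
    config.transfer = "copy"  # any allowed mode, or None for no transfer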

202 """ 

203 return ChoiceField( 

204 doc=doc, 

205 dtype=str, 

206 allowed={ 

207 "move": "move", 

208 "copy": "copy", 

209 "auto": "choice will depend on datastore", 

210 "direct": "use URI to ingested file directly in datastore", 

211 "link": "hard link falling back to symbolic link", 

212 "hardlink": "hard link", 

213 "symlink": "symbolic (soft) link", 

214 "relsymlink": "relative symbolic link", 

215 }, 

216 optional=True, 

217 default=default, 

218 ) 

219 

220 

221class RawIngestConfig(Config): 

222 """Configuration class for RawIngestTask.""" 

223 

224 transfer = makeTransferChoiceField() 

225 failFast: Field[bool] = Field( 

226 dtype=bool, 

227 default=False, 

228 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

229 "Otherwise problem files will be skipped and logged and a report issued at completion.", 

230 ) 

231 

232 

233class RawIngestTask(Task): 

234 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

235 

236 Parameters 

237 ---------- 

238 config : `RawIngestConfig` 

239 Configuration for the task. 

240 butler : `~lsst.daf.butler.Butler` 

241 Writeable butler instance, with ``butler.run`` set to the appropriate 

242 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

243 datasets. 

244 on_success : `Callable`, optional 

245 A callback invoked when all of the raws associated with an exposure 

246 are ingested. Will be passed a list of `FileDataset` objects, each 

247 containing one or more resolved `DatasetRef` objects. If this callback 

248 raises it will interrupt the entire ingest process, even if 

249 `RawIngestConfig.failFast` is `False`. 

250 on_metadata_failure : `Callable`, optional 

251 A callback invoked when a failure occurs trying to translate the 

252 metadata for a file. Will be passed the URI and the exception, in 

253 that order, as positional arguments. Guaranteed to be called in an 

254 ``except`` block, allowing the callback to re-raise or replace (with 

255 ``raise ... from``) to override the task's usual error handling (before 

256 `RawIngestConfig.failFast` logic occurs). 

257 on_ingest_failure : `Callable`, optional 

258 A callback invoked when dimension record or dataset insertion into the 

259 database fails for an exposure. Will be passed a `RawExposureData` 

260 instance and the exception, in that order, as positional arguments. 

261 Guaranteed to be called in an ``except`` block, allowing the callback 

262 to re-raise or replace (with ``raise ... from``) to override the task's 

263 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

264 **kwargs 

265 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

266 constructor. 

267 

268 Notes 

269 ----- 

270 Each instance of `RawIngestTask` writes to the same Butler. Each 

271 invocation of `RawIngestTask.run` ingests a list of files. 
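
Examples
--------
A minimal usage sketch; the repository path and the raw file location are
placeholders:

.. code-block:: python

    from lsst.daf.butler import Butler
    from lsst.obs.base import RawIngestConfig, RawIngestTask

    butler = Butler("/path/to/repo", writeable=True)
    task = RawIngestTask(config=RawIngestConfig(), butler=butler)
    refs = task.run(["/path/to/raw/files"])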

272 """ 

273 

274 ConfigClass: ClassVar[Type[Config]] = RawIngestConfig 

275 

276 _DefaultName: ClassVar[str] = "ingest" 

277 

278 def getDatasetType(self) -> DatasetType: 

279 """Return the DatasetType of the datasets ingested by this Task.""" 

280 return DatasetType( 

281 "raw", 

282 ("instrument", "detector", "exposure"), 

283 "Exposure", 

284 universe=self.butler.registry.dimensions, 

285 ) 

286 

287 # Mypy cannot determine that the config passed to super() is this type.

288 config: RawIngestConfig 

289 

290 def __init__( 

291 self, 

292 config: RawIngestConfig, 

293 *, 

294 butler: Butler, 

295 on_success: Callable[[List[FileDataset]], Any] = _do_nothing, 

296 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing, 

297 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

298 **kwargs: Any, 

299 ): 

300 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

301 super().__init__(config, **kwargs) 

302 self.butler = butler 

303 self.universe = self.butler.registry.dimensions 

304 self.datasetType = self.getDatasetType() 

305 self._on_success = on_success 

306 self._on_metadata_failure = on_metadata_failure 

307 self._on_ingest_failure = on_ingest_failure 

308 self.progress = Progress("obs.base.RawIngestTask") 

309 

310 # Import all the instrument classes so that we ensure that we 

311 # have all the relevant metadata translators loaded. 

312 Instrument.importAll(self.butler.registry) 

313 

314 def _reduce_kwargs(self) -> Dict[str, Any]: 

315 # Add extra parameters to pickle. 

316 return dict( 

317 **super()._reduce_kwargs(), 

318 butler=self.butler, 

319 on_success=self._on_success, 

320 on_metadata_failure=self._on_metadata_failure, 

321 on_ingest_failure=self._on_ingest_failure, 

322 ) 

323 

324 def _determine_instrument_formatter( 

325 self, dataId: DataCoordinate, filename: ResourcePath 

326 ) -> Tuple[Optional[Instrument], Type[Formatter]]: 

327 """Determine the instrument and formatter class. 

328 

329 Parameters 

330 ---------- 

331 dataId : `lsst.daf.butler.DataCoordinate` 

332 The dataId associated with this dataset. 

333 filename : `lsst.resources.ResourcePath` 

334 URI of file used for error reporting. 

335 

336 Returns 

337 ------- 

338 instrument : `Instrument` or `None` 

339 Instance of the `Instrument` associated with this dataset. `None` 

340 indicates that the instrument could not be determined. 

341 formatterClass : `type` 

342 Class to be used as the formatter for this dataset. 

343 """ 

344 # The data model currently assumes that whilst multiple datasets 

345 # can be associated with a single file, they must all share the 

346 # same formatter. 

347 try: 

348 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore 

349 except LookupError as e: 

350 self._on_metadata_failure(filename, e) 

351 self.log.warning( 

352 "Instrument %s for file %s not known to registry", dataId["instrument"], filename 

353 ) 

354 if self.config.failFast: 

355 raise RuntimeError( 

356 f"Instrument {dataId['instrument']} for file {filename} not known to registry" 

357 ) from e 

358 FormatterClass = Formatter 

359 # Indicate that we could not work out the instrument. 

360 instrument = None 

361 else: 

362 assert instrument is not None, "Should be guaranteed by fromName succeeding."

363 FormatterClass = instrument.getRawFormatter(dataId) 

364 return instrument, FormatterClass 

365 

366 def extractMetadata(self, filename: ResourcePath) -> RawFileData: 

367 """Extract and process metadata from a single raw file. 

368 

369 Parameters 

370 ---------- 

371 filename : `lsst.resources.ResourcePath` 

372 URI to the file. 

373 

374 Returns 

375 ------- 

376 data : `RawFileData` 

377 A structure containing the metadata extracted from the file, 

378 as well as the original filename. All fields will be populated, 

379 but the `RawFileData.dataId` attribute will be a minimal 

380 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The 

381 ``instrument`` field will be `None` if there is a problem 

382 with metadata extraction. 

383 

384 Notes 

385 ----- 

386 Assumes that there is a single dataset associated with the given 

387 file. Instruments using a single file to store multiple datasets 

388 must implement their own version of this method. 

389 

390 By default the method will catch all exceptions unless the ``failFast`` 

391 configuration item is `True`. If an error is encountered the 

392 `_on_metadata_failure()` method will be called. If the error does

393 not cause an exception to be raised, the returned object will have

394 ``instrument`` set to `None` and an empty ``datasets`` list.

395 

396 This method supports sidecar JSON files which can be used to 

397 extract metadata without having to read the data file itself. 

398 The sidecar file is always used if found. 
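
For example, given a previously constructed ``task``, a raw file named
``raw_0001.fits`` will use a sidecar ``raw_0001.json`` in the same
directory if it exists (the file names here are illustrative):

.. code-block:: python

    from lsst.resources import ResourcePath

    data = task.extractMetadata(ResourcePath("file:///data/raw_0001.fits"))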

399 """ 

400 sidecar_fail_msg = "" # Requires prepended space when set. 

401 try: 

402 sidecar_file = filename.updatedExtension(".json") 

403 if sidecar_file.exists(): 

404 content = json.loads(sidecar_file.read()) 

405 headers = [process_sidecar_data(content)] 

406 sidecar_fail_msg = " (via sidecar)" 

407 else: 

408 # Read the metadata from the data file itself. 

409 

410 # For remote files download the entire file to get the 

411 # header. This is very inefficient and it would be better 

412 # to have some way of knowing where in the file the headers 

413 # are and to only download those parts of the file. 

414 with filename.as_local() as local_file: 

415 # Read the primary. This might be sufficient. 

416 header = readMetadata(local_file.ospath, 0) 

417 

418 try: 

419 # Try to work out a translator class early. 

420 translator_class = MetadataTranslator.determine_translator( 

421 header, filename=str(filename) 

422 ) 

423 except ValueError: 

424 # Primary header was not sufficient (maybe this file 

425 # has been compressed or is a MEF with minimal 

426 # primary). Read second header and merge with primary. 

427 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite") 

428 

429 # Try again to work out a translator class, letting this 

430 # fail. 

431 translator_class = MetadataTranslator.determine_translator(header, filename=str(filename)) 

432 

433 # Request the headers to use for ingest 

434 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header)) 

435 

436 # Add each header to the dataset list 

437 datasets = [self._calculate_dataset_info(h, filename) for h in headers] 

438 

439 except Exception as e: 

440 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

441 # Indicate to the caller that we failed to read. 

442 datasets = [] 

443 formatterClass = Formatter 

444 instrument = None 

445 self._on_metadata_failure(filename, e) 

446 if self.config.failFast: 

447 raise RuntimeError( 

448 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}" 

449 ) from e 

450 else: 

451 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

452 # The data model currently assumes that whilst multiple datasets 

453 # can be associated with a single file, they must all share the 

454 # same formatter. 

455 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

456 if instrument is None: 

457 datasets = [] 

458 

459 return RawFileData( 

460 datasets=datasets, 

461 filename=filename, 

462 # MyPy wants this to be a non-abstract class, which is not true 

463 # for the error case where instrument is None and datasets=[]. 

464 FormatterClass=formatterClass, # type: ignore 

465 instrument=instrument, 

466 ) 

467 

468 @classmethod 

469 def getObservationInfoSubsets(cls) -> Tuple[Set, Set]: 

470 """Return subsets of fields in the `ObservationInfo` that we care about 

471 

472 These fields will be used in constructing an exposure record. 

473 

474 Returns 

475 ------- 

476 required : `set` 

477 Set of `ObservationInfo` field names that are required. 

478 optional : `set` 

479 Set of `ObservationInfo` field names we will use if they are 

480 available. 
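
Examples
--------
A sketch of how these subsets are used when translating a raw header;
``header`` is assumed to be a metadata mapping read from a file:

.. code-block:: python

    from astro_metadata_translator import ObservationInfo

    required, optional = RawIngestTask.getObservationInfoSubsets()
    obsInfo = ObservationInfo(
        header,
        pedantic=False,
        required=required,
        subset=required | optional,
    )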

481 """ 

482 # Marking the new properties "group_counter_*" and 

483 # "has_simulated_content" as required, assumes that we either 

484 # recreate any existing index/sidecar files that include translated 

485 # values, or else allow astro_metadata_translator to fill in 

486 # defaults. 

487 required = { 

488 "datetime_begin", 

489 "datetime_end", 

490 "detector_num", 

491 "exposure_id", 

492 "exposure_time", 

493 "group_counter_end", 

494 "group_counter_start", 

495 "has_simulated_content", 

496 "instrument", 

497 "observation_id", 

498 "observation_type", 

499 "physical_filter", 

500 } 

501 optional = { 

502 "altaz_begin", 

503 "boresight_rotation_coord", 

504 "boresight_rotation_angle", 

505 "dark_time", 

506 "exposure_group", 

507 "tracking_radec", 

508 "object", 

509 "observation_counter", 

510 "observation_reason", 

511 "observing_day", 

512 "science_program", 

513 "visit_id", 

514 } 

515 return required, optional 

516 

517 def _calculate_dataset_info( 

518 self, header: Union[MutableMapping[str, Any], ObservationInfo], filename: ResourcePath 

519 ) -> RawFileDatasetInfo: 

520 """Calculate a RawFileDatasetInfo from the supplied information. 

521 

522 Parameters 

523 ---------- 

524 header : Mapping or `astro_metadata_translator.ObservationInfo` 

525 Header from the dataset or previously-translated content. 

526 filename : `lsst.resources.ResourcePath` 

527 Filename to use for error messages. 

528 

529 Returns 

530 ------- 

531 dataset : `RawFileDatasetInfo` 

532 The dataId, and observation information associated with this 

533 dataset. 

534 """ 

535 required, optional = self.getObservationInfoSubsets() 

536 if isinstance(header, ObservationInfo): 

537 obsInfo = header 

538 missing = [] 

539 # Need to check the required properties are present. 

540 for property in required: 

541 # getattr does not need to be protected because it is using 

542 # the defined list above containing properties that must exist. 

543 value = getattr(obsInfo, property) 

544 if value is None: 

545 missing.append(property) 

546 if missing: 

547 raise ValueError( 

548 f"Requested required properties are missing from file {filename}:" 

549 f" {missing} (via JSON)" 

550 ) 

551 

552 else: 

553 obsInfo = ObservationInfo( 

554 header, 

555 pedantic=False, 

556 filename=str(filename), 

557 required=required, 

558 subset=required | optional, 

559 ) 

560 

561 dataId = DataCoordinate.standardize( 

562 instrument=obsInfo.instrument, 

563 exposure=obsInfo.exposure_id, 

564 detector=obsInfo.detector_num, 

565 universe=self.universe, 

566 ) 

567 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

568 

569 def locateAndReadIndexFiles( 

570 self, files: Iterable[ResourcePath] 

571 ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]: 

572 """Given a list of files, look for index files and read them. 

573 

574 Index files can either be explicitly in the list of files to 

575 ingest, or else located in the same directory as a file to ingest. 

576 Index entries are always used if present. 

577 

578 Parameters 

579 ---------- 

580 files : iterable over `lsst.resources.ResourcePath` 

581 URIs to the files to be ingested. 

582 

583 Returns 

584 ------- 

585 index : `dict` [`ResourcePath`, Any] 

586 Merged contents of all relevant index files found. These can 

587 be explicitly specified index files or ones found in the 

588 directory alongside a data file to be ingested. 

589 updated_files : `list` of `ResourcePath` 

590 Updated list of the input files with entries removed that were 

591 found listed in an index file. Order is not guaranteed to 

592 match the order of the files given to this routine. 

593 good_index_files : `set` [ `ResourcePath` ]

594 Index files that were successfully read.

595 bad_index_files : `set` [ `ResourcePath` ]

596 Files that looked like index files but failed to read properly. 
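
Examples
--------
A sketch of typical use, given a previously constructed ``task`` (the
file path is a placeholder):

.. code-block:: python

    from lsst.resources import ResourcePath

    files = [ResourcePath("file:///data/raws/file_0001.fits")]
    index, remaining, good, bad = task.locateAndReadIndexFiles(files)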

597 """ 

598 # Convert the paths to absolute for easy comparison with index content. 

599 # Do not convert to real paths since we have to assume that index 

600 # files are in this location and not the location they link to.

601 files = tuple(f.abspath() for f in files) 

602 

603 # Index files must be named this. 

604 index_root_file = "_index.json" 

605 

606 # Group the files by directory. 

607 files_by_directory = defaultdict(set) 

608 

609 for path in files: 

610 directory, file_in_dir = path.split() 

611 files_by_directory[directory].add(file_in_dir) 

612 

613 # All the metadata read from index files with keys of full path. 

614 index_entries: Dict[ResourcePath, Any] = {} 

615 

616 # Index files we failed to read. 

617 bad_index_files = set() 

618 

619 # Any good index files that were found and used. 

620 good_index_files = set() 

621 

622 # Look for index files in those directories. 

623 for directory, files_in_directory in files_by_directory.items(): 

624 possible_index_file = directory.join(index_root_file) 

625 if possible_index_file.exists(): 

626 # If we are explicitly requesting an index file the 

627 # messages should be different. 

628 index_msg = "inferred" 

629 is_implied = True 

630 if index_root_file in files_in_directory: 

631 index_msg = "explicit" 

632 is_implied = False 

633 

634 # Try to read the index file and catch and report any 

635 # problems. 

636 try: 

637 content = json.loads(possible_index_file.read()) 

638 index = process_index_data(content, force_dict=True) 

639 # mypy should in theory know that this is a mapping 

640 # from the overload type annotation of process_index_data. 

641 assert isinstance(index, MutableMapping) 

642 except Exception as e: 

643 # Only trigger the callback if the index file 

644 # was asked for explicitly. Triggering on implied file 

645 # might be surprising. 

646 if not is_implied: 

647 self._on_metadata_failure(possible_index_file, e) 

648 if self.config.failFast: 

649 raise RuntimeError( 

650 f"Problem reading index file from {index_msg} location {possible_index_file}" 

651 ) from e 

652 bad_index_files.add(possible_index_file) 

653 continue 

654 

655 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

656 good_index_files.add(possible_index_file) 

657 

658 # Go through the index adding entries for files. 

659 # If we have non-index files in this directory marked for 

660 # ingest we should only get index information for those. 

661 # If the index file was explicit we use all entries. 

662 if is_implied: 

663 files_to_ingest = files_in_directory 

664 else: 

665 files_to_ingest = set(index) 

666 

667 # Copy relevant metadata into a single dict for all index 

668 # entries. 

669 for file_in_dir in files_to_ingest: 

670 # Skip an explicitly specified index file. 

671 # This should never happen because an explicit index 

672 # file will force ingest of all files in the index 

673 # and not use the explicit file list. If somehow 

674 # this is not true we continue. Raising an exception 

675 # seems like the wrong thing to do since this is harmless. 

676 if file_in_dir == index_root_file: 

677 self.log.info( 

678 "Logic error found scanning directory %s. Please file ticket.", directory 

679 ) 

680 continue 

681 if file_in_dir in index: 

682 file = directory.join(file_in_dir) 

683 if file in index_entries: 

684 # ObservationInfo overrides raw metadata 

685 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance( 

686 index_entries[file], ObservationInfo 

687 ): 

688 self.log.warning( 

689 "File %s already specified in an index file but overriding" 

690 " with ObservationInfo content from %s", 

691 file, 

692 possible_index_file, 

693 ) 

694 else: 

695 self.log.warning( 

696 "File %s already specified in an index file, ignoring content from %s", 

697 file, 

698 possible_index_file, 

699 ) 

700 # Do nothing in this case 

701 continue 

702 

703 index_entries[file] = index[file_in_dir] 

704 

705 # Remove files from list that have index entries and also 

706 # any files that we determined to be explicit index files 

707 # or any index files that we failed to read. 

708 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

709 

710 # The filtered list loses the initial order. Retaining the order 

711 # is good for testing but does have a cost if there are many 

712 # files when copying the good values out. A dict would have faster 

713 # lookups (using the files as keys) but use more memory. 

714 ordered = [f for f in filtered if f in files] 

715 

716 return index_entries, ordered, good_index_files, bad_index_files 

717 

718 def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]: 

719 """Convert index entries to RawFileData. 

720 

721 Parameters 

722 ---------- 

723 index_entries : `dict` [`ResourcePath`, Any] 

724 Dict indexed by name of file to ingest and with keys either 

725 raw metadata or translated 

726 `~astro_metadata_translator.ObservationInfo`. 

727 

728 Returns 

729 ------- 

730 data : `list` [ `RawFileData` ] 

731 Structures containing the metadata extracted from the file, 

732 as well as the original filename. All fields will be populated, 

733 but the `RawFileData.dataId` attributes will be minimal 

734 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. 

735 """ 

736 fileData = [] 

737 for filename, metadata in index_entries.items(): 

738 try: 

739 datasets = [self._calculate_dataset_info(metadata, filename)] 

740 except Exception as e: 

741 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e) 

742 datasets = [] 

743 formatterClass = Formatter 

744 instrument = None 

745 self._on_metadata_failure(filename, e) 

746 if self.config.failFast: 

747 raise RuntimeError( 

748 f"Problem extracting metadata for file {filename} found in index file" 

749 ) from e 

750 else: 

751 instrument, formatterClass = self._determine_instrument_formatter( 

752 datasets[0].dataId, filename 

753 ) 

754 if instrument is None: 

755 datasets = [] 

756 fileData.append( 

757 RawFileData( 

758 datasets=datasets, 

759 filename=filename, 

760 # MyPy wants this to be a non-abstract class, which is not 

761 # true for the error case where instrument is None and 

762 # datasets=[]. 

763 FormatterClass=formatterClass, # type: ignore 

764 instrument=instrument, 

765 ) 

766 ) 

767 return fileData 

768 

769 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]: 

770 """Group an iterable of `RawFileData` by exposure. 

771 

772 Parameters 

773 ---------- 

774 files : iterable of `RawFileData` 

775 File-level information to group. 

776 

777 Returns 

778 ------- 

779 exposures : `list` of `RawExposureData` 

780 A list of structures that group the file-level information by 

781 exposure. All fields will be populated. The 

782 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

783 `~lsst.daf.butler.DataCoordinate` instances. 

784 """ 

785 exposureDimensions = self.universe["exposure"].graph 

786 byExposure = defaultdict(list) 

787 for f in files: 

788 # Assume that the first dataset is representative for the file. 

789 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

790 

791 return [ 

792 RawExposureData( 

793 dataId=dataId, 

794 files=exposureFiles, 

795 universe=self.universe, 

796 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe), 

797 dependencyRecords=self.makeDependencyRecords( 

798 exposureFiles[0].datasets[0].obsInfo, self.universe 

799 ), 

800 ) 

801 for dataId, exposureFiles in byExposure.items() 

802 ] 

803 

804 def makeExposureRecord( 

805 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any 

806 ) -> DimensionRecord: 

807 """Construct a registry record for an exposure 

808 

809 This is a method that subclasses will often want to customize. This can 

810 often be done by calling this base class implementation with additional 

811 ``kwargs``. 

812 

813 Parameters 

814 ---------- 

815 obsInfo : `ObservationInfo` 

816 Observation details for (one of the components of) the exposure. 

817 universe : `DimensionUniverse` 

818 Set of all known dimensions. 

819 **kwargs 

820 Additional field values for this record. 

821 

822 Returns 

823 ------- 

824 record : `DimensionRecord` 

825 The exposure record that must be inserted into the 

826 `~lsst.daf.butler.Registry` prior to file-level ingest. 
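
Examples
--------
A sketch of a subclass forwarding an extra field to the base class
implementation (``my_extra_field`` is hypothetical and would need to
exist in the exposure dimension schema):

.. code-block:: python

    from lsst.obs.base import RawIngestTask

    class MyRawIngestTask(RawIngestTask):
        def makeExposureRecord(self, obsInfo, universe, **kwargs):
            return super().makeExposureRecord(
                obsInfo, universe, my_extra_field="value", **kwargs
            )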

827 """ 

828 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs) 

829 

830 def makeDependencyRecords( 

831 self, obsInfo: ObservationInfo, universe: DimensionUniverse 

832 ) -> Dict[str, DimensionRecord]: 

833 """Construct dependency records 

834 

835 These dependency records will be inserted into the 

836 `~lsst.daf.butler.Registry` before the exposure records, because they 

837 are dependencies of the exposure. This allows an opportunity to satisfy 

838 foreign key constraints that exist because of dimensions related to the 

839 exposure. 

840 

841 This is a method that subclasses may want to customize, if they've 

842 added dimensions that relate to an exposure. 

843 

844 Parameters 

845 ---------- 

846 obsInfo : `ObservationInfo` 

847 Observation details for (one of the components of) the exposure. 

848 universe : `DimensionUniverse` 

849 Set of all known dimensions. 

850 

851 Returns 

852 ------- 

853 records : `dict` [`str`, `DimensionRecord`] 

854 The records to insert, indexed by dimension name. 
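
Examples
--------
A sketch of a subclass providing a record for a hypothetical
``exposure_group`` dimension; the dimension name and its fields are
assumptions and not part of the default universe:

.. code-block:: python

    from lsst.obs.base import RawIngestTask

    class MyRawIngestTask(RawIngestTask):
        def makeDependencyRecords(self, obsInfo, universe):
            element = universe["exposure_group"]
            record = element.RecordClass(
                instrument=obsInfo.instrument, name=obsInfo.exposure_group
            )
            return {"exposure_group": record}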

855 """ 

856 return {} 

857 

858 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

859 """Expand the data IDs associated with a raw exposure. 

860 

861 This adds the metadata records. 

862 

863 Parameters 

864 ---------- 

865 exposure : `RawExposureData` 

866 A structure containing information about the exposure to be 

867 ingested. Must have `RawExposureData.record` populated. Should 

868 be considered consumed upon return. 

869 

870 Returns 

871 ------- 

872 exposure : `RawExposureData` 

873 An updated version of the input structure, with 

874 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

875 updated to data IDs for which 

876 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

877 """ 

878 # We start by expanding the exposure-level data ID; we won't use that

879 # directly in file ingest, but this lets us do some database lookups 

880 # once per exposure instead of once per file later. 

881 data.dataId = self.butler.registry.expandDataId( 

882 data.dataId, 

883 # We pass in the records we'll be inserting shortly so they aren't 

884 # looked up from the database. We do expect instrument and filter 

885 # records to be retrieved from the database here (though the 

886 # Registry may cache them so there isn't a lookup every time). 

887 records={"exposure": data.record}, 

888 ) 

889 # Now we expand the per-file (exposure+detector) data IDs. This time 

890 # we pass in the records we just retrieved from the exposure data ID 

891 # expansion. 

892 for file in data.files: 

893 for dataset in file.datasets: 

894 dataset.dataId = self.butler.registry.expandDataId( 

895 dataset.dataId, records=data.dataId.records 

896 ) 

897 return data 

898 

899 def prep( 

900 self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None, processes: int = 1 

901 ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]: 

902 """Perform all non-database-updating ingest preprocessing steps. 

903 

904 Parameters 

905 ---------- 

906 files : iterable over `lsst.resources.ResourcePath`

907 Paths to the files to be ingested. Will be made absolute 

908 if they are not already. 

909 pool : `multiprocessing.Pool`, optional 

910 If not `None`, a process pool with which to parallelize some 

911 operations. 

912 processes : `int`, optional 

913 The number of processes to use. Ignored if ``pool`` is not `None`. 

914 

915 Returns 

916 ------- 

917 exposures : `Iterator` [ `RawExposureData` ] 

918 Data structures containing dimension records, filenames, and data 

919 IDs to be ingested (one structure for each exposure). 

920 bad_files : `list` of `ResourcePath`

921 List of all the files that could not have metadata extracted. 

922 """ 

923 if pool is None and processes > 1: 

924 pool = Pool(processes) 

925 mapFunc = map if pool is None else pool.imap_unordered 

926 

927 def _partition_good_bad( 

928 file_data: Iterable[RawFileData], 

929 ) -> Tuple[List[RawFileData], List[ResourcePath]]: 

930 """Filter out bad files and return good with list of bad.""" 

931 good_files = [] 

932 bad_files = [] 

933 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"): 

934 if not fileDatum.datasets: 

935 bad_files.append(fileDatum.filename) 

936 else: 

937 good_files.append(fileDatum) 

938 return good_files, bad_files 

939 

940 # Look for index files and read them. 

941 # There should be far fewer index files than data files. 

942 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

943 if bad_index_files: 

944 self.log.info("Failed to read the following explicitly requested index files:") 

945 for bad in sorted(bad_index_files): 

946 self.log.info("- %s", bad) 

947 

948 # Now convert all the index file entries to standard form for ingest. 

949 processed_bad_index_files: List[ResourcePath] = [] 

950 indexFileData = self.processIndexEntries(index_entries) 

951 if indexFileData: 

952 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData) 

953 self.log.info( 

954 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s", 

955 *_log_msg_counter(indexFileData), 

956 *_log_msg_counter(good_index_files), 

957 *_log_msg_counter(processed_bad_index_files), 

958 ) 

959 

960 # Extract metadata and build per-detector regions. 

961 # This could run in a subprocess so collect all output 

962 # before looking at failures. 

963 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

964 

965 # Filter out all the failed reads and store them for later 

966 # reporting. 

967 good_file_data, bad_files = _partition_good_bad(fileData) 

968 self.log.info( 

969 "Successfully extracted metadata from %d file%s with %d failure%s", 

970 *_log_msg_counter(good_file_data), 

971 *_log_msg_counter(bad_files), 

972 ) 

973 

974 # Combine with data from index files. 

975 good_file_data.extend(indexFileData) 

976 bad_files.extend(processed_bad_index_files) 

977 bad_files.extend(bad_index_files) 

978 

979 # Use that metadata to group files (and extracted metadata) by 

980 # exposure. Never parallelized because it's intrinsically a gather 

981 # step. 

982 exposureData: List[RawExposureData] = self.groupByExposure(good_file_data) 

983 

984 # The next operation operates on RawExposureData instances (one at 

985 # a time) in-place and then returns the modified instance. We call it 

986 # as a pass-through instead of relying on the arguments we pass in to 

987 # have been modified because in the parallel case those arguments are 

988 # going to be pickled and unpickled, and I'm not certain 

989 # multiprocessing is careful enough with that for output arguments to 

990 # work. 

991 

992 # Expand the data IDs to include all dimension metadata; we need this 

993 # because we may need to generate path templates that rely on that 

994 # metadata. 

995 # This is the first step that involves actual database calls (but just 

996 # SELECTs), so if there's going to be a problem with connections vs. 

997 # multiple processes, or lock contention (in SQLite) slowing things 

998 # down, it'll happen here. 

999 return mapFunc(self.expandDataIds, exposureData), bad_files 

1000 

1001 def ingestExposureDatasets( 

1002 self, 

1003 exposure: RawExposureData, 

1004 *, 

1005 run: Optional[str] = None, 

1006 skip_existing_exposures: bool = False, 

1007 track_file_attrs: bool = True, 

1008 ) -> List[FileDataset]: 

1009 """Ingest all raw files in one exposure. 

1010 

1011 Parameters 

1012 ---------- 

1013 exposure : `RawExposureData` 

1014 A structure containing information about the exposure to be 

1015 ingested. Must have `RawExposureData.records` populated and all 

1016 data ID attributes expanded. 

1017 run : `str`, optional 

1018 Name of a RUN-type collection to write to, overriding 

1019 ``self.butler.run``. 

1020 skip_existing_exposures : `bool`, optional 

1021 If `True` (`False` is default), skip raws that have already been 

1022 ingested (i.e. raws for which we already have a dataset with the 

1023 same data ID in the target collection, even if from another file). 

1024 Note that this is much slower than just not passing 

1025 already-ingested files as inputs, because we still need to read and 

1026 process metadata to identify which exposures to search for. It 

1027 also will not work reliably if multiple processes are attempting to 

1028 ingest raws from the same exposure concurrently, in that different 

1029 processes may still attempt to ingest the same raw and conflict, 

1030 causing a failure that prevents other raws from the same exposure 

1031 from being ingested. 

1032 track_file_attrs : `bool`, optional 

1033 Control whether file attributes such as the size or checksum should 

1034 be tracked by the datastore. Whether this parameter is honored 

1035 depends on the specific datastore implementation.

1036 

1037 Returns 

1038 ------- 

1039 datasets : `list` of `lsst.daf.butler.FileDataset` 

1040 Per-file structures identifying the files ingested and their 

1041 dataset representation in the data repository. 

1042 """ 

1043 if skip_existing_exposures: 

1044 existing = { 

1045 ref.dataId 

1046 for ref in self.butler.registry.queryDatasets( 

1047 self.datasetType, 

1048 collections=[run], 

1049 dataId=exposure.dataId, 

1050 ) 

1051 } 

1052 else: 

1053 existing = set() 

1054 datasets = [] 

1055 for file in exposure.files: 

1056 refs = [DatasetRef(self.datasetType, d.dataId) for d in file.datasets if d.dataId not in existing] 

1057 if refs: 

1058 datasets.append( 

1059 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass) 

1060 ) 

1061 

1062 # Raw files are preferentially ingested using a UUID derived from 

1063 # the collection name and dataId. 

1064 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN): 

1065 mode = DatasetIdGenEnum.DATAID_TYPE_RUN 

1066 else: 

1067 mode = DatasetIdGenEnum.UNIQUE 

1068 self.butler.ingest( 

1069 *datasets, 

1070 transfer=self.config.transfer, 

1071 run=run, 

1072 idGenerationMode=mode, 

1073 record_validation_info=track_file_attrs, 

1074 ) 

1075 return datasets 

1076 

1077 def ingestFiles( 

1078 self, 

1079 files: Iterable[ResourcePath], 

1080 *, 

1081 pool: Optional[PoolType] = None, 

1082 processes: int = 1, 

1083 run: Optional[str] = None, 

1084 skip_existing_exposures: bool = False, 

1085 update_exposure_records: bool = False, 

1086 track_file_attrs: bool = True, 

1087 ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]: 

1088 """Ingest files into a Butler data repository. 

1089 

1090 This creates any new exposure or visit Dimension entries needed to 

1091 identify the ingested files, creates new Dataset entries in the 

1092 Registry and finally ingests the files themselves into the Datastore. 

1093 Any needed instrument, detector, and physical_filter Dimension entries 

1094 must exist in the Registry before `run` is called. 

1095 

1096 Parameters 

1097 ---------- 

1098 files : iterable over `lsst.resources.ResourcePath` 

1099 URIs to the files to be ingested. 

1100 pool : `multiprocessing.Pool`, optional 

1101 If not `None`, a process pool with which to parallelize some 

1102 operations. 

1103 processes : `int`, optional 

1104 The number of processes to use. Ignored if ``pool`` is not `None`. 

1105 run : `str`, optional 

1106 Name of a RUN-type collection to write to, overriding 

1107 the default derived from the instrument name. 

1108 skip_existing_exposures : `bool`, optional 

1109 If `True` (`False` is default), skip raws that have already been 

1110 ingested (i.e. raws for which we already have a dataset with the 

1111 same data ID in the target collection, even if from another file). 

1112 Note that this is much slower than just not passing 

1113 already-ingested files as inputs, because we still need to read and 

1114 process metadata to identify which exposures to search for. It 

1115 also will not work reliably if multiple processes are attempting to 

1116 ingest raws from the same exposure concurrently, in that different 

1117 processes may still attempt to ingest the same raw and conflict, 

1118 causing a failure that prevents other raws from the same exposure 

1119 from being ingested. 

1120 update_exposure_records : `bool`, optional 

1121 If `True` (`False` is default), update existing exposure records 

1122 that conflict with the new ones instead of rejecting them. THIS IS 

1123 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1124 KNOWN TO BE BAD. This should usually be combined with 

1125 ``skip_existing_exposures=True``. 

1126 track_file_attrs : `bool`, optional 

1127 Control whether file attributes such as the size or checksum should 

1128 be tracked by the datastore. Whether this parameter is honored 

1129 depends on the specific datastore implementation.

1130 

1131 Returns 

1132 ------- 

1133 refs : `list` of `lsst.daf.butler.DatasetRef` 

1134 Dataset references for ingested raws. 

1135 bad_files : `list` of `ResourcePath` 

1136 Given paths that could not be ingested. 

1137 n_exposures : `int` 

1138 Number of exposures successfully ingested. 

1139 n_exposures_failed : `int` 

1140 Number of exposures that failed when inserting dimension data. 

1141 n_ingests_failed : `int` 

1142 Number of exposures that failed when ingesting raw datasets. 

1143 """ 

1144 

1145 exposureData, bad_files = self.prep(files, pool=pool, processes=processes) 

1146 

1147 # Up to this point, we haven't modified the data repository at all. 

1148 # Now we finally do that, with one transaction per exposure. This is 

1149 # not parallelized at present because the performance of this step is 

1150 # limited by the database server. That may or may not change in the 

1151 # future once we increase our usage of bulk inserts and reduce our 

1152 # usage of savepoints; we've tried to get everything but the database 

1153 # operations done in advance to reduce the time spent inside 

1154 # transactions. 

1155 self.butler.registry.registerDatasetType(self.datasetType) 

1156 

1157 refs = [] 

1158 runs = set() 

1159 n_exposures = 0 

1160 n_exposures_failed = 0 

1161 n_ingests_failed = 0 

1162 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"): 

1163 assert exposure.record is not None, "Should be guaranteed by prep()" 

1164 self.log.debug( 

1165 "Attempting to ingest %d file%s from exposure %s:%s", 

1166 *_log_msg_counter(exposure.files), 

1167 exposure.record.instrument, 

1168 exposure.record.obs_id, 

1169 ) 

1170 

1171 try: 

1172 for name, record in exposure.dependencyRecords.items(): 

1173 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records) 

1174 inserted_or_updated = self.butler.registry.syncDimensionData( 

1175 "exposure", 

1176 exposure.record, 

1177 update=update_exposure_records, 

1178 ) 

1179 except Exception as e: 

1180 self._on_ingest_failure(exposure, e) 

1181 n_exposures_failed += 1 

1182 self.log.warning( 

1183 "Exposure %s:%s could not be registered: %s", 

1184 exposure.record.instrument, 

1185 exposure.record.obs_id, 

1186 e, 

1187 ) 

1188 if self.config.failFast: 

1189 raise e 

1190 continue 

1191 

1192 if isinstance(inserted_or_updated, dict): 

1193 # Exposure is in the registry and we updated it, so 

1194 # syncDimensionData returned a dict. 

1195 self.log.info( 

1196 "Exposure %s:%s was already present, but columns %s were updated.", 

1197 exposure.record.instrument, 

1198 exposure.record.obs_id, 

1199 str(list(inserted_or_updated.keys())), 

1200 ) 

1201 

1202 # Override default run if nothing specified explicitly. 

1203 if run is None: 

1204 instrument = exposure.files[0].instrument 

1205 assert ( 

1206 instrument is not None 

1207 ), "file should have been removed from this list by prep if instrument could not be found" 

1208 this_run = instrument.makeDefaultRawIngestRunName() 

1209 else: 

1210 this_run = run 

1211 if this_run not in runs: 

1212 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

1213 runs.add(this_run) 

1214 try: 

1215 datasets_for_exposure = self.ingestExposureDatasets( 

1216 exposure, 

1217 run=this_run, 

1218 skip_existing_exposures=skip_existing_exposures, 

1219 track_file_attrs=track_file_attrs, 

1220 ) 

1221 except Exception as e: 

1222 self._on_ingest_failure(exposure, e) 

1223 n_ingests_failed += 1 

1224 self.log.warning("Failed to ingest the following for reason: %s", e) 

1225 for f in exposure.files: 

1226 self.log.warning("- %s", f.filename) 

1227 if self.config.failFast: 

1228 raise e 

1229 continue 

1230 else: 

1231 self._on_success(datasets_for_exposure) 

1232 for dataset in datasets_for_exposure: 

1233 refs.extend(dataset.refs) 

1234 

1235 # Success for this exposure. 

1236 n_exposures += 1 

1237 self.log.info( 

1238 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id 

1239 ) 

1240 

1241 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

1242 

1243 @timeMethod 

1244 def run( 

1245 self, 

1246 files: Iterable[ResourcePathExpression], 

1247 *, 

1248 pool: Optional[PoolType] = None, 

1249 processes: int = 1, 

1250 run: Optional[str] = None, 

1251 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", 

1252 group_files: bool = True, 

1253 skip_existing_exposures: bool = False, 

1254 update_exposure_records: bool = False, 

1255 track_file_attrs: bool = True, 

1256 ) -> List[DatasetRef]: 

1257 """Ingest files into a Butler data repository. 

1258 

1259 This creates any new exposure or visit Dimension entries needed to 

1260 identify the ingested files, creates new Dataset entries in the 

1261 Registry and finally ingests the files themselves into the Datastore. 

1262 Any needed instrument, detector, and physical_filter Dimension entries 

1263 must exist in the Registry before `run` is called. 

1264 

1265 Parameters 

1266 ---------- 

1267 files : iterable `lsst.resources.ResourcePath`, `str` or path-like 

1268 Paths to the files to be ingested. Can refer to directories. 

1269 Will be made absolute if they are not already. 

1270 pool : `multiprocessing.Pool`, optional 

1271 If not `None`, a process pool with which to parallelize some 

1272 operations. 

1273 processes : `int`, optional 

1274 The number of processes to use. Ignored if ``pool`` is not `None`. 

1275 run : `str`, optional 

1276 Name of a RUN-type collection to write to, overriding 

1277 the default derived from the instrument name. 

1278 file_filter : `str` or `re.Pattern`, optional 

1279 Pattern to use to discover files to ingest within directories. 

1280 The default is to search for FITS files. The regex applies to 

1281 files within the directory. 

1282 group_files : `bool`, optional 

1283 Group files by directory if they have been discovered in 

1284 directories. Will not affect files explicitly provided. 

1285 skip_existing_exposures : `bool`, optional 

1286 If `True` (`False` is default), skip raws that have already been 

1287 ingested (i.e. raws for which we already have a dataset with the 

1288 same data ID in the target collection, even if from another file). 

1289 Note that this is much slower than just not passing 

1290 already-ingested files as inputs, because we still need to read and 

1291 process metadata to identify which exposures to search for. It 

1292 also will not work reliably if multiple processes are attempting to 

1293 ingest raws from the same exposure concurrently, in that different 

1294 processes may still attempt to ingest the same raw and conflict, 

1295 causing a failure that prevents other raws from the same exposure 

1296 from being ingested. 

1297 update_exposure_records : `bool`, optional 

1298 If `True` (`False` is default), update existing exposure records 

1299 that conflict with the new ones instead of rejecting them. THIS IS 

1300 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1301 KNOWN TO BE BAD. This should usually be combined with 

1302 ``skip_existing_exposures=True``. 

1303 track_file_attrs : `bool`, optional 

1304 Control whether file attributes such as the size or checksum should 

1305 be tracked by the datastore. Whether this parameter is honored 

1306 depends on the specific datastore implementation.

1307 

1308 Returns 

1309 ------- 

1310 refs : `list` of `lsst.daf.butler.DatasetRef` 

1311 Dataset references for ingested raws. 

1312 

1313 Notes 

1314 ----- 

1315 This method inserts all datasets for an exposure within a transaction, 

1316 guaranteeing that partial exposures are never ingested. The exposure 

1317 dimension record is inserted with `Registry.syncDimensionData` first 

1318 (in its own transaction), which inserts only if a record with the same 

1319 primary key does not already exist. This allows different files within 

1320 the same exposure to be ingested in different runs. 
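
Examples
--------
A sketch of ingesting a directory tree of FITS files, given a ``task``
constructed as shown in the class documentation (the path is a
placeholder):

.. code-block:: python

    refs = task.run(["/data/raws/2023-01-30"], processes=4)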

1321 """ 

1322 

1323 refs = [] 

1324 bad_files = [] 

1325 n_exposures = 0 

1326 n_exposures_failed = 0 

1327 n_ingests_failed = 0 

1328 if group_files: 

1329 for group in ResourcePath.findFileResources(files, file_filter, group_files): 

1330 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles( 

1331 group, 

1332 pool=pool, 

1333 processes=processes, 

1334 run=run, 

1335 skip_existing_exposures=skip_existing_exposures, 

1336 update_exposure_records=update_exposure_records, 

1337 track_file_attrs=track_file_attrs, 

1338 ) 

1339 refs.extend(new_refs) 

1340 bad_files.extend(bad) 

1341 n_exposures += n_exp 

1342 n_exposures_failed += n_exp_fail 

1343 n_ingests_failed += n_ingest_fail 

1344 else: 

1345 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

1346 ResourcePath.findFileResources(files, file_filter, group_files), 

1347 pool=pool, 

1348 processes=processes, 

1349 run=run, 

1350 skip_existing_exposures=skip_existing_exposures, 

1351 update_exposure_records=update_exposure_records,

track_file_attrs=track_file_attrs,

1352 )

1353 

1354 had_failure = False 

1355 

1356 if bad_files: 

1357 had_failure = True 

1358 self.log.warning("Could not extract observation metadata from the following:") 

1359 for f in bad_files: 

1360 self.log.warning("- %s", f) 

1361 

1362 self.log.info( 

1363 "Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1364 " registration and %d failure%s from file ingest.", 

1365 *_log_msg_counter(n_exposures), 

1366 *_log_msg_counter(n_exposures_failed), 

1367 *_log_msg_counter(n_ingests_failed), 

1368 ) 

1369 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1370 had_failure = True 

1371 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1372 

1373 if had_failure: 

1374 raise RuntimeError("Some failures encountered during ingestion") 

1375 

1376 return refs