Coverage for python/lsst/obs/base/ingest.py: 18%

342 statements  


1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import json 

26import re 

27from collections import defaultdict 

28from dataclasses import InitVar, dataclass 

29from multiprocessing import Pool 

30from typing import ( 

31 Any, 

32 Callable, 

33 ClassVar, 

34 Dict, 

35 Iterable, 

36 Iterator, 

37 List, 

38 MutableMapping, 

39 Optional, 

40 Set, 

41 Sized, 

42 Tuple, 

43 Type, 

44 Union, 

45) 

46 

47from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers 

48from astro_metadata_translator.indexing import process_index_data, process_sidecar_data 

49from lsst.afw.fits import readMetadata 

50from lsst.daf.butler import ( 

51 Butler, 

52 CollectionType, 

53 DataCoordinate, 

54 DatasetIdGenEnum, 

55 DatasetRef, 

56 DatasetType, 

57 DimensionRecord, 

58 DimensionUniverse, 

59 FileDataset, 

60 Formatter, 

61 Progress, 

62) 

63from lsst.pex.config import ChoiceField, Config, Field 

64from lsst.pipe.base import Instrument, Task 

65from lsst.resources import ResourcePath, ResourcePathExpression 

66from lsst.utils.timer import timeMethod 

67 

68from ._instrument import makeExposureRecordFromObsInfo 

69 

70# multiprocessing.Pool is actually a function, not a type, and the real type

71# isn't exposed, so we can't use it in annotations; we'll just punt on it via

72# this alias instead.

73PoolType = Any 

74 

75 

76def _do_nothing(*args: Any, **kwargs: Any) -> None: 

77 """Do nothing. 

78 

79 This is a function that accepts anything and does nothing. 

80 For use as a default in callback arguments. 

81 """ 

82 pass 

83 

84 

85def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]: 

86 """Count the iterable and return the count and plural modifier. 

87 

88 Parameters 

89 ---------- 

90 noun : `Sized` or `int` 

91 Thing to count. If given an integer it is assumed to be the count 

92 to use to calculate modifier. 

93 

94 Returns 

95 ------- 

96 num : `int` 

97 Number of items found in ``noun``. 

98 modifier : `str` 

99 Character to add to the end of a string referring to these items 

100 to indicate whether it was a single item or not. Returns empty 

101 string if there is one item or "s" otherwise. 

102 

103 Examples 

104 -------- 

105 

106 .. code-block:: python 

107 

108 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

109 """ 

110 if isinstance(noun, int): 

111 num = noun 

112 else: 

113 num = len(noun) 

114 return num, "" if num == 1 else "s" 

115 

116 

117@dataclass 

118class RawFileDatasetInfo: 

119 """Information about a single dataset within a raw file.""" 

120 

121 dataId: DataCoordinate 

122 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

123 

124 obsInfo: ObservationInfo 

125 """Standardized observation metadata extracted directly from the file 

126 headers (`astro_metadata_translator.ObservationInfo`). 

127 """ 

128 

129 

130@dataclass 

131class RawFileData: 

132 """Information about a single raw file, used during ingest.""" 

133 

134 datasets: List[RawFileDatasetInfo] 

135 """The information describing each dataset within this raw file. 

136 (`list` of `RawFileDatasetInfo`) 

137 """ 

138 

139 filename: ResourcePath 

140 """URI of the file this information was extracted from (`str`). 

141 

142 This is the path prior to ingest, not the path after ingest. 

143 """ 

144 

145 FormatterClass: Type[Formatter] 

146 """Formatter class that should be used to ingest this file (`type`; as 

147 subclass of `Formatter`). 

148 """ 

149 

150 instrument: Optional[Instrument] 

151 """The `Instrument` instance associated with this file. Can be `None` 

152 if ``datasets`` is an empty list.""" 

153 

154 

155@dataclass 

156class RawExposureData: 

157 """Information about a complete raw exposure, used during ingest.""" 

158 

159 dataId: DataCoordinate 

160 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

161 """ 

162 

163 files: List[RawFileData] 

164 """List of structures containing file-level information. 

165 """ 

166 

167 universe: InitVar[DimensionUniverse] 

168 """Set of all known dimensions. 

169 """ 

170 

171 record: DimensionRecord 

172 """The exposure `DimensionRecord` that must be inserted into the 

173 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`). 

174 """ 

175 

176 dependencyRecords: Dict[str, DimensionRecord] 

177 """Additional records that must be inserted into the 

178 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record`` 

179 (e.g., to satisfy foreign key constraints), indexed by the dimension name. 

180 """ 

181 

182 

183def makeTransferChoiceField( 

184 doc: str = "How to transfer files (None for no transfer).", default: str = "auto" 

185) -> ChoiceField: 

186 """Create a Config field with options for transferring data between repos. 

187 

188 The allowed options for the field are exactly those supported by 

189 `lsst.daf.butler.Datastore.ingest`. 

190 

191 Parameters 

192 ---------- 

193 doc : `str` 

194 Documentation for the configuration field. 

195 default : `str`, optional 

196 Default transfer mode for the field. 

197 

198 Returns 

199 ------- 

200 field : `lsst.pex.config.ChoiceField` 

201 Configuration field. 

202 """ 

203 return ChoiceField( 

204 doc=doc, 

205 dtype=str, 

206 allowed={ 

207 "move": "move", 

208 "copy": "copy", 

209 "auto": "choice will depend on datastore", 

210 "direct": "use URI to ingested file directly in datastore", 

211 "link": "hard link falling back to symbolic link", 

212 "hardlink": "hard link", 

213 "symlink": "symbolic (soft) link", 

214 "relsymlink": "relative symbolic link", 

215 }, 

216 optional=True, 

217 default=default, 

218 ) 

219 
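As a usage sketch only (the config class name here is hypothetical, not part of obs_base), a downstream package can reuse this helper to expose the same transfer options with a different default:

.. code-block:: python

    from lsst.obs.base import makeTransferChoiceField
    from lsst.pex.config import Config


    class SymlinkIngestConfig(Config):
        """Hypothetical config reusing the shared transfer field."""

        # Same allowed values as RawIngestConfig.transfer, but defaulting
        # to symbolic links instead of "auto".
        transfer = makeTransferChoiceField(default="symlink")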

220 

221class RawIngestConfig(Config): 

222 """Configuration class for RawIngestTask.""" 

223 

224 transfer = makeTransferChoiceField() 

225 failFast = Field( 

226 dtype=bool, 

227 default=False, 

228 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

229 "Otherwise problem files will be skipped and logged and a report issued at completion.", 

230 ) 

231 

232 

233class RawIngestTask(Task): 

234 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

235 

236 Parameters 

237 ---------- 

238 config : `RawIngestConfig` 

239 Configuration for the task. 

240 butler : `~lsst.daf.butler.Butler` 

241 Writeable butler instance, with ``butler.run`` set to the appropriate 

242 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

243 datasets. 

244 on_success : `Callable`, optional 

245 A callback invoked when all of the raws associated with an exposure 

246 are ingested. Will be passed a list of `FileDataset` objects, each 

247 containing one or more resolved `DatasetRef` objects. If this callback 

248 raises it will interrupt the entire ingest process, even if 

249 `RawIngestConfig.failFast` is `False`. 

250 on_metadata_failure : `Callable`, optional 

251 A callback invoked when a failure occurs trying to translate the 

252 metadata for a file. Will be passed the URI and the exception, in 

253 that order, as positional arguments. Guaranteed to be called in an 

254 ``except`` block, allowing the callback to re-raise or replace (with 

255 ``raise ... from``) to override the task's usual error handling (before 

256 `RawIngestConfig.failFast` logic occurs). 

257 on_ingest_failure : `Callable`, optional 

258 A callback invoked when dimension record or dataset insertion into the 

259 database fails for an exposure. Will be passed a `RawExposureData` 

260 instance and the exception, in that order, as positional arguments. 

261 Guaranteed to be called in an ``except`` block, allowing the callback 

262 to re-raise or replace (with ``raise ... from``) to override the task's 

263 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

264 **kwargs 

265 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

266 constructor. 

267 

268 Notes 

269 ----- 

270 Each instance of `RawIngestTask` writes to the same Butler. Each 

271 invocation of `RawIngestTask.run` ingests a list of files. 

272 """ 

273 

274 ConfigClass: ClassVar[Type[Config]] = RawIngestConfig 

275 

276 _DefaultName: ClassVar[str] = "ingest" 

277 

278 def getDatasetType(self) -> DatasetType: 

279 """Return the DatasetType of the datasets ingested by this Task.""" 

280 return DatasetType( 

281 "raw", 

282 ("instrument", "detector", "exposure"), 

283 "Exposure", 

284 universe=self.butler.registry.dimensions, 

285 ) 

286 

287 def __init__( 

288 self, 

289 config: RawIngestConfig, 

290 *, 

291 butler: Butler, 

292 on_success: Callable[[List[FileDataset]], Any] = _do_nothing, 

293 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing, 

294 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

295 **kwargs: Any, 

296 ): 

297 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

298 super().__init__(config, **kwargs) 

299 self.butler = butler 

300 self.universe = self.butler.registry.dimensions 

301 self.datasetType = self.getDatasetType() 

302 self._on_success = on_success 

303 self._on_metadata_failure = on_metadata_failure 

304 self._on_ingest_failure = on_ingest_failure 

305 self.progress = Progress("obs.base.RawIngestTask") 

306 

307 # Import all the instrument classes so that we ensure that we 

308 # have all the relevant metadata translators loaded. 

309 Instrument.importAll(self.butler.registry) 

310 

311 def _reduce_kwargs(self) -> Dict[str, Any]: 

312 # Add extra parameters to pickle. 

313 return dict( 

314 **super()._reduce_kwargs(), 

315 butler=self.butler, 

316 on_success=self._on_success, 

317 on_metadata_failure=self._on_metadata_failure, 

318 on_ingest_failure=self._on_ingest_failure, 

319 ) 

320 

321 def _determine_instrument_formatter( 

322 self, dataId: DataCoordinate, filename: ResourcePath 

323 ) -> Tuple[Optional[Instrument], Type[Formatter]]: 

324 """Determine the instrument and formatter class. 

325 

326 Parameters 

327 ---------- 

328 dataId : `lsst.daf.butler.DataCoordinate` 

329 The dataId associated with this dataset. 

330 filename : `lsst.resources.ResourcePath` 

331 URI of file used for error reporting. 

332 

333 Returns 

334 ------- 

335 instrument : `Instrument` or `None` 

336 Instance of the `Instrument` associated with this dataset. `None` 

337 indicates that the instrument could not be determined. 

338 formatterClass : `type` 

339 Class to be used as the formatter for this dataset. 

340 """ 

341 # The data model currently assumes that whilst multiple datasets 

342 # can be associated with a single file, they must all share the 

343 # same formatter. 

344 try: 

345 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore 

346 except LookupError as e: 

347 self._on_metadata_failure(filename, e) 

348 self.log.warning( 

349 "Instrument %s for file %s not known to registry", dataId["instrument"], filename 

350 ) 

351 if self.config.failFast: 

352 raise RuntimeError( 

353 f"Instrument {dataId['instrument']} for file {filename} not known to registry" 

354 ) from e 

355 FormatterClass = Formatter 

356 # Indicate that we could not work out the instrument. 

357 instrument = None 

358 else: 

359 assert instrument is not None, "Should be guaranteed by fromName succeeding."

360 FormatterClass = instrument.getRawFormatter(dataId) 

361 return instrument, FormatterClass 

362 

363 def extractMetadata(self, filename: ResourcePath) -> RawFileData: 

364 """Extract and process metadata from a single raw file. 

365 

366 Parameters 

367 ---------- 

368 filename : `lsst.resources.ResourcePath` 

369 URI to the file. 

370 

371 Returns 

372 ------- 

373 data : `RawFileData` 

374 A structure containing the metadata extracted from the file, 

375 as well as the original filename. All fields will be populated, 

376 but the `RawFileData.dataId` attribute will be a minimal 

377 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The 

378 ``instrument`` field will be `None` if there is a problem 

379 with metadata extraction. 

380 

381 Notes 

382 ----- 

383 Assumes that there is a single dataset associated with the given 

384 file. Instruments using a single file to store multiple datasets 

385 must implement their own version of this method. 

386 

387 By default the method will catch all exceptions unless the ``failFast`` 

388 configuration item is `True`. If an error is encountered the 

389 `_on_metadata_failure()` method will be called. If no exception

390 is raised but an error was encountered, the returned object will

391 have ``instrument`` set to `None` and no datasets.

392 

393 This method supports sidecar JSON files which can be used to 

394 extract metadata without having to read the data file itself. 

395 The sidecar file is always used if found. 

396 """ 

397 sidecar_fail_msg = "" # Requires prepended space when set. 

398 try: 

399 sidecar_file = filename.updatedExtension(".json") 

400 if sidecar_file.exists(): 

401 content = json.loads(sidecar_file.read()) 

402 headers = [process_sidecar_data(content)] 

403 sidecar_fail_msg = " (via sidecar)" 

404 else: 

405 # Read the metadata from the data file itself. 

406 

407 # For remote files download the entire file to get the 

408 # header. This is very inefficient and it would be better 

409 # to have some way of knowing where in the file the headers 

410 # are and to only download those parts of the file. 

411 with filename.as_local() as local_file: 

412 # Read the primary. This might be sufficient. 

413 header = readMetadata(local_file.ospath, 0) 

414 

415 try: 

416 # Try to work out a translator class early. 

417 translator_class = MetadataTranslator.determine_translator( 

418 header, filename=str(filename) 

419 ) 

420 except ValueError: 

421 # Primary header was not sufficient (maybe this file 

422 # has been compressed or is a MEF with minimal 

423 # primary). Read second header and merge with primary. 

424 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite") 

425 

426 # Try again to work out a translator class, letting this 

427 # fail. 

428 translator_class = MetadataTranslator.determine_translator(header, filename=str(filename)) 

429 

430 # Request the headers to use for ingest 

431 headers = list(translator_class.determine_translatable_headers(filename.ospath, header)) 

432 

433 # Add each header to the dataset list 

434 datasets = [self._calculate_dataset_info(h, filename) for h in headers] 

435 

436 except Exception as e: 

437 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

438 # Indicate to the caller that we failed to read. 

439 datasets = [] 

440 formatterClass = Formatter 

441 instrument = None 

442 self._on_metadata_failure(filename, e) 

443 if self.config.failFast: 

444 raise RuntimeError( 

445 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}" 

446 ) from e 

447 else: 

448 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

449 # The data model currently assumes that whilst multiple datasets 

450 # can be associated with a single file, they must all share the 

451 # same formatter. 

452 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

453 if instrument is None: 

454 datasets = [] 

455 

456 return RawFileData( 

457 datasets=datasets, 

458 filename=filename, 

459 # MyPy wants this to be a non-abstract class, which is not true 

460 # for the error case where instrument is None and datasets=[]. 

461 FormatterClass=formatterClass, # type: ignore 

462 instrument=instrument, 

463 ) 

464 

465 @classmethod 

466 def getObservationInfoSubsets(cls) -> Tuple[Set, Set]: 

467 """Return subsets of fields in the `ObservationInfo` that we care about 

468 

469 These fields will be used in constructing an exposure record. 

470 

471 Returns 

472 ------- 

473 required : `set` 

474 Set of `ObservationInfo` field names that are required. 

475 optional : `set` 

476 Set of `ObservationInfo` field names we will use if they are 

477 available. 

478 """ 

479 # Marking the new properties "group_counter_*" and 

480 # "has_simulated_content" as required, assumes that we either 

481 # recreate any existing index/sidecar files that include translated 

482 # values, or else allow astro_metadata_translator to fill in 

483 # defaults. 

484 required = { 

485 "datetime_begin", 

486 "datetime_end", 

487 "detector_num", 

488 "exposure_id", 

489 "exposure_time", 

490 "group_counter_end", 

491 "group_counter_start", 

492 "has_simulated_content", 

493 "instrument", 

494 "observation_id", 

495 "observation_type", 

496 "physical_filter", 

497 } 

498 optional = { 

499 "altaz_begin", 

500 "boresight_rotation_coord", 

501 "boresight_rotation_angle", 

502 "dark_time", 

503 "exposure_group", 

504 "tracking_radec", 

505 "object", 

506 "observation_counter", 

507 "observation_reason", 

508 "observing_day", 

509 "science_program", 

510 "visit_id", 

511 } 

512 return required, optional 

513 
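A subclass that needs additional translated properties can extend these sets before they are used to build exposure records. The following is a hedged sketch; the ``focus_z`` property is an assumption about what the relevant metadata translator provides:

.. code-block:: python

    from typing import Set, Tuple

    from lsst.obs.base import RawIngestTask


    class _ExtendedIngestTask(RawIngestTask):
        """Hypothetical subclass requesting one extra optional property."""

        @classmethod
        def getObservationInfoSubsets(cls) -> Tuple[Set, Set]:
            required, optional = super().getObservationInfoSubsets()
            # Assumption: the instrument's translator can supply focus_z.
            optional.add("focus_z")
            return required, optional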

514 def _calculate_dataset_info( 

515 self, header: Union[MutableMapping[str, Any], ObservationInfo], filename: ResourcePath 

516 ) -> RawFileDatasetInfo: 

517 """Calculate a RawFileDatasetInfo from the supplied information. 

518 

519 Parameters 

520 ---------- 

521 header : Mapping or `astro_metadata_translator.ObservationInfo` 

522 Header from the dataset or previously-translated content. 

523 filename : `lsst.resources.ResourcePath` 

524 Filename to use for error messages. 

525 

526 Returns 

527 ------- 

528 dataset : `RawFileDatasetInfo` 

529 The dataId, and observation information associated with this 

530 dataset. 

531 """ 

532 required, optional = self.getObservationInfoSubsets() 

533 if isinstance(header, ObservationInfo): 

534 obsInfo = header 

535 missing = [] 

536 # Need to check the required properties are present. 

537 for property in required: 

538 # getattr does not need to be protected because it is using 

539 # the defined list above containing properties that must exist. 

540 value = getattr(obsInfo, property) 

541 if value is None: 

542 missing.append(property) 

543 if missing: 

544 raise ValueError( 

545 f"Requested required properties are missing from file {filename}:" 

546 f" {missing} (via JSON)" 

547 ) 

548 

549 else: 

550 obsInfo = ObservationInfo( 

551 header, 

552 pedantic=False, 

553 filename=str(filename), 

554 required=required, 

555 subset=required | optional, 

556 ) 

557 

558 dataId = DataCoordinate.standardize( 

559 instrument=obsInfo.instrument, 

560 exposure=obsInfo.exposure_id, 

561 detector=obsInfo.detector_num, 

562 universe=self.universe, 

563 ) 

564 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

565 

566 def locateAndReadIndexFiles( 

567 self, files: Iterable[ResourcePath] 

568 ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]: 

569 """Given a list of files, look for index files and read them. 

570 

571 Index files can either be explicitly in the list of files to 

572 ingest, or else located in the same directory as a file to ingest. 

573 Index entries are always used if present. 

574 

575 Parameters 

576 ---------- 

577 files : iterable over `lsst.resources.ResourcePath` 

578 URIs to the files to be ingested. 

579 

580 Returns 

581 ------- 

582 index : `dict` [`ResourcePath`, Any] 

583 Merged contents of all relevant index files found. These can 

584 be explicitly specified index files or ones found in the 

585 directory alongside a data file to be ingested. 

586 updated_files : `list` of `ResourcePath` 

587 Updated list of the input files with entries removed that were 

588 found listed in an index file. Order is not guaranteed to 

589 match the order of the files given to this routine. 

590 good_index_files : `set` [`ResourcePath`]

591 Index files that were successfully read.

592 bad_index_files : `set` [`ResourcePath`]

593 Files that looked like index files but failed to read properly. 

594 """ 

595 # Convert the paths to absolute for easy comparison with index content. 

596 # Do not convert to real paths since we have to assume that index 

597 # files are in this location and not the location they link to.

598 files = tuple(f.abspath() for f in files) 

599 

600 # Index files must be named this. 

601 index_root_file = "_index.json" 

602 

603 # Group the files by directory. 

604 files_by_directory = defaultdict(set) 

605 

606 for path in files: 

607 directory, file_in_dir = path.split() 

608 files_by_directory[directory].add(file_in_dir) 

609 

610 # All the metadata read from index files with keys of full path. 

611 index_entries: Dict[ResourcePath, Any] = {} 

612 

613 # Index files we failed to read. 

614 bad_index_files = set() 

615 

616 # Any good index files that were found and used. 

617 good_index_files = set() 

618 

619 # Look for index files in those directories. 

620 for directory, files_in_directory in files_by_directory.items(): 

621 possible_index_file = directory.join(index_root_file) 

622 if possible_index_file.exists(): 

623 # If we are explicitly requesting an index file the 

624 # messages should be different. 

625 index_msg = "inferred" 

626 is_implied = True 

627 if index_root_file in files_in_directory: 

628 index_msg = "explicit" 

629 is_implied = False 

630 

631 # Try to read the index file and catch and report any 

632 # problems. 

633 try: 

634 content = json.loads(possible_index_file.read()) 

635 index = process_index_data(content, force_dict=True) 

636 # mypy should in theory know that this is a mapping 

637 # from the overload type annotation of process_index_data. 

638 assert isinstance(index, MutableMapping) 

639 except Exception as e: 

640 # Only trigger the callback if the index file 

641 # was asked for explicitly. Triggering on implied file 

642 # might be surprising. 

643 if not is_implied: 

644 self._on_metadata_failure(possible_index_file, e) 

645 if self.config.failFast: 

646 raise RuntimeError( 

647 f"Problem reading index file from {index_msg} location {possible_index_file}" 

648 ) from e 

649 bad_index_files.add(possible_index_file) 

650 continue 

651 

652 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

653 good_index_files.add(possible_index_file) 

654 

655 # Go through the index adding entries for files. 

656 # If we have non-index files in this directory marked for 

657 # ingest we should only get index information for those. 

658 # If the index file was explicit we use all entries. 

659 if is_implied: 

660 files_to_ingest = files_in_directory 

661 else: 

662 files_to_ingest = set(index) 

663 

664 # Copy relevant metadata into a single dict for all index 

665 # entries. 

666 for file_in_dir in files_to_ingest: 

667 # Skip an explicitly specified index file. 

668 # This should never happen because an explicit index 

669 # file will force ingest of all files in the index 

670 # and not use the explicit file list. If somehow 

671 # this is not true we continue. Raising an exception 

672 # seems like the wrong thing to do since this is harmless. 

673 if file_in_dir == index_root_file: 

674 self.log.info( 

675 "Logic error found scanning directory %s. Please file ticket.", directory 

676 ) 

677 continue 

678 if file_in_dir in index: 

679 file = directory.join(file_in_dir) 

680 if file in index_entries: 

681 # ObservationInfo overrides raw metadata 

682 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance( 

683 index_entries[file], ObservationInfo 

684 ): 

685 self.log.warning( 

686 "File %s already specified in an index file but overriding" 

687 " with ObservationInfo content from %s", 

688 file, 

689 possible_index_file, 

690 ) 

691 else: 

692 self.log.warning( 

693 "File %s already specified in an index file, ignoring content from %s", 

694 file, 

695 possible_index_file, 

696 ) 

697 # Do nothing in this case 

698 continue 

699 

700 index_entries[file] = index[file_in_dir] 

701 

702 # Remove files from list that have index entries and also 

703 # any files that we determined to be explicit index files 

704 # or any index files that we failed to read. 

705 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

706 

707 # The filtered list loses the initial order. Retaining the order 

708 # is good for testing but does have a cost if there are many 

709 # files when copying the good values out. A dict would have faster 

710 # lookups (using the files as keys) but use more memory. 

711 ordered = [f for f in filtered if f in files] 

712 

713 return index_entries, ordered, good_index_files, bad_index_files 

714 

715 def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]: 

716 """Convert index entries to RawFileData. 

717 

718 Parameters 

719 ---------- 

720 index_entries : `dict` [`ResourcePath`, Any] 

721 Dict indexed by name of file to ingest and with values either

722 raw metadata or translated 

723 `~astro_metadata_translator.ObservationInfo`. 

724 

725 Returns 

726 ------- 

727 data : `list` [ `RawFileData` ] 

728 Structures containing the metadata extracted from the file, 

729 as well as the original filename. All fields will be populated, 

730 but the data IDs of the entries in ``datasets`` will be minimal

731 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. 

732 """ 

733 fileData = [] 

734 for filename, metadata in index_entries.items(): 

735 try: 

736 datasets = [self._calculate_dataset_info(metadata, filename)] 

737 except Exception as e: 

738 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e) 

739 datasets = [] 

740 formatterClass = Formatter 

741 instrument = None 

742 self._on_metadata_failure(filename, e) 

743 if self.config.failFast: 

744 raise RuntimeError( 

745 f"Problem extracting metadata for file {filename} found in index file" 

746 ) from e 

747 else: 

748 instrument, formatterClass = self._determine_instrument_formatter( 

749 datasets[0].dataId, filename 

750 ) 

751 if instrument is None: 

752 datasets = [] 

753 fileData.append( 

754 RawFileData( 

755 datasets=datasets, 

756 filename=filename, 

757 # MyPy wants this to be a non-abstract class, which is not 

758 # true for the error case where instrument is None and 

759 # datasets=[]. 

760 FormatterClass=formatterClass, # type: ignore 

761 instrument=instrument, 

762 ) 

763 ) 

764 return fileData 

765 

766 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]: 

767 """Group an iterable of `RawFileData` by exposure. 

768 

769 Parameters 

770 ---------- 

771 files : iterable of `RawFileData` 

772 File-level information to group. 

773 

774 Returns 

775 ------- 

776 exposures : `list` of `RawExposureData` 

777 A list of structures that group the file-level information by 

778 exposure. All fields will be populated. The 

779 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

780 `~lsst.daf.butler.DataCoordinate` instances. 

781 """ 

782 exposureDimensions = self.universe["exposure"].graph 

783 byExposure = defaultdict(list) 

784 for f in files: 

785 # Assume that the first dataset is representative for the file. 

786 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

787 

788 return [ 

789 RawExposureData( 

790 dataId=dataId, 

791 files=exposureFiles, 

792 universe=self.universe, 

793 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe), 

794 dependencyRecords=self.makeDependencyRecords( 

795 exposureFiles[0].datasets[0].obsInfo, self.universe 

796 ), 

797 ) 

798 for dataId, exposureFiles in byExposure.items() 

799 ] 

800 

801 def makeExposureRecord( 

802 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any 

803 ) -> DimensionRecord: 

804 """Construct a registry record for an exposure 

805 

806 This is a method that subclasses will often want to customize. This can 

807 often be done by calling this base class implementation with additional 

808 ``kwargs``. 

809 

810 Parameters 

811 ---------- 

812 obsInfo : `ObservationInfo` 

813 Observation details for (one of the components of) the exposure. 

814 universe : `DimensionUniverse` 

815 Set of all known dimensions. 

816 **kwargs 

817 Additional field values for this record. 

818 

819 Returns 

820 ------- 

821 record : `DimensionRecord` 

822 The exposure record that must be inserted into the 

823 `~lsst.daf.butler.Registry` prior to file-level ingest. 

824 """ 

825 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs) 

826 
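The customization described above might look like the following sketch; the subclass and the ``my_extra_field`` exposure field are hypothetical and would only work if the corresponding field exists in the exposure dimension:

.. code-block:: python

    from typing import Any

    from astro_metadata_translator import ObservationInfo
    from lsst.daf.butler import DimensionRecord, DimensionUniverse
    from lsst.obs.base import RawIngestTask


    class _MyInstrumentRawIngestTask(RawIngestTask):
        """Hypothetical subclass filling an extra exposure field."""

        def makeExposureRecord(
            self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
        ) -> DimensionRecord:
            # Derive the extra value from the translated metadata and let
            # the base class build the record as usual.
            extra = obsInfo.science_program or "unknown"
            return super().makeExposureRecord(obsInfo, universe, my_extra_field=extra, **kwargs)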

827 def makeDependencyRecords( 

828 self, obsInfo: ObservationInfo, universe: DimensionUniverse 

829 ) -> Dict[str, DimensionRecord]: 

830 """Construct dependency records 

831 

832 These dependency records will be inserted into the 

833 `~lsst.daf.butler.Registry` before the exposure records, because they 

834 are dependencies of the exposure. This allows an opportunity to satisfy 

835 foreign key constraints that exist because of dimensions related to the 

836 exposure. 

837 

838 This is a method that subclasses may want to customize, if they've 

839 added dimensions that relate to an exposure. 

840 

841 Parameters 

842 ---------- 

843 obsInfo : `ObservationInfo` 

844 Observation details for (one of the components of) the exposure. 

845 universe : `DimensionUniverse` 

846 Set of all known dimensions. 

847 

848 Returns 

849 ------- 

850 records : `dict` [`str`, `DimensionRecord`] 

851 The records to insert, indexed by dimension name. 

852 """ 

853 return {} 

854 
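For illustration, an override might look like this sketch; ``my_dimension`` and its record fields are hypothetical stand-ins for whatever exposure-related dimension a subclass has added to its universe:

.. code-block:: python

    from typing import Dict

    from astro_metadata_translator import ObservationInfo
    from lsst.daf.butler import DimensionRecord, DimensionUniverse
    from lsst.obs.base import RawIngestTask


    class _DependencyAwareIngestTask(RawIngestTask):
        """Hypothetical subclass with one extra exposure dependency."""

        def makeDependencyRecords(
            self, obsInfo: ObservationInfo, universe: DimensionUniverse
        ) -> Dict[str, DimensionRecord]:
            element = universe["my_dimension"]
            # Build the record from translated metadata so it can be synced
            # before the exposure record that references it.
            record = element.RecordClass(
                instrument=obsInfo.instrument,
                name=obsInfo.exposure_group,
            )
            return {"my_dimension": record}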

855 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

856 """Expand the data IDs associated with a raw exposure. 

857 

858 This adds the metadata records. 

859 

860 Parameters 

861 ---------- 

862 data : `RawExposureData`

863 A structure containing information about the exposure to be 

864 ingested. Must have `RawExposureData.record` populated. Should 

865 be considered consumed upon return. 

866 

867 Returns 

868 ------- 

869 exposure : `RawExposureData` 

870 An updated version of the input structure, with 

871 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

872 updated to data IDs for which 

873 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

874 """ 

875 # We start by expanding the exposure-level data ID; we won't use that

876 # directly in file ingest, but this lets us do some database lookups 

877 # once per exposure instead of once per file later. 

878 data.dataId = self.butler.registry.expandDataId( 

879 data.dataId, 

880 # We pass in the records we'll be inserting shortly so they aren't 

881 # looked up from the database. We do expect instrument and filter 

882 # records to be retrieved from the database here (though the 

883 # Registry may cache them so there isn't a lookup every time). 

884 records={"exposure": data.record}, 

885 ) 

886 # Now we expand the per-file (exposure+detector) data IDs. This time 

887 # we pass in the records we just retrieved from the exposure data ID 

888 # expansion. 

889 for file in data.files: 

890 for dataset in file.datasets: 

891 dataset.dataId = self.butler.registry.expandDataId( 

892 dataset.dataId, records=data.dataId.records 

893 ) 

894 return data 

895 

896 def prep( 

897 self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None, processes: int = 1 

898 ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]: 

899 """Perform all non-database-updating ingest preprocessing steps. 

900 

901 Parameters 

902 ---------- 

903 files : iterable over `lsst.resources.ResourcePath`

904 URIs of the files to be ingested. Will be made absolute

905 if they are not already.

906 pool : `multiprocessing.Pool`, optional 

907 If not `None`, a process pool with which to parallelize some 

908 operations. 

909 processes : `int`, optional 

910 The number of processes to use. Ignored if ``pool`` is not `None`. 

911 

912 Returns 

913 ------- 

914 exposures : `Iterator` [ `RawExposureData` ] 

915 Data structures containing dimension records, filenames, and data 

916 IDs to be ingested (one structure for each exposure). 

917 bad_files : `list` of `lsst.resources.ResourcePath`

918 List of all the files that could not have metadata extracted. 

919 """ 

920 if pool is None and processes > 1: 

921 pool = Pool(processes) 

922 mapFunc = map if pool is None else pool.imap_unordered 

923 

924 def _partition_good_bad( 

925 file_data: Iterable[RawFileData], 

926 ) -> Tuple[List[RawFileData], List[ResourcePath]]: 

927 """Filter out bad files and return good with list of bad.""" 

928 good_files = [] 

929 bad_files = [] 

930 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"): 

931 if not fileDatum.datasets: 

932 bad_files.append(fileDatum.filename) 

933 else: 

934 good_files.append(fileDatum) 

935 return good_files, bad_files 

936 

937 # Look for index files and read them. 

938 # There should be far fewer index files than data files. 

939 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

940 if bad_index_files: 

941 self.log.info("Failed to read the following explicitly requested index files:") 

942 for bad in sorted(bad_index_files): 

943 self.log.info("- %s", bad) 

944 

945 # Now convert all the index file entries to standard form for ingest. 

946 processed_bad_index_files: List[ResourcePath] = [] 

947 indexFileData = self.processIndexEntries(index_entries) 

948 if indexFileData: 

949 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData) 

950 self.log.info( 

951 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s", 

952 *_log_msg_counter(indexFileData), 

953 *_log_msg_counter(good_index_files), 

954 *_log_msg_counter(processed_bad_index_files), 

955 ) 

956 

957 # Extract metadata and build per-detector regions. 

958 # This could run in a subprocess so collect all output 

959 # before looking at failures. 

960 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

961 

962 # Filter out all the failed reads and store them for later 

963 # reporting. 

964 good_file_data, bad_files = _partition_good_bad(fileData) 

965 self.log.info( 

966 "Successfully extracted metadata from %d file%s with %d failure%s", 

967 *_log_msg_counter(good_file_data), 

968 *_log_msg_counter(bad_files), 

969 ) 

970 

971 # Combine with data from index files. 

972 good_file_data.extend(indexFileData) 

973 bad_files.extend(processed_bad_index_files) 

974 bad_files.extend(bad_index_files) 

975 

976 # Use that metadata to group files (and extracted metadata) by 

977 # exposure. Never parallelized because it's intrinsically a gather 

978 # step. 

979 exposureData: List[RawExposureData] = self.groupByExposure(good_file_data) 

980 

981 # The next operation operates on RawExposureData instances (one at 

982 # a time) in-place and then returns the modified instance. We call it 

983 # as a pass-through instead of relying on the arguments we pass in to 

984 # have been modified because in the parallel case those arguments are 

985 # going to be pickled and unpickled, and I'm not certain 

986 # multiprocessing is careful enough with that for output arguments to 

987 # work. 

988 

989 # Expand the data IDs to include all dimension metadata; we need this 

990 # because we may need to generate path templates that rely on that 

991 # metadata. 

992 # This is the first step that involves actual database calls (but just 

993 # SELECTs), so if there's going to be a problem with connections vs. 

994 # multiple processes, or lock contention (in SQLite) slowing things 

995 # down, it'll happen here. 

996 return mapFunc(self.expandDataIds, exposureData), bad_files 

997 

998 def ingestExposureDatasets( 

999 self, 

1000 exposure: RawExposureData, 

1001 *, 

1002 run: Optional[str] = None, 

1003 skip_existing_exposures: bool = False, 

1004 track_file_attrs: bool = True, 

1005 ) -> List[FileDataset]: 

1006 """Ingest all raw files in one exposure. 

1007 

1008 Parameters 

1009 ---------- 

1010 exposure : `RawExposureData` 

1011 A structure containing information about the exposure to be 

1012 ingested. Must have `RawExposureData.record` populated and all

1013 data ID attributes expanded. 

1014 run : `str`, optional 

1015 Name of a RUN-type collection to write to, overriding 

1016 ``self.butler.run``. 

1017 skip_existing_exposures : `bool`, optional 

1018 If `True` (`False` is default), skip raws that have already been 

1019 ingested (i.e. raws for which we already have a dataset with the 

1020 same data ID in the target collection, even if from another file). 

1021 Note that this is much slower than just not passing 

1022 already-ingested files as inputs, because we still need to read and 

1023 process metadata to identify which exposures to search for. It 

1024 also will not work reliably if multiple processes are attempting to 

1025 ingest raws from the same exposure concurrently, in that different 

1026 processes may still attempt to ingest the same raw and conflict, 

1027 causing a failure that prevents other raws from the same exposure 

1028 from being ingested. 

1029 track_file_attrs : `bool`, optional 

1030 Control whether file attributes such as the size or checksum should 

1031 be tracked by the datastore. Whether this parameter is honored 

1032 depends on the specific datastore implementation.

1033 

1034 Returns 

1035 ------- 

1036 datasets : `list` of `lsst.daf.butler.FileDataset` 

1037 Per-file structures identifying the files ingested and their 

1038 dataset representation in the data repository. 

1039 """ 

1040 if skip_existing_exposures: 

1041 existing = { 

1042 ref.dataId 

1043 for ref in self.butler.registry.queryDatasets( 

1044 self.datasetType, 

1045 collections=[run], 

1046 dataId=exposure.dataId, 

1047 ) 

1048 } 

1049 else: 

1050 existing = set() 

1051 datasets = [] 

1052 for file in exposure.files: 

1053 refs = [DatasetRef(self.datasetType, d.dataId) for d in file.datasets if d.dataId not in existing] 

1054 if refs: 

1055 datasets.append( 

1056 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass) 

1057 ) 

1058 

1059 # Raw files are preferentially ingested using a UUID derived from 

1060 # the collection name and dataId. 

1061 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN): 

1062 mode = DatasetIdGenEnum.DATAID_TYPE_RUN 

1063 else: 

1064 mode = DatasetIdGenEnum.UNIQUE 

1065 self.butler.ingest( 

1066 *datasets, 

1067 transfer=self.config.transfer, 

1068 run=run, 

1069 idGenerationMode=mode, 

1070 record_validation_info=track_file_attrs, 

1071 ) 

1072 return datasets 

1073 

1074 def ingestFiles( 

1075 self, 

1076 files: Iterable[ResourcePath], 

1077 *, 

1078 pool: Optional[PoolType] = None, 

1079 processes: int = 1, 

1080 run: Optional[str] = None, 

1081 skip_existing_exposures: bool = False, 

1082 update_exposure_records: bool = False, 

1083 track_file_attrs: bool = True, 

1084 ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]: 

1085 """Ingest files into a Butler data repository. 

1086 

1087 This creates any new exposure or visit Dimension entries needed to 

1088 identify the ingested files, creates new Dataset entries in the 

1089 Registry and finally ingests the files themselves into the Datastore. 

1090 Any needed instrument, detector, and physical_filter Dimension entries 

1091 must exist in the Registry before `run` is called. 

1092 

1093 Parameters 

1094 ---------- 

1095 files : iterable over `lsst.resources.ResourcePath` 

1096 URIs to the files to be ingested. 

1097 pool : `multiprocessing.Pool`, optional 

1098 If not `None`, a process pool with which to parallelize some 

1099 operations. 

1100 processes : `int`, optional 

1101 The number of processes to use. Ignored if ``pool`` is not `None`. 

1102 run : `str`, optional 

1103 Name of a RUN-type collection to write to, overriding 

1104 the default derived from the instrument name. 

1105 skip_existing_exposures : `bool`, optional 

1106 If `True` (`False` is default), skip raws that have already been 

1107 ingested (i.e. raws for which we already have a dataset with the 

1108 same data ID in the target collection, even if from another file). 

1109 Note that this is much slower than just not passing 

1110 already-ingested files as inputs, because we still need to read and 

1111 process metadata to identify which exposures to search for. It 

1112 also will not work reliably if multiple processes are attempting to 

1113 ingest raws from the same exposure concurrently, in that different 

1114 processes may still attempt to ingest the same raw and conflict, 

1115 causing a failure that prevents other raws from the same exposure 

1116 from being ingested. 

1117 update_exposure_records : `bool`, optional 

1118 If `True` (`False` is default), update existing exposure records 

1119 that conflict with the new ones instead of rejecting them. THIS IS 

1120 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1121 KNOWN TO BE BAD. This should usually be combined with 

1122 ``skip_existing_exposures=True``. 

1123 track_file_attrs : `bool`, optional 

1124 Control whether file attributes such as the size or checksum should 

1125 be tracked by the datastore. Whether this parameter is honored 

1126 depends on the specific datastore implementation.

1127 

1128 Returns 

1129 ------- 

1130 refs : `list` of `lsst.daf.butler.DatasetRef` 

1131 Dataset references for ingested raws. 

1132 bad_files : `list` of `ResourcePath` 

1133 Given paths that could not be ingested. 

1134 n_exposures : `int` 

1135 Number of exposures successfully ingested. 

1136 n_exposures_failed : `int` 

1137 Number of exposures that failed when inserting dimension data. 

1138 n_ingests_failed : `int` 

1139 Number of exposures that failed when ingesting raw datasets. 

1140 """ 

1141 

1142 exposureData, bad_files = self.prep(files, pool=pool, processes=processes) 

1143 

1144 # Up to this point, we haven't modified the data repository at all. 

1145 # Now we finally do that, with one transaction per exposure. This is 

1146 # not parallelized at present because the performance of this step is 

1147 # limited by the database server. That may or may not change in the 

1148 # future once we increase our usage of bulk inserts and reduce our 

1149 # usage of savepoints; we've tried to get everything but the database 

1150 # operations done in advance to reduce the time spent inside 

1151 # transactions. 

1152 self.butler.registry.registerDatasetType(self.datasetType) 

1153 

1154 refs = [] 

1155 runs = set() 

1156 n_exposures = 0 

1157 n_exposures_failed = 0 

1158 n_ingests_failed = 0 

1159 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"): 

1160 assert exposure.record is not None, "Should be guaranteed by prep()" 

1161 self.log.debug( 

1162 "Attempting to ingest %d file%s from exposure %s:%s", 

1163 *_log_msg_counter(exposure.files), 

1164 exposure.record.instrument, 

1165 exposure.record.obs_id, 

1166 ) 

1167 

1168 try: 

1169 for name, record in exposure.dependencyRecords.items(): 

1170 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records) 

1171 inserted_or_updated = self.butler.registry.syncDimensionData( 

1172 "exposure", 

1173 exposure.record, 

1174 update=update_exposure_records, 

1175 ) 

1176 except Exception as e: 

1177 self._on_ingest_failure(exposure, e) 

1178 n_exposures_failed += 1 

1179 self.log.warning( 

1180 "Exposure %s:%s could not be registered: %s", 

1181 exposure.record.instrument, 

1182 exposure.record.obs_id, 

1183 e, 

1184 ) 

1185 if self.config.failFast: 

1186 raise e 

1187 continue 

1188 

1189 if isinstance(inserted_or_updated, dict): 

1190 # Exposure is in the registry and we updated it, so 

1191 # syncDimensionData returned a dict. 

1192 self.log.info( 

1193 "Exposure %s:%s was already present, but columns %s were updated.", 

1194 exposure.record.instrument, 

1195 exposure.record.obs_id, 

1196 str(list(inserted_or_updated.keys())), 

1197 ) 

1198 

1199 # Override default run if nothing specified explicitly. 

1200 if run is None: 

1201 instrument = exposure.files[0].instrument 

1202 assert ( 

1203 instrument is not None 

1204 ), "file should have been removed from this list by prep if instrument could not be found" 

1205 this_run = instrument.makeDefaultRawIngestRunName() 

1206 else: 

1207 this_run = run 

1208 if this_run not in runs: 

1209 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

1210 runs.add(this_run) 

1211 try: 

1212 datasets_for_exposure = self.ingestExposureDatasets( 

1213 exposure, 

1214 run=this_run, 

1215 skip_existing_exposures=skip_existing_exposures, 

1216 track_file_attrs=track_file_attrs, 

1217 ) 

1218 except Exception as e: 

1219 self._on_ingest_failure(exposure, e) 

1220 n_ingests_failed += 1 

1221 self.log.warning("Failed to ingest the following for reason: %s", e) 

1222 for f in exposure.files: 

1223 self.log.warning("- %s", f.filename) 

1224 if self.config.failFast: 

1225 raise e 

1226 continue 

1227 else: 

1228 self._on_success(datasets_for_exposure) 

1229 for dataset in datasets_for_exposure: 

1230 refs.extend(dataset.refs) 

1231 

1232 # Success for this exposure. 

1233 n_exposures += 1 

1234 self.log.info( 

1235 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id 

1236 ) 

1237 

1238 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

1239 

1240 @timeMethod 

1241 def run( 

1242 self, 

1243 files: Iterable[ResourcePathExpression], 

1244 *, 

1245 pool: Optional[PoolType] = None, 

1246 processes: int = 1, 

1247 run: Optional[str] = None, 

1248 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", 

1249 group_files: bool = True, 

1250 skip_existing_exposures: bool = False, 

1251 update_exposure_records: bool = False, 

1252 track_file_attrs: bool = True, 

1253 ) -> List[DatasetRef]: 

1254 """Ingest files into a Butler data repository. 

1255 

1256 This creates any new exposure or visit Dimension entries needed to 

1257 identify the ingested files, creates new Dataset entries in the 

1258 Registry and finally ingests the files themselves into the Datastore. 

1259 Any needed instrument, detector, and physical_filter Dimension entries 

1260 must exist in the Registry before `run` is called. 

1261 

1262 Parameters 

1263 ---------- 

1264 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like

1265 Paths to the files to be ingested. Can refer to directories. 

1266 Will be made absolute if they are not already. 

1267 pool : `multiprocessing.Pool`, optional 

1268 If not `None`, a process pool with which to parallelize some 

1269 operations. 

1270 processes : `int`, optional 

1271 The number of processes to use. Ignored if ``pool`` is not `None`. 

1272 run : `str`, optional 

1273 Name of a RUN-type collection to write to, overriding 

1274 the default derived from the instrument name. 

1275 file_filter : `str` or `re.Pattern`, optional 

1276 Pattern to use to discover files to ingest within directories. 

1277 The default is to search for FITS files. The regex applies to 

1278 files within the directory. 

1279 group_files : `bool`, optional 

1280 Group files by directory if they have been discovered in 

1281 directories. Will not affect files explicitly provided. 

1282 skip_existing_exposures : `bool`, optional 

1283 If `True` (`False` is default), skip raws that have already been 

1284 ingested (i.e. raws for which we already have a dataset with the 

1285 same data ID in the target collection, even if from another file). 

1286 Note that this is much slower than just not passing 

1287 already-ingested files as inputs, because we still need to read and 

1288 process metadata to identify which exposures to search for. It 

1289 also will not work reliably if multiple processes are attempting to 

1290 ingest raws from the same exposure concurrently, in that different 

1291 processes may still attempt to ingest the same raw and conflict, 

1292 causing a failure that prevents other raws from the same exposure 

1293 from being ingested. 

1294 update_exposure_records : `bool`, optional 

1295 If `True` (`False` is default), update existing exposure records 

1296 that conflict with the new ones instead of rejecting them. THIS IS 

1297 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1298 KNOWN TO BE BAD. This should usually be combined with 

1299 ``skip_existing_exposures=True``. 

1300 track_file_attrs : `bool`, optional 

1301 Control whether file attributes such as the size or checksum should 

1302 be tracked by the datastore. Whether this parameter is honored 

1303 depends on the specific datastore implementation.

1304 

1305 Returns 

1306 ------- 

1307 refs : `list` of `lsst.daf.butler.DatasetRef` 

1308 Dataset references for ingested raws. 

1309 

1310 Notes 

1311 ----- 

1312 This method inserts all datasets for an exposure within a transaction, 

1313 guaranteeing that partial exposures are never ingested. The exposure 

1314 dimension record is inserted with `Registry.syncDimensionData` first 

1315 (in its own transaction), which inserts only if a record with the same 

1316 primary key does not already exist. This allows different files within 

1317 the same exposure to be ingested in different runs. 

1318 """ 

1319 

1320 refs = [] 

1321 bad_files = [] 

1322 n_exposures = 0 

1323 n_exposures_failed = 0 

1324 n_ingests_failed = 0 

1325 if group_files: 

1326 for group in ResourcePath.findFileResources(files, file_filter, group_files): 

1327 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles( 

1328 group, 

1329 pool=pool, 

1330 processes=processes, 

1331 run=run, 

1332 skip_existing_exposures=skip_existing_exposures, 

1333 update_exposure_records=update_exposure_records, 

1334 track_file_attrs=track_file_attrs, 

1335 ) 

1336 refs.extend(new_refs) 

1337 bad_files.extend(bad) 

1338 n_exposures += n_exp 

1339 n_exposures_failed += n_exp_fail 

1340 n_ingests_failed += n_ingest_fail 

1341 else: 

1342 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

1343 ResourcePath.findFileResources(files, file_filter, group_files), 

1344 pool=pool, 

1345 processes=processes, 

1346 run=run, 

1347 skip_existing_exposures=skip_existing_exposures, 

1348 update_exposure_records=update_exposure_records, track_file_attrs=track_file_attrs,

1349 ) 

1350 

1351 had_failure = False 

1352 

1353 if bad_files: 

1354 had_failure = True 

1355 self.log.warning("Could not extract observation metadata from the following:") 

1356 for f in bad_files: 

1357 self.log.warning("- %s", f) 

1358 

1359 self.log.info( 

1360 "Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1361 " registration and %d failure%s from file ingest.", 

1362 *_log_msg_counter(n_exposures), 

1363 *_log_msg_counter(n_exposures_failed), 

1364 *_log_msg_counter(n_ingests_failed), 

1365 ) 

1366 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1367 had_failure = True 

1368 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1369 

1370 if had_failure: 

1371 raise RuntimeError("Some failures encountered during ingestion") 

1372 

1373 return refs
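For completeness, a hedged end-to-end sketch of driving ingest from Python; the repository path, file location, and run collection name are placeholders:

.. code-block:: python

    from lsst.daf.butler import Butler
    from lsst.obs.base import RawIngestConfig, RawIngestTask


    def report_bad_file(uri, exc):
        # Minimal on_metadata_failure callback: just record the failure.
        print(f"Could not translate metadata for {uri}: {exc}")


    butler = Butler("/path/to/repo", writeable=True)
    config = RawIngestConfig()
    config.transfer = "symlink"
    task = RawIngestTask(
        config=config,
        butler=butler,
        on_metadata_failure=report_bad_file,
    )
    # run() accepts files and/or directories; directories are searched for
    # FITS files using the default file_filter, and a RuntimeError is raised
    # if any file fails to ingest.
    refs = task.run(["/path/to/raw/files"], run="MyCam/raw/all")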