Coverage for python/lsst/obs/base/ingest.py: 18%

341 statements

coverage.py v6.4, created at 2022-05-27 11:22 +0000

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import json
import re
from collections import defaultdict
from dataclasses import InitVar, dataclass
from multiprocessing import Pool
from typing import (
    Any,
    Callable,
    ClassVar,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Sized,
    Tuple,
    Type,
    Union,
)

from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers
from astro_metadata_translator.indexing import process_index_data, process_sidecar_data
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
    Progress,
)
from lsst.pex.config import ChoiceField, Config, Field
from lsst.pipe.base import Instrument, Task
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.timer import timeMethod

from ._instrument import makeExposureRecordFromObsInfo

# multiprocessing.Pool is actually a function, not a type, and the real type
# isn't exposed, so we can't use it in annotations, so we'll just punt on it
# via this alias instead.
PoolType = Any


def _do_nothing(*args: Any, **kwargs: Any) -> None:
    """Do nothing.

    This is a function that accepts anything and does nothing.
    For use as a default in callback arguments.
    """
    pass


def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]:
    """Count the iterable and return the count and plural modifier.

    Parameters
    ----------
    noun : `Sized` or `int`
        Thing to count. If given an integer it is assumed to be the count
        to use to calculate the modifier.

    Returns
    -------
    num : `int`
        Number of items found in ``noun``.
    modifier : `str`
        Character to add to the end of a string referring to these items
        to indicate whether it was a single item or not. Returns empty
        string if there is one item or "s" otherwise.

    Examples
    --------

    .. code-block:: python

        log.warning("Found %d file%s", *_log_msg_counter(nfiles))
    """
    if isinstance(noun, int):
        num = noun
    else:
        num = len(noun)
    return num, "" if num == 1 else "s"


@dataclass
class RawFileDatasetInfo:
    """Information about a single dataset within a raw file."""

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Information about a single raw file, used during ingest."""

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file.
    (`list` of `RawFileDatasetInfo`)
    """

    filename: ResourcePath
    """URI of the file this information was extracted from
    (`lsst.resources.ResourcePath`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[Formatter]
    """Formatter class that should be used to ingest this file (`type`; as
    subclass of `Formatter`).
    """

    instrument: Optional[Instrument]
    """The `Instrument` instance associated with this file. Can be `None`
    if ``datasets`` is an empty list."""


@dataclass
class RawExposureData:
    """Information about a complete raw exposure, used during ingest."""

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: DimensionRecord
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    dependencyRecords: Dict[str, DimensionRecord]
    """Additional records that must be inserted into the
    `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record``
    (e.g., to satisfy foreign key constraints), indexed by the dimension name.
    """


def makeTransferChoiceField(
    doc: str = "How to transfer files (None for no transfer).", default: str = "auto"
) -> ChoiceField:
    """Create a Config field with options for transferring data between repos.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.

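    Examples
    --------
    A minimal sketch of typical use inside a config class; the
    ``ExampleIngestConfig`` name is illustrative and not part of this module:

    .. code-block:: python

        from lsst.pex.config import Config


        class ExampleIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")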
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={
            "move": "move",
            "copy": "copy",
            "auto": "choice will depend on datastore",
            "direct": "use URI to ingested file directly in datastore",
            "link": "hard link falling back to symbolic link",
            "hardlink": "hard link",
            "symlink": "symbolic (soft) link",
            "relsymlink": "relative symbolic link",
        },
        optional=True,
        default=default,
    )


class RawIngestConfig(Config):
    """Configuration class for RawIngestTask."""

    transfer = makeTransferChoiceField()
    failFast = Field(
        dtype=bool,
        default=False,
        doc="If True, stop ingest as soon as any problem is encountered with any file. "
        "Otherwise problem files will be skipped and logged and a report issued at completion.",
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    on_success : `Callable`, optional
        A callback invoked when all of the raws associated with an exposure
        are ingested. Will be passed a list of `FileDataset` objects, each
        containing one or more resolved `DatasetRef` objects. If this callback
        raises it will interrupt the entire ingest process, even if
        `RawIngestConfig.failFast` is `False`.
    on_metadata_failure : `Callable`, optional
        A callback invoked when a failure occurs trying to translate the
        metadata for a file. Will be passed the URI and the exception, in
        that order, as positional arguments. Guaranteed to be called in an
        ``except`` block, allowing the callback to re-raise or replace (with
        ``raise ... from``) to override the task's usual error handling (before
        `RawIngestConfig.failFast` logic occurs).
    on_ingest_failure : `Callable`, optional
        A callback invoked when dimension record or dataset insertion into the
        database fails for an exposure. Will be passed a `RawExposureData`
        instance and the exception, in that order, as positional arguments.
        Guaranteed to be called in an ``except`` block, allowing the callback
        to re-raise or replace (with ``raise ... from``) to override the task's
        usual error handling (before `RawIngestConfig.failFast` logic occurs).
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

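    Examples
    --------
    A minimal, illustrative invocation; the repository path, transfer mode,
    and file names are placeholders rather than values defined by this
    module:

    .. code-block:: python

        from lsst.daf.butler import Butler
        from lsst.obs.base import RawIngestConfig, RawIngestTask

        butler = Butler("/path/to/repo", writeable=True)
        config = RawIngestConfig()
        config.transfer = "direct"
        task = RawIngestTask(config=config, butler=butler)
        task.run(["/path/to/raw/file1.fits", "/path/to/raw/file2.fits"])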
    """

    ConfigClass: ClassVar[Type[Config]] = RawIngestConfig

    _DefaultName: ClassVar[str] = "ingest"

    def getDatasetType(self) -> DatasetType:
        """Return the DatasetType of the datasets ingested by this Task."""
        return DatasetType(
            "raw",
            ("instrument", "detector", "exposure"),
            "Exposure",
            universe=self.butler.registry.dimensions,
        )

    def __init__(
        self,
        config: RawIngestConfig,
        *,
        butler: Butler,
        on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
        on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing,
        on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
        **kwargs: Any,
    ):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        self._on_success = on_success
        self._on_metadata_failure = on_metadata_failure
        self._on_ingest_failure = on_ingest_failure
        self.progress = Progress("obs.base.RawIngestTask")

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self) -> Dict[str, Any]:
        # Add extra parameters to pickle.
        return dict(
            **super()._reduce_kwargs(),
            butler=self.butler,
            on_success=self._on_success,
            on_metadata_failure=self._on_metadata_failure,
            on_ingest_failure=self._on_ingest_failure,
        )

    def _determine_instrument_formatter(
        self, dataId: DataCoordinate, filename: ResourcePath
    ) -> Tuple[Optional[Instrument], Type[Formatter]]:
        """Determine the instrument and formatter class.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            The dataId associated with this dataset.
        filename : `lsst.resources.ResourcePath`
            URI of file used for error reporting.

        Returns
        -------
        instrument : `Instrument` or `None`
            Instance of the `Instrument` associated with this dataset. `None`
            indicates that the instrument could not be determined.
        formatterClass : `type`
            Class to be used as the formatter for this dataset.
        """
        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        try:
            instrument = Instrument.fromName(dataId["instrument"], self.butler.registry)  # type: ignore
        except LookupError as e:
            self._on_metadata_failure(filename, e)
            self.log.warning(
                "Instrument %s for file %s not known to registry", dataId["instrument"], filename
            )
            if self.config.failFast:
                raise RuntimeError(
                    f"Instrument {dataId['instrument']} for file {filename} not known to registry"
                ) from e
            FormatterClass = Formatter
            # Indicate that we could not work out the instrument.
            instrument = None
        else:
            assert instrument is not None, "Should be guaranteed by fromName succeeding."
            FormatterClass = instrument.getRawFormatter(dataId)
        return instrument, FormatterClass

    def extractMetadata(self, filename: ResourcePath) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `lsst.resources.ResourcePath`
            URI to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
            ``instrument`` field will be `None` if there is a problem
            with metadata extraction.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.

        By default the method will catch all exceptions unless the ``failFast``
        configuration item is `True`. If an error is encountered the
        `_on_metadata_failure()` method will be called. If an error is
        encountered but no exception is raised, the returned object will have
        a `None` instrument and no datasets.

        This method supports sidecar JSON files which can be used to
        extract metadata without having to read the data file itself.
        The sidecar file is always used if found.

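        Examples
        --------
        A brief, illustrative call; the ``task`` instance and the file name
        are placeholders:

        .. code-block:: python

            data = task.extractMetadata(ResourcePath("raw/exposure-0001.fits"))
            if not data.datasets:
                print("Metadata extraction failed for", data.filename)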
        """
        sidecar_fail_msg = ""  # Requires prepended space when set.
        try:
            sidecar_file = filename.updatedExtension(".json")
            if sidecar_file.exists():
                content = json.loads(sidecar_file.read())
                headers = [process_sidecar_data(content)]
                sidecar_fail_msg = " (via sidecar)"
            else:
                # Read the metadata from the data file itself.

                # For remote files download the entire file to get the
                # header. This is very inefficient and it would be better
                # to have some way of knowing where in the file the headers
                # are and to only download those parts of the file.
                with filename.as_local() as local_file:
                    # Read the primary. This might be sufficient.
                    header = readMetadata(local_file.ospath, 0)

                    try:
                        # Try to work out a translator class early.
                        translator_class = MetadataTranslator.determine_translator(header, filename=filename)
                    except ValueError:
                        # Primary header was not sufficient (maybe this file
                        # has been compressed or is a MEF with minimal
                        # primary). Read second header and merge with primary.
                        header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")

                        # Try again to work out a translator class, letting this
                        # fail.
                        translator_class = MetadataTranslator.determine_translator(header, filename=filename)

                # Request the headers to use for ingest
                headers = translator_class.determine_translatable_headers(filename.ospath, header)

            # Add each header to the dataset list
            datasets = [self._calculate_dataset_info(h, filename) for h in headers]

        except Exception as e:
            self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            formatterClass = Formatter
            instrument = None
            self._on_metadata_failure(filename, e)
            if self.config.failFast:
                raise RuntimeError(
                    f"Problem extracting metadata for file {filename}{sidecar_fail_msg}"
                ) from e
        else:
            self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
            if instrument is None:
                datasets = []

        return RawFileData(
            datasets=datasets,
            filename=filename,
            # MyPy wants this to be a non-abstract class, which is not true
            # for the error case where instrument is None and datasets=[].
            FormatterClass=formatterClass,  # type: ignore
            instrument=instrument,
        )

    @classmethod
    def getObservationInfoSubsets(cls) -> Tuple[Set, Set]:
        """Return subsets of fields in the `ObservationInfo` that we care about.

        These fields will be used in constructing an exposure record.

        Returns
        -------
        required : `set`
            Set of `ObservationInfo` field names that are required.
        optional : `set`
            Set of `ObservationInfo` field names we will use if they are
            available.

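        Examples
        --------
        A sketch of how a subclass might extend the defaults; the
        ``ExampleIngestTask`` subclass and the extra ``focus_z`` property are
        assumptions for illustration:

        .. code-block:: python

            class ExampleIngestTask(RawIngestTask):
                @classmethod
                def getObservationInfoSubsets(cls):
                    required, optional = super().getObservationInfoSubsets()
                    optional |= {"focus_z"}
                    return required, optional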
        """
        required = {
            "datetime_begin",
            "datetime_end",
            "detector_num",
            "exposure_id",
            "exposure_time",
            "instrument",
            "observation_id",
            "observation_type",
            "physical_filter",
        }
        optional = {
            "altaz_begin",
            "boresight_rotation_coord",
            "boresight_rotation_angle",
            "dark_time",
            "exposure_group",
            "tracking_radec",
            "object",
            "observation_counter",
            "observation_reason",
            "observing_day",
            "science_program",
            "visit_id",
        }
        return required, optional

    def _calculate_dataset_info(
        self, header: Union[Mapping[str, Any], ObservationInfo], filename: ResourcePath
    ) -> RawFileDatasetInfo:
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : Mapping or `astro_metadata_translator.ObservationInfo`
            Header from the dataset or previously-translated content.
        filename : `lsst.resources.ResourcePath`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        required, optional = self.getObservationInfoSubsets()
        if isinstance(header, ObservationInfo):
            obsInfo = header
            missing = []
            # Need to check the required properties are present.
            for property in required:
                # getattr does not need to be protected because it is using
                # the defined list above containing properties that must exist.
                value = getattr(obsInfo, property)
                if value is None:
                    missing.append(property)
            if missing:
                raise ValueError(
                    f"Requested required properties are missing from file {filename}:"
                    f" {missing} (via JSON)"
                )

        else:
            obsInfo = ObservationInfo(
                header,
                pedantic=False,
                filename=str(filename),
                required=required,
                subset=required | optional,
            )

        dataId = DataCoordinate.standardize(
            instrument=obsInfo.instrument,
            exposure=obsInfo.exposure_id,
            detector=obsInfo.detector_num,
            universe=self.universe,
        )
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def locateAndReadIndexFiles(
        self, files: Iterable[ResourcePath]
    ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]:
        """Given a list of files, look for index files and read them.

        Index files can either be explicitly in the list of files to
        ingest, or else located in the same directory as a file to ingest.
        Index entries are always used if present.

        Parameters
        ----------
        files : iterable over `lsst.resources.ResourcePath`
            URIs to the files to be ingested.

        Returns
        -------
        index : `dict` [`ResourcePath`, `Any`]
            Merged contents of all relevant index files found. These can
            be explicitly specified index files or ones found in the
            directory alongside a data file to be ingested.
        updated_files : `list` of `ResourcePath`
            Updated list of the input files with entries removed that were
            found listed in an index file. Order is not guaranteed to
            match the order of the files given to this routine.
        good_index_files : `set` [`ResourcePath`]
            Index files that were successfully read.
        bad_index_files : `set` [`ResourcePath`]
            Files that looked like index files but failed to read properly.

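        Examples
        --------
        An illustrative call; the ``task`` instance and the file name are
        placeholders:

        .. code-block:: python

            index, remaining, good, bad = task.locateAndReadIndexFiles(
                [ResourcePath("raw/exposure-0001.fits")]
            )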
        """
        # Convert the paths to absolute for easy comparison with index content.
        # Do not convert to real paths since we have to assume that index
        # files are in this location and not the location which it links to.
        files = tuple(f.abspath() for f in files)

        # Index files must be named this.
        index_root_file = "_index.json"

        # Group the files by directory.
        files_by_directory = defaultdict(set)

        for path in files:
            directory, file_in_dir = path.split()
            files_by_directory[directory].add(file_in_dir)

        # All the metadata read from index files with keys of full path.
        index_entries: Dict[ResourcePath, Any] = {}

        # Index files we failed to read.
        bad_index_files = set()

        # Any good index files that were found and used.
        good_index_files = set()

        # Look for index files in those directories.
        for directory, files_in_directory in files_by_directory.items():
            possible_index_file = directory.join(index_root_file)
            if possible_index_file.exists():
                # If we are explicitly requesting an index file the
                # messages should be different.
                index_msg = "inferred"
                is_implied = True
                if index_root_file in files_in_directory:
                    index_msg = "explicit"
                    is_implied = False

                # Try to read the index file and catch and report any
                # problems.
                try:
                    content = json.loads(possible_index_file.read())
                    index = process_index_data(content, force_dict=True)
                except Exception as e:
                    # Only trigger the callback if the index file
                    # was asked for explicitly. Triggering on implied file
                    # might be surprising.
                    if not is_implied:
                        self._on_metadata_failure(possible_index_file, e)
                    if self.config.failFast:
                        raise RuntimeError(
                            f"Problem reading index file from {index_msg} location {possible_index_file}"
                        ) from e
                    bad_index_files.add(possible_index_file)
                    continue

                self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
                good_index_files.add(possible_index_file)

                # Go through the index adding entries for files.
                # If we have non-index files in this directory marked for
                # ingest we should only get index information for those.
                # If the index file was explicit we use all entries.
                if is_implied:
                    files_to_ingest = files_in_directory
                else:
                    files_to_ingest = set(index)

                # Copy relevant metadata into a single dict for all index
                # entries.
                for file_in_dir in files_to_ingest:
                    # Skip an explicitly specified index file.
                    # This should never happen because an explicit index
                    # file will force ingest of all files in the index
                    # and not use the explicit file list. If somehow
                    # this is not true we continue. Raising an exception
                    # seems like the wrong thing to do since this is harmless.
                    if file_in_dir == index_root_file:
                        self.log.info(
                            "Logic error found scanning directory %s. Please file ticket.", directory
                        )
                        continue
                    if file_in_dir in index:
                        file = directory.join(file_in_dir)
                        if file in index_entries:
                            # ObservationInfo overrides raw metadata
                            if isinstance(index[file_in_dir], ObservationInfo) and not isinstance(
                                index_entries[file], ObservationInfo
                            ):
                                self.log.warning(
                                    "File %s already specified in an index file but overriding"
                                    " with ObservationInfo content from %s",
                                    file,
                                    possible_index_file,
                                )
                            else:
                                self.log.warning(
                                    "File %s already specified in an index file, ignoring content from %s",
                                    file,
                                    possible_index_file,
                                )
                                # Do nothing in this case
                                continue

                        index_entries[file] = index[file_in_dir]

        # Remove files from list that have index entries and also
        # any files that we determined to be explicit index files
        # or any index files that we failed to read.
        filtered = set(files) - set(index_entries) - good_index_files - bad_index_files

        # The filtered list loses the initial order. Retaining the order
        # is good for testing but does have a cost if there are many
        # files when copying the good values out. A dict would have faster
        # lookups (using the files as keys) but use more memory.
        ordered = [f for f in filtered if f in files]

        return index_entries, ordered, good_index_files, bad_index_files

    def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]:
        """Convert index entries to RawFileData.

        Parameters
        ----------
        index_entries : `dict` [`ResourcePath`, `Any`]
            Dict indexed by the name of the file to ingest, with values that
            are either raw metadata or translated
            `~astro_metadata_translator.ObservationInfo`.

        Returns
        -------
        data : `list` [ `RawFileData` ]
            Structures containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attributes will be minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
        """
        fileData = []
        for filename, metadata in index_entries.items():
            try:
                datasets = [self._calculate_dataset_info(metadata, filename)]
            except Exception as e:
                self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e)
                datasets = []
                formatterClass = Formatter
                instrument = None
                self._on_metadata_failure(filename, e)
                if self.config.failFast:
                    raise RuntimeError(
                        f"Problem extracting metadata for file {filename} found in index file"
                    ) from e
            else:
                instrument, formatterClass = self._determine_instrument_formatter(
                    datasets[0].dataId, filename
                )
                if instrument is None:
                    datasets = []
            fileData.append(
                RawFileData(
                    datasets=datasets,
                    filename=filename,
                    # MyPy wants this to be a non-abstract class, which is not
                    # true for the error case where instrument is None and
                    # datasets=[].
                    FormatterClass=formatterClass,  # type: ignore
                    instrument=instrument,
                )
            )
        return fileData

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `~lsst.daf.butler.DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [
            RawExposureData(
                dataId=dataId,
                files=exposureFiles,
                universe=self.universe,
                record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe),
                dependencyRecords=self.makeDependencyRecords(
                    exposureFiles[0].datasets[0].obsInfo, self.universe
                ),
            )
            for dataId, exposureFiles in byExposure.items()
        ]

    def makeExposureRecord(
        self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any
    ) -> DimensionRecord:
        """Construct a registry record for an exposure.

        This is a method that subclasses will often want to customize. This can
        often be done by calling this base class implementation with additional
        ``kwargs``.

        Parameters
        ----------
        obsInfo : `ObservationInfo`
            Observation details for (one of the components of) the exposure.
        universe : `DimensionUniverse`
            Set of all known dimensions.
        **kwargs
            Additional field values for this record.

        Returns
        -------
        record : `DimensionRecord`
            The exposure record that must be inserted into the
            `~lsst.daf.butler.Registry` prior to file-level ingest.

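        Examples
        --------
        A sketch of a subclass override that forwards an extra field value as
        a keyword argument; ``my_extra_field`` is a hypothetical exposure
        field and would have to exist in the repository's dimension
        definitions:

        .. code-block:: python

            def makeExposureRecord(self, obsInfo, universe, **kwargs):
                return super().makeExposureRecord(
                    obsInfo, universe, my_extra_field=obsInfo.observation_reason, **kwargs
                )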
        """
        return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs)

    def makeDependencyRecords(
        self, obsInfo: ObservationInfo, universe: DimensionUniverse
    ) -> Dict[str, DimensionRecord]:
        """Construct dependency records.

        These dependency records will be inserted into the
        `~lsst.daf.butler.Registry` before the exposure records, because they
        are dependencies of the exposure. This allows an opportunity to satisfy
        foreign key constraints that exist because of dimensions related to the
        exposure.

        This is a method that subclasses may want to customize, if they've
        added dimensions that relate to an exposure.

        Parameters
        ----------
        obsInfo : `ObservationInfo`
            Observation details for (one of the components of) the exposure.
        universe : `DimensionUniverse`
            Set of all known dimensions.

        Returns
        -------
        records : `dict` [`str`, `DimensionRecord`]
            The records to insert, indexed by dimension name.

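        Examples
        --------
        A sketch of a subclass that populates a hypothetical ``my_dimension``
        dimension; the dimension name and its fields are illustrative only and
        would have to exist in the repository's dimension universe:

        .. code-block:: python

            def makeDependencyRecords(self, obsInfo, universe):
                element = universe["my_dimension"]
                record = element.RecordClass(
                    instrument=obsInfo.instrument,
                    name=obsInfo.exposure_group,
                )
                return {"my_dimension": record}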
        """
        return {}

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure.

        This adds the metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            updated to data IDs for which
            `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={"exposure": data.record},
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId, records=data.dataId.records
                )
        return data

    def prep(
        self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None, processes: int = 1
    ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]:
        """Perform all non-database-updating ingest preprocessing steps.

        Parameters
        ----------
        files : iterable over `lsst.resources.ResourcePath`
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : `Iterator` [ `RawExposureData` ]
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `ResourcePath`
            List of all the files that could not have metadata extracted.
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        def _partition_good_bad(
            file_data: Iterable[RawFileData],
        ) -> Tuple[List[RawFileData], List[ResourcePath]]:
            """Filter out bad files and return good with list of bad."""
            good_files = []
            bad_files = []
            for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"):
                if not fileDatum.datasets:
                    bad_files.append(fileDatum.filename)
                else:
                    good_files.append(fileDatum)
            return good_files, bad_files

        # Look for index files and read them.
        # There should be far fewer index files than data files.
        index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
        if bad_index_files:
            self.log.info("Failed to read the following explicitly requested index files:")
            for bad in sorted(bad_index_files):
                self.log.info("- %s", bad)

        # Now convert all the index file entries to standard form for ingest.
        processed_bad_index_files: List[ResourcePath] = []
        indexFileData = self.processIndexEntries(index_entries)
        if indexFileData:
            indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData)
            self.log.info(
                "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s",
                *_log_msg_counter(indexFileData),
                *_log_msg_counter(good_index_files),
                *_log_msg_counter(processed_bad_index_files),
            )

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        good_file_data, bad_files = _partition_good_bad(fileData)
        self.log.info(
            "Successfully extracted metadata from %d file%s with %d failure%s",
            *_log_msg_counter(good_file_data),
            *_log_msg_counter(bad_files),
        )

        # Combine with data from index files.
        good_file_data.extend(indexFileData)
        bad_files.extend(processed_bad_index_files)
        bad_files.extend(bad_index_files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(good_file_data)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(
        self,
        exposure: RawExposureData,
        *,
        run: Optional[str] = None,
        skip_existing_exposures: bool = False,
        track_file_attrs: bool = True,
    ) -> List[FileDataset]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.
        skip_existing_exposures : `bool`, optional
            If `True` (`False` is default), skip raws that have already been
            ingested (i.e. raws for which we already have a dataset with the
            same data ID in the target collection, even if from another file).
            Note that this is much slower than just not passing
            already-ingested files as inputs, because we still need to read and
            process metadata to identify which exposures to search for. It
            also will not work reliably if multiple processes are attempting to
            ingest raws from the same exposure concurrently, in that different
            processes may still attempt to ingest the same raw and conflict,
            causing a failure that prevents other raws from the same exposure
            from being ingested.
        track_file_attrs : `bool`, optional
            Control whether file attributes such as the size or checksum should
            be tracked by the datastore. Whether this parameter is honored
            depends on the specific datastore implementation.

        Returns
        -------
        datasets : `list` of `lsst.daf.butler.FileDataset`
            Per-file structures identifying the files ingested and their
            dataset representation in the data repository.
        """
        if skip_existing_exposures:
            existing = {
                ref.dataId
                for ref in self.butler.registry.queryDatasets(
                    self.datasetType,
                    collections=[run],
                    dataId=exposure.dataId,
                )
            }
        else:
            existing = set()
        datasets = []
        for file in exposure.files:
            refs = [DatasetRef(self.datasetType, d.dataId) for d in file.datasets if d.dataId not in existing]
            if refs:
                datasets.append(
                    FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass)
                )

        # Raw files are preferentially ingested using a UUID derived from
        # the collection name and dataId.
        if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN):
            mode = DatasetIdGenEnum.DATAID_TYPE_RUN
        else:
            mode = DatasetIdGenEnum.UNIQUE
        self.butler.ingest(
            *datasets,
            transfer=self.config.transfer,
            run=run,
            idGenerationMode=mode,
            record_validation_info=track_file_attrs,
        )
        return datasets

    def ingestFiles(
        self,
        files: Iterable[ResourcePath],
        *,
        pool: Optional[PoolType] = None,
        processes: int = 1,
        run: Optional[str] = None,
        skip_existing_exposures: bool = False,
        update_exposure_records: bool = False,
        track_file_attrs: bool = True,
    ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]:
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `lsst.resources.ResourcePath`
            URIs to the files to be ingested.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.
        skip_existing_exposures : `bool`, optional
            If `True` (`False` is default), skip raws that have already been
            ingested (i.e. raws for which we already have a dataset with the
            same data ID in the target collection, even if from another file).
            Note that this is much slower than just not passing
            already-ingested files as inputs, because we still need to read and
            process metadata to identify which exposures to search for. It
            also will not work reliably if multiple processes are attempting to
            ingest raws from the same exposure concurrently, in that different
            processes may still attempt to ingest the same raw and conflict,
            causing a failure that prevents other raws from the same exposure
            from being ingested.
        update_exposure_records : `bool`, optional
            If `True` (`False` is default), update existing exposure records
            that conflict with the new ones instead of rejecting them. THIS IS
            AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
            KNOWN TO BE BAD. This should usually be combined with
            ``skip_existing_exposures=True``.
        track_file_attrs : `bool`, optional
            Control whether file attributes such as the size or checksum should
            be tracked by the datastore. Whether this parameter is honored
            depends on the specific datastore implementation.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        bad_files : `list` of `ResourcePath`
            Given paths that could not be ingested.
        n_exposures : `int`
            Number of exposures successfully ingested.
        n_exposures_failed : `int`
            Number of exposures that failed when inserting dimension data.
        n_ingests_failed : `int`
            Number of exposures that failed when ingesting raw datasets.
        """

        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)

        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)

        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):
            assert exposure.record is not None, "Should be guaranteed by prep()"
            self.log.debug(
                "Attempting to ingest %d file%s from exposure %s:%s",
                *_log_msg_counter(exposure.files),
                exposure.record.instrument,
                exposure.record.obs_id,
            )

            try:
                for name, record in exposure.dependencyRecords.items():
                    self.butler.registry.syncDimensionData(name, record, update=update_exposure_records)
                inserted_or_updated = self.butler.registry.syncDimensionData(
                    "exposure",
                    exposure.record,
                    update=update_exposure_records,
                )
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_exposures_failed += 1
                self.log.warning(
                    "Exposure %s:%s could not be registered: %s",
                    exposure.record.instrument,
                    exposure.record.obs_id,
                    e,
                )
                if self.config.failFast:
                    raise e
                continue

            if isinstance(inserted_or_updated, dict):
                # Exposure is in the registry and we updated it, so
                # syncDimensionData returned a dict.
                self.log.info(
                    "Exposure %s:%s was already present, but columns %s were updated.",
                    exposure.record.instrument,
                    exposure.record.obs_id,
                    str(list(inserted_or_updated.keys())),
                )

            # Override default run if nothing specified explicitly.
            if run is None:
                instrument = exposure.files[0].instrument
                assert (
                    instrument is not None
                ), "file should have been removed from this list by prep if instrument could not be found"
                this_run = instrument.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                datasets_for_exposure = self.ingestExposureDatasets(
                    exposure,
                    run=this_run,
                    skip_existing_exposures=skip_existing_exposures,
                    track_file_attrs=track_file_attrs,
                )
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                if self.config.failFast:
                    raise e
                continue
            else:
                self._on_success(datasets_for_exposure)
                for dataset in datasets_for_exposure:
                    refs.extend(dataset.refs)

            # Success for this exposure.
            n_exposures += 1
            self.log.info(
                "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id
            )

        return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed

    @timeMethod
    def run(
        self,
        files: Iterable[ResourcePathExpression],
        *,
        pool: Optional[PoolType] = None,
        processes: int = 1,
        run: Optional[str] = None,
        file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b",
        group_files: bool = True,
        skip_existing_exposures: bool = False,
        update_exposure_records: bool = False,
        track_file_attrs: bool = True,
    ) -> List[DatasetRef]:
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable of `lsst.resources.ResourcePath`, `str` or path-like
            Paths to the files to be ingested. Can refer to directories.
            Will be made absolute if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.
        file_filter : `str` or `re.Pattern`, optional
            Pattern to use to discover files to ingest within directories.
            The default is to search for FITS files. The regex applies to
            files within the directory.
        group_files : `bool`, optional
            Group files by directory if they have been discovered in
            directories. Will not affect files explicitly provided.
        skip_existing_exposures : `bool`, optional
            If `True` (`False` is default), skip raws that have already been
            ingested (i.e. raws for which we already have a dataset with the
            same data ID in the target collection, even if from another file).
            Note that this is much slower than just not passing
            already-ingested files as inputs, because we still need to read and
            process metadata to identify which exposures to search for. It
            also will not work reliably if multiple processes are attempting to
            ingest raws from the same exposure concurrently, in that different
            processes may still attempt to ingest the same raw and conflict,
            causing a failure that prevents other raws from the same exposure
            from being ingested.
        update_exposure_records : `bool`, optional
            If `True` (`False` is default), update existing exposure records
            that conflict with the new ones instead of rejecting them. THIS IS
            AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS
            KNOWN TO BE BAD. This should usually be combined with
            ``skip_existing_exposures=True``.
        track_file_attrs : `bool`, optional
            Control whether file attributes such as the size or checksum should
            be tracked by the datastore. Whether this parameter is honored
            depends on the specific datastore implementation.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.

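        Examples
        --------
        An illustrative call that ingests every FITS file found under a
        directory into a named run collection; the path and the collection
        name are placeholders:

        .. code-block:: python

            refs = task.run(
                ["/path/to/raw/2022-05-27"],
                run="MyCam/raw/all",
                processes=4,
            )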
        """

        refs = []
        bad_files = []
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        if group_files:
            for group in ResourcePath.findFileResources(files, file_filter, group_files):
                new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(
                    group,
                    pool=pool,
                    processes=processes,
                    run=run,
                    skip_existing_exposures=skip_existing_exposures,
                    update_exposure_records=update_exposure_records,
                    track_file_attrs=track_file_attrs,
                )
                refs.extend(new_refs)
                bad_files.extend(bad)
                n_exposures += n_exp
                n_exposures_failed += n_exp_fail
                n_ingests_failed += n_ingest_fail
        else:
            refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
                ResourcePath.findFileResources(files, file_filter, group_files),
                pool=pool,
                processes=processes,
                run=run,
                skip_existing_exposures=skip_existing_exposures,
                update_exposure_records=update_exposure_records,
                track_file_attrs=track_file_attrs,
            )

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info(
            "Successfully processed data from %d exposure%s with %d failure%s from exposure"
            " registration and %d failure%s from file ingest.",
            *_log_msg_counter(n_exposures),
            *_log_msg_counter(n_exposures_failed),
            *_log_msg_counter(n_ingests_failed),
        )
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs