Coverage for python/lsst/obs/base/ingest.py: 17%

354 statements  

coverage.py v7.2.3, created at 2023-04-22 10:15 +0000

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import json 

26import re 

27import warnings 

28from collections import defaultdict 

29from dataclasses import InitVar, dataclass 

30from multiprocessing import Pool 

31from typing import ( 

32 Any, 

33 Callable, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 Iterator, 

38 List, 

39 MutableMapping, 

40 Optional, 

41 Set, 

42 Sized, 

43 Tuple, 

44 Type, 

45 Union, 

46) 

47 

48from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers 

49from astro_metadata_translator.indexing import process_index_data, process_sidecar_data 

50from lsst.afw.fits import readMetadata 

51from lsst.daf.butler import ( 

52 Butler, 

53 CollectionType, 

54 DataCoordinate, 

55 DatasetIdGenEnum, 

56 DatasetRef, 

57 DatasetType, 

58 DimensionRecord, 

59 DimensionUniverse, 

60 FileDataset, 

61 Formatter, 

62 Progress, 

63 UnresolvedRefWarning, 

64) 

65from lsst.pex.config import ChoiceField, Config, Field 

66from lsst.pipe.base import Instrument, Task 

67from lsst.resources import ResourcePath, ResourcePathExpression 

68from lsst.utils.timer import timeMethod 

69 

70from ._instrument import makeExposureRecordFromObsInfo 

71 

72# multiprocessing.Pool is actually a function, not a type, and the real type 

73# isn't exposed, so we can't use it in annotations; we'll just punt on it via 

74# this alias instead. 

75PoolType = Any 

76 

77 

78def _do_nothing(*args: Any, **kwargs: Any) -> None: 

79 """Do nothing. 

80 

81 This is a function that accepts anything and does nothing. 

82 For use as a default in callback arguments. 

83 """ 

84 pass 

85 

86 

87def _log_msg_counter(noun: Union[int, Sized]) -> Tuple[int, str]: 

88 """Count the iterable and return the count and plural modifier. 

89 

90 Parameters 

91 ---------- 

92 noun : `Sized` or `int` 

93 Thing to count. If given an integer it is assumed to be the count 

94 to use to calculate modifier. 

95 

96 Returns 

97 ------- 

98 num : `int` 

99 Number of items found in ``noun``. 

100 modifier : `str` 

101 Character to add to the end of a string referring to these items 

102 to indicate whether it was a single item or not. Returns empty 

103 string if there is one item or "s" otherwise. 

104 

105 Examples 

106 -------- 

107 

108 .. code-block:: python 

109 

110 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

111 """ 

112 if isinstance(noun, int): 

113 num = noun 

114 else: 

115 num = len(noun) 

116 return num, "" if num == 1 else "s" 

117 

118 

119@dataclass 

120class RawFileDatasetInfo: 

121 """Information about a single dataset within a raw file.""" 

122 

123 dataId: DataCoordinate 

124 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

125 

126 obsInfo: ObservationInfo 

127 """Standardized observation metadata extracted directly from the file 

128 headers (`astro_metadata_translator.ObservationInfo`). 

129 """ 

130 

131 

132@dataclass 

133class RawFileData: 

134 """Information about a single raw file, used during ingest.""" 

135 

136 datasets: List[RawFileDatasetInfo] 

137 """The information describing each dataset within this raw file. 

138 (`list` of `RawFileDatasetInfo`) 

139 """ 

140 

141 filename: ResourcePath 

142 """URI of the file this information was extracted from (`str`). 

143 

144 This is the path prior to ingest, not the path after ingest. 

145 """ 

146 

147 FormatterClass: Type[Formatter] 

148 """Formatter class that should be used to ingest this file (`type`; as 

149 subclass of `Formatter`). 

150 """ 

151 

152 instrument: Optional[Instrument] 

153 """The `Instrument` instance associated with this file. Can be `None` 

154 if ``datasets`` is an empty list.""" 

155 

156 

157@dataclass 

158class RawExposureData: 

159 """Information about a complete raw exposure, used during ingest.""" 

160 

161 dataId: DataCoordinate 

162 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

163 """ 

164 

165 files: List[RawFileData] 

166 """List of structures containing file-level information. 

167 """ 

168 

169 universe: InitVar[DimensionUniverse] 

170 """Set of all known dimensions. 

171 """ 

172 

173 record: DimensionRecord 

174 """The exposure `DimensionRecord` that must be inserted into the 

175 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`). 

176 """ 

177 

178 dependencyRecords: Dict[str, DimensionRecord] 

179 """Additional records that must be inserted into the 

180 `~lsst.daf.butler.Registry` prior to ingesting the exposure ``record`` 

181 (e.g., to satisfy foreign key constraints), indexed by the dimension name. 

182 """ 

183 

184 

185def makeTransferChoiceField( 

186 doc: str = "How to transfer files (None for no transfer).", default: str = "auto" 

187) -> ChoiceField: 

188 """Create a Config field with options for transferring data between repos. 

189 

190 The allowed options for the field are exactly those supported by 

191 `lsst.daf.butler.Datastore.ingest`. 

192 

193 Parameters 

194 ---------- 

195 doc : `str` 

196 Documentation for the configuration field. 

197 default : `str`, optional 

198 Default transfer mode for the field. 

199 

200 Returns 

201 ------- 

202 field : `lsst.pex.config.ChoiceField` 

203 Configuration field. 
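
    Examples
    --------
    A minimal sketch of how this field is typically used inside a task
    configuration (the config class name here is illustrative):

    .. code-block:: python

        from lsst.pex.config import Config


        class ExampleIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")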

204 """ 

205 return ChoiceField( 

206 doc=doc, 

207 dtype=str, 

208 allowed={ 

209 "move": "move", 

210 "copy": "copy", 

211 "auto": "choice will depend on datastore", 

212 "direct": "use URI to ingested file directly in datastore", 

213 "link": "hard link falling back to symbolic link", 

214 "hardlink": "hard link", 

215 "symlink": "symbolic (soft) link", 

216 "relsymlink": "relative symbolic link", 

217 }, 

218 optional=True, 

219 default=default, 

220 ) 

221 

222 

223class RawIngestConfig(Config): 

224 """Configuration class for RawIngestTask.""" 

225 

226 transfer = makeTransferChoiceField() 

227 failFast: Field[bool] = Field( 

228 dtype=bool, 

229 default=False, 

230 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

231 "Otherwise problem files will be skipped and logged and a report issued at completion.", 

232 ) 

233 

234 

235class RawIngestTask(Task): 

236 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

237 

238 Parameters 

239 ---------- 

240 config : `RawIngestConfig` 

241 Configuration for the task. 

242 butler : `~lsst.daf.butler.Butler` 

243 Writeable butler instance, with ``butler.run`` set to the appropriate 

244 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

245 datasets. 

246 on_success : `Callable`, optional 

247 A callback invoked when all of the raws associated with an exposure 

248 are ingested. Will be passed a list of `FileDataset` objects, each 

249 containing one or more resolved `DatasetRef` objects. If this callback 

250 raises it will interrupt the entire ingest process, even if 

251 `RawIngestConfig.failFast` is `False`. 

252 on_metadata_failure : `Callable`, optional 

253 A callback invoked when a failure occurs trying to translate the 

254 metadata for a file. Will be passed the URI and the exception, in 

255 that order, as positional arguments. Guaranteed to be called in an 

256 ``except`` block, allowing the callback to re-raise or replace (with 

257 ``raise ... from``) to override the task's usual error handling (before 

258 `RawIngestConfig.failFast` logic occurs). 

259 on_ingest_failure : `Callable`, optional 

260 A callback invoked when dimension record or dataset insertion into the 

261 database fails for an exposure. Will be passed a `RawExposureData` 

262 instance and the exception, in that order, as positional arguments. 

263 Guaranteed to be called in an ``except`` block, allowing the callback 

264 to re-raise or replace (with ``raise ... from``) to override the task's 

265 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

266 **kwargs 

267 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

268 constructor. 

269 

270 Notes 

271 ----- 

272 Each instance of `RawIngestTask` writes to the same Butler. Each 

273 invocation of `RawIngestTask.run` ingests a list of files. 
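
    Examples
    --------
    A minimal usage sketch, assuming an existing writeable repository at the
    hypothetical path ``/path/to/repo`` and raw files already on disk:

    .. code-block:: python

        from lsst.daf.butler import Butler
        from lsst.obs.base import RawIngestConfig, RawIngestTask

        butler = Butler("/path/to/repo", writeable=True)
        config = RawIngestConfig()
        config.transfer = "symlink"
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["/path/to/raw_data/"])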

274 """ 

275 

276 ConfigClass: ClassVar[Type[Config]] = RawIngestConfig 

277 

278 _DefaultName: ClassVar[str] = "ingest" 

279 

280 def getDatasetType(self) -> DatasetType: 

281 """Return the default DatasetType of the datasets ingested by this 

282 Task. 

283 

284 Returns 

285 ------- 

286 datasetType : `DatasetType` 

287 The default dataset type to use for the data being ingested. This 

288 is only used if the relevant `~lsst.pipe.base.Instrument` does not 

289 define an override. 

290 """ 

291 return DatasetType( 

292 "raw", 

293 ("instrument", "detector", "exposure"), 

294 "Exposure", 

295 universe=self.butler.registry.dimensions, 

296 ) 

297 

298 # Mypy can not determine that the config passed to super() is this type. 

299 config: RawIngestConfig 

300 

301 def __init__( 

302 self, 

303 config: RawIngestConfig, 

304 *, 

305 butler: Butler, 

306 on_success: Callable[[List[FileDataset]], Any] = _do_nothing, 

307 on_metadata_failure: Callable[[ResourcePath, Exception], Any] = _do_nothing, 

308 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

309 **kwargs: Any, 

310 ): 

311 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

312 super().__init__(config, **kwargs) 

313 self.butler = butler 

314 self.universe = self.butler.registry.dimensions 

315 self.datasetType = self.getDatasetType() 

316 self._on_success = on_success 

317 self._on_metadata_failure = on_metadata_failure 

318 self._on_ingest_failure = on_ingest_failure 

319 self.progress = Progress("obs.base.RawIngestTask") 

320 

321 # Import all the instrument classes so that we ensure that we 

322 # have all the relevant metadata translators loaded. 

323 Instrument.importAll(self.butler.registry) 

324 

325 def _reduce_kwargs(self) -> Dict[str, Any]: 

326 # Add extra parameters to pickle. 

327 return dict( 

328 **super()._reduce_kwargs(), 

329 butler=self.butler, 

330 on_success=self._on_success, 

331 on_metadata_failure=self._on_metadata_failure, 

332 on_ingest_failure=self._on_ingest_failure, 

333 ) 

334 

335 def _determine_instrument_formatter( 

336 self, dataId: DataCoordinate, filename: ResourcePath 

337 ) -> Tuple[Optional[Instrument], Type[Formatter]]: 

338 """Determine the instrument and formatter class. 

339 

340 Parameters 

341 ---------- 

342 dataId : `lsst.daf.butler.DataCoordinate` 

343 The dataId associated with this dataset. 

344 filename : `lsst.resources.ResourcePath` 

345 URI of file used for error reporting. 

346 

347 Returns 

348 ------- 

349 instrument : `Instrument` or `None` 

350 Instance of the `Instrument` associated with this dataset. `None` 

351 indicates that the instrument could not be determined. 

352 formatterClass : `type` 

353 Class to be used as the formatter for this dataset. 

354 """ 

355 # The data model currently assumes that whilst multiple datasets 

356 # can be associated with a single file, they must all share the 

357 # same formatter. 

358 try: 

359 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) # type: ignore 

360 except LookupError as e: 

361 self._on_metadata_failure(filename, e) 

362 self.log.warning( 

363 "Instrument %s for file %s not known to registry", dataId["instrument"], filename 

364 ) 

365 if self.config.failFast: 

366 raise RuntimeError( 

367 f"Instrument {dataId['instrument']} for file {filename} not known to registry" 

368 ) from e 

369 FormatterClass = Formatter 

370 # Indicate that we could not work out the instrument. 

371 instrument = None 

372 else: 

373 assert instrument is not None, "Should be guaranteed by fromName succeeding." 

374 FormatterClass = instrument.getRawFormatter(dataId) 

375 return instrument, FormatterClass 

376 

377 def extractMetadata(self, filename: ResourcePath) -> RawFileData: 

378 """Extract and process metadata from a single raw file. 

379 

380 Parameters 

381 ---------- 

382 filename : `lsst.resources.ResourcePath` 

383 URI to the file. 

384 

385 Returns 

386 ------- 

387 data : `RawFileData` 

388 A structure containing the metadata extracted from the file, 

389 as well as the original filename. All fields will be populated, 

390 but the `RawFileData.dataId` attribute will be a minimal 

391 (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The 

392 ``instrument`` field will be `None` if there is a problem 

393 with metadata extraction. 

394 

395 Notes 

396 ----- 

397 Assumes that there is a single dataset associated with the given 

398 file. Instruments using a single file to store multiple datasets 

399 must implement their own version of this method. 

400 

401 By default the method will catch all exceptions unless the ``failFast`` 

402 configuration item is `True`. If an error is encountered the 

403 `_on_metadata_failure()` method will be called. If no exceptions 

404 result and an error was encountered, the returned object will have 

405 a `None` instrument and no datasets. 

406 

407 This method supports sidecar JSON files which can be used to 

408 extract metadata without having to read the data file itself. 

409 The sidecar file is always used if found. 
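
        Examples
        --------
        A sketch of the sidecar convention described above, assuming ``task``
        is an existing `RawIngestTask` and the file path is illustrative:

        .. code-block:: python

            from lsst.resources import ResourcePath

            # For /data/raw_0001.fits, a sidecar at /data/raw_0001.json (the
            # same path with the extension replaced) is used if it exists.
            data = task.extractMetadata(ResourcePath("/data/raw_0001.fits"))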

410 """ 

411 sidecar_fail_msg = "" # Requires prepended space when set. 

412 try: 

413 sidecar_file = filename.updatedExtension(".json") 

414 if sidecar_file.exists(): 

415 content = json.loads(sidecar_file.read()) 

416 headers = [process_sidecar_data(content)] 

417 sidecar_fail_msg = " (via sidecar)" 

418 else: 

419 # Read the metadata from the data file itself. 

420 

421 # For remote files download the entire file to get the 

422 # header. This is very inefficient and it would be better 

423 # to have some way of knowing where in the file the headers 

424 # are and to only download those parts of the file. 

425 with filename.as_local() as local_file: 

426 # Read the primary. This might be sufficient. 

427 header = readMetadata(local_file.ospath, 0) 

428 

429 try: 

430 # Try to work out a translator class early. 

431 translator_class = MetadataTranslator.determine_translator( 

432 header, filename=str(filename) 

433 ) 

434 except ValueError: 

435 # Primary header was not sufficient (maybe this file 

436 # has been compressed or is a MEF with minimal 

437 # primary). Read second header and merge with primary. 

438 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite") 

439 

440 # Try again to work out a translator class, letting this 

441 # fail. 

442 translator_class = MetadataTranslator.determine_translator(header, filename=str(filename)) 

443 

444 # Request the headers to use for ingest 

445 headers = list(translator_class.determine_translatable_headers(local_file.ospath, header)) 

446 

447 # Add each header to the dataset list 

448 datasets = [self._calculate_dataset_info(h, filename) for h in headers] 

449 

450 except Exception as e: 

451 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

452 # Indicate to the caller that we failed to read. 

453 datasets = [] 

454 formatterClass = Formatter 

455 instrument = None 

456 self._on_metadata_failure(filename, e) 

457 if self.config.failFast: 

458 raise RuntimeError( 

459 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}" 

460 ) from e 

461 else: 

462 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

463 # The data model currently assumes that whilst multiple datasets 

464 # can be associated with a single file, they must all share the 

465 # same formatter. 

466 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

467 if instrument is None: 

468 datasets = [] 

469 

470 return RawFileData( 

471 datasets=datasets, 

472 filename=filename, 

473 # MyPy wants this to be a non-abstract class, which is not true 

474 # for the error case where instrument is None and datasets=[]. 

475 FormatterClass=formatterClass, # type: ignore 

476 instrument=instrument, 

477 ) 

478 

479 @classmethod 

480 def getObservationInfoSubsets(cls) -> Tuple[Set, Set]: 

481 """Return subsets of fields in the `ObservationInfo` that we care about 

482 

483 These fields will be used in constructing an exposure record. 

484 

485 Returns 

486 ------- 

487 required : `set` 

488 Set of `ObservationInfo` field names that are required. 

489 optional : `set` 

490 Set of `ObservationInfo` field names we will use if they are 

491 available. 
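
        Examples
        --------
        A sketch of how a subclass might extend these sets to use an extra
        `~astro_metadata_translator.ObservationInfo` property (the property
        name is illustrative only):

        .. code-block:: python

            class ExampleRawIngestTask(RawIngestTask):
                @classmethod
                def getObservationInfoSubsets(cls):
                    required, optional = super().getObservationInfoSubsets()
                    optional |= {"focus_z"}
                    return required, optional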

492 """ 

493 # Marking the new properties "group_counter_*" and 

494 # "has_simulated_content" as required, assumes that we either 

495 # recreate any existing index/sidecar files that include translated 

496 # values, or else allow astro_metadata_translator to fill in 

497 # defaults. 

498 required = { 

499 "datetime_begin", 

500 "datetime_end", 

501 "detector_num", 

502 "exposure_id", 

503 "exposure_time", 

504 "group_counter_end", 

505 "group_counter_start", 

506 "has_simulated_content", 

507 "instrument", 

508 "observation_id", 

509 "observation_type", 

510 "physical_filter", 

511 } 

512 optional = { 

513 "altaz_begin", 

514 "boresight_rotation_coord", 

515 "boresight_rotation_angle", 

516 "dark_time", 

517 "exposure_group", 

518 "tracking_radec", 

519 "object", 

520 "observation_counter", 

521 "observation_reason", 

522 "observing_day", 

523 "science_program", 

524 "visit_id", 

525 } 

526 return required, optional 

527 

528 def _calculate_dataset_info( 

529 self, header: Union[MutableMapping[str, Any], ObservationInfo], filename: ResourcePath 

530 ) -> RawFileDatasetInfo: 

531 """Calculate a RawFileDatasetInfo from the supplied information. 

532 

533 Parameters 

534 ---------- 

535 header : Mapping or `astro_metadata_translator.ObservationInfo` 

536 Header from the dataset or previously-translated content. 

537 filename : `lsst.resources.ResourcePath` 

538 Filename to use for error messages. 

539 

540 Returns 

541 ------- 

542 dataset : `RawFileDatasetInfo` 

543 The dataId, and observation information associated with this 

544 dataset. 

545 """ 

546 required, optional = self.getObservationInfoSubsets() 

547 if isinstance(header, ObservationInfo): 

548 obsInfo = header 

549 missing = [] 

550 # Need to check the required properties are present. 

551 for property in required: 

552 # getattr does not need to be protected because it is using 

553 # the defined list above containing properties that must exist. 

554 value = getattr(obsInfo, property) 

555 if value is None: 

556 missing.append(property) 

557 if missing: 

558 raise ValueError( 

559 f"Requested required properties are missing from file {filename}: {missing} (via JSON)" 

560 ) 

561 

562 else: 

563 obsInfo = ObservationInfo( 

564 header, 

565 pedantic=False, 

566 filename=str(filename), 

567 required=required, 

568 subset=required | optional, 

569 ) 

570 

571 dataId = DataCoordinate.standardize( 

572 instrument=obsInfo.instrument, 

573 exposure=obsInfo.exposure_id, 

574 detector=obsInfo.detector_num, 

575 universe=self.universe, 

576 ) 

577 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

578 

579 def locateAndReadIndexFiles( 

580 self, files: Iterable[ResourcePath] 

581 ) -> Tuple[Dict[ResourcePath, Any], List[ResourcePath], Set[ResourcePath], Set[ResourcePath]]: 

582 """Given a list of files, look for index files and read them. 

583 

584 Index files can either be explicitly in the list of files to 

585 ingest, or else located in the same directory as a file to ingest. 

586 Index entries are always used if present. 

587 

588 Parameters 

589 ---------- 

590 files : iterable over `lsst.resources.ResourcePath` 

591 URIs to the files to be ingested. 

592 

593 Returns 

594 ------- 

595 index : `dict` [`ResourcePath`, Any] 

596 Merged contents of all relevant index files found. These can 

597 be explicitly specified index files or ones found in the 

598 directory alongside a data file to be ingested. 

599 updated_files : `list` of `ResourcePath` 

600 Updated list of the input files with entries removed that were 

601 found listed in an index file. Order is not guaranteed to 

602 match the order of the files given to this routine. 

603 good_index_files : `set` [ `ResourcePath` ] 

604 Index files that were successfully read. 

605 bad_index_files : `set` [ `ResourcePath` ] 

606 Files that looked like index files but failed to read properly. 
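
        Examples
        --------
        A sketch of calling this method directly, assuming ``task`` is an
        existing `RawIngestTask` and the data path is illustrative:

        .. code-block:: python

            from lsst.resources import ResourcePath

            index, files, good, bad = task.locateAndReadIndexFiles(
                [ResourcePath("/data/night1/raw_0001.fits")]
            )
            # Entries from /data/night1/_index.json (if present) appear in
            # ``index``, and files covered by it are removed from ``files``.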

607 """ 

608 # Convert the paths to absolute for easy comparison with index content. 

609 # Do not convert to real paths since we have to assume that index 

610 # files are in this location and not the location which it links to. 

611 files = tuple(f.abspath() for f in files) 

612 

613 # Index files must be named this. 

614 index_root_file = "_index.json" 

615 

616 # Group the files by directory. 

617 files_by_directory = defaultdict(set) 

618 

619 for path in files: 

620 directory, file_in_dir = path.split() 

621 files_by_directory[directory].add(file_in_dir) 

622 

623 # All the metadata read from index files with keys of full path. 

624 index_entries: Dict[ResourcePath, Any] = {} 

625 

626 # Index files we failed to read. 

627 bad_index_files = set() 

628 

629 # Any good index files that were found and used. 

630 good_index_files = set() 

631 

632 # Look for index files in those directories. 

633 for directory, files_in_directory in files_by_directory.items(): 

634 possible_index_file = directory.join(index_root_file) 

635 if possible_index_file.exists(): 

636 # If we are explicitly requesting an index file the 

637 # messages should be different. 

638 index_msg = "inferred" 

639 is_implied = True 

640 if index_root_file in files_in_directory: 

641 index_msg = "explicit" 

642 is_implied = False 

643 

644 # Try to read the index file and catch and report any 

645 # problems. 

646 try: 

647 content = json.loads(possible_index_file.read()) 

648 index = process_index_data(content, force_dict=True) 

649 # mypy should in theory know that this is a mapping 

650 # from the overload type annotation of process_index_data. 

651 assert isinstance(index, MutableMapping) 

652 except Exception as e: 

653 # Only trigger the callback if the index file 

654 # was asked for explicitly. Triggering on implied file 

655 # might be surprising. 

656 if not is_implied: 

657 self._on_metadata_failure(possible_index_file, e) 

658 if self.config.failFast: 

659 raise RuntimeError( 

660 f"Problem reading index file from {index_msg} location {possible_index_file}" 

661 ) from e 

662 bad_index_files.add(possible_index_file) 

663 continue 

664 

665 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

666 good_index_files.add(possible_index_file) 

667 

668 # Go through the index adding entries for files. 

669 # If we have non-index files in this directory marked for 

670 # ingest we should only get index information for those. 

671 # If the index file was explicit we use all entries. 

672 if is_implied: 

673 files_to_ingest = files_in_directory 

674 else: 

675 files_to_ingest = set(index) 

676 

677 # Copy relevant metadata into a single dict for all index 

678 # entries. 

679 for file_in_dir in files_to_ingest: 

680 # Skip an explicitly specified index file. 

681 # This should never happen because an explicit index 

682 # file will force ingest of all files in the index 

683 # and not use the explicit file list. If somehow 

684 # this is not true we continue. Raising an exception 

685 # seems like the wrong thing to do since this is harmless. 

686 if file_in_dir == index_root_file: 

687 self.log.info( 

688 "Logic error found scanning directory %s. Please file ticket.", directory 

689 ) 

690 continue 

691 if file_in_dir in index: 

692 file = directory.join(file_in_dir) 

693 if file in index_entries: 

694 # ObservationInfo overrides raw metadata 

695 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance( 

696 index_entries[file], ObservationInfo 

697 ): 

698 self.log.warning( 

699 "File %s already specified in an index file but overriding" 

700 " with ObservationInfo content from %s", 

701 file, 

702 possible_index_file, 

703 ) 

704 else: 

705 self.log.warning( 

706 "File %s already specified in an index file, ignoring content from %s", 

707 file, 

708 possible_index_file, 

709 ) 

710 # Do nothing in this case 

711 continue 

712 

713 index_entries[file] = index[file_in_dir] 

714 

715 # Remove files from list that have index entries and also 

716 # any files that we determined to be explicit index files 

717 # or any index files that we failed to read. 

718 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

719 

720 # The filtered list loses the initial order. Retaining the order 

721 # is good for testing but does have a cost if there are many 

722 # files when copying the good values out. A dict would have faster 

723 # lookups (using the files as keys) but use more memory. 

724 ordered = [f for f in filtered if f in files] 

725 

726 return index_entries, ordered, good_index_files, bad_index_files 

727 

728 def processIndexEntries(self, index_entries: Dict[ResourcePath, Any]) -> List[RawFileData]: 

729 """Convert index entries to RawFileData. 

730 

731 Parameters 

732 ---------- 

733 index_entries : `dict` [`ResourcePath`, Any] 

734 Dict indexed by name of file to ingest, with values of either 

735 raw metadata or translated 

736 `~astro_metadata_translator.ObservationInfo`. 

737 

738 Returns 

739 ------- 

740 data : `list` [ `RawFileData` ] 

741 Structures containing the metadata extracted from the file, 

742 as well as the original filename. All fields will be populated, 

743 but the `RawFileData.dataId` attributes will be minimal 

744 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. 

745 """ 

746 fileData = [] 

747 for filename, metadata in index_entries.items(): 

748 try: 

749 datasets = [self._calculate_dataset_info(metadata, filename)] 

750 except Exception as e: 

751 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e) 

752 datasets = [] 

753 formatterClass = Formatter 

754 instrument = None 

755 self._on_metadata_failure(filename, e) 

756 if self.config.failFast: 

757 raise RuntimeError( 

758 f"Problem extracting metadata for file {filename} found in index file" 

759 ) from e 

760 else: 

761 instrument, formatterClass = self._determine_instrument_formatter( 

762 datasets[0].dataId, filename 

763 ) 

764 if instrument is None: 

765 datasets = [] 

766 fileData.append( 

767 RawFileData( 

768 datasets=datasets, 

769 filename=filename, 

770 # MyPy wants this to be a non-abstract class, which is not 

771 # true for the error case where instrument is None and 

772 # datasets=[]. 

773 FormatterClass=formatterClass, # type: ignore 

774 instrument=instrument, 

775 ) 

776 ) 

777 return fileData 

778 

779 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]: 

780 """Group an iterable of `RawFileData` by exposure. 

781 

782 Parameters 

783 ---------- 

784 files : iterable of `RawFileData` 

785 File-level information to group. 

786 

787 Returns 

788 ------- 

789 exposures : `list` of `RawExposureData` 

790 A list of structures that group the file-level information by 

791 exposure. All fields will be populated. The 

792 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

793 `~lsst.daf.butler.DataCoordinate` instances. 

794 """ 

795 exposureDimensions = self.universe["exposure"].graph 

796 byExposure = defaultdict(list) 

797 for f in files: 

798 # Assume that the first dataset is representative for the file. 

799 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

800 

801 return [ 

802 RawExposureData( 

803 dataId=dataId, 

804 files=exposureFiles, 

805 universe=self.universe, 

806 record=self.makeExposureRecord(exposureFiles[0].datasets[0].obsInfo, self.universe), 

807 dependencyRecords=self.makeDependencyRecords( 

808 exposureFiles[0].datasets[0].obsInfo, self.universe 

809 ), 

810 ) 

811 for dataId, exposureFiles in byExposure.items() 

812 ] 

813 

814 def makeExposureRecord( 

815 self, obsInfo: ObservationInfo, universe: DimensionUniverse, **kwargs: Any 

816 ) -> DimensionRecord: 

817 """Construct a registry record for an exposure 

818 

819 This is a method that subclasses will often want to customize. This can 

820 often be done by calling this base class implementation with additional 

821 ``kwargs``. 

822 

823 Parameters 

824 ---------- 

825 obsInfo : `ObservationInfo` 

826 Observation details for (one of the components of) the exposure. 

827 universe : `DimensionUniverse` 

828 Set of all known dimensions. 

829 **kwargs 

830 Additional field values for this record. 

831 

832 Returns 

833 ------- 

834 record : `DimensionRecord` 

835 The exposure record that must be inserted into the 

836 `~lsst.daf.butler.Registry` prior to file-level ingest. 
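
        Examples
        --------
        A sketch of the customization pattern described above, in which a
        subclass forwards extra record fields via ``kwargs`` (the field name
        ``extra_field`` is purely illustrative and not defined by this module):

        .. code-block:: python

            class ExampleRawIngestTask(RawIngestTask):
                def makeExposureRecord(self, obsInfo, universe, **kwargs):
                    # Supply an instrument-specific field value here.
                    return super().makeExposureRecord(
                        obsInfo, universe, extra_field=42, **kwargs
                    )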

837 """ 

838 return makeExposureRecordFromObsInfo(obsInfo, universe, **kwargs) 

839 

840 def makeDependencyRecords( 

841 self, obsInfo: ObservationInfo, universe: DimensionUniverse 

842 ) -> Dict[str, DimensionRecord]: 

843 """Construct dependency records 

844 

845 These dependency records will be inserted into the 

846 `~lsst.daf.butler.Registry` before the exposure records, because they 

847 are dependencies of the exposure. This allows an opportunity to satisfy 

848 foreign key constraints that exist because of dimensions related to the 

849 exposure. 

850 

851 This is a method that subclasses may want to customize, if they've 

852 added dimensions that relate to an exposure. 

853 

854 Parameters 

855 ---------- 

856 obsInfo : `ObservationInfo` 

857 Observation details for (one of the components of) the exposure. 

858 universe : `DimensionUniverse` 

859 Set of all known dimensions. 

860 

861 Returns 

862 ------- 

863 records : `dict` [`str`, `DimensionRecord`] 

864 The records to insert, indexed by dimension name. 

865 """ 

866 return {} 

867 

868 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

869 """Expand the data IDs associated with a raw exposure. 

870 

871 This adds the metadata records. 

872 

873 Parameters 

874 ---------- 

875 data : `RawExposureData` 

876 A structure containing information about the exposure to be 

877 ingested. Must have `RawExposureData.record` populated. Should 

878 be considered consumed upon return. 

879 

880 Returns 

881 ------- 

882 exposure : `RawExposureData` 

883 An updated version of the input structure, with 

884 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

885 updated to data IDs for which 

886 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

887 """ 

888 # We start by expanding the exposure-level data ID; we won't use that 

889 # directly in file ingest, but this lets us do some database lookups 

890 # once per exposure instead of once per file later. 

891 data.dataId = self.butler.registry.expandDataId( 

892 data.dataId, 

893 # We pass in the records we'll be inserting shortly so they aren't 

894 # looked up from the database. We do expect instrument and filter 

895 # records to be retrieved from the database here (though the 

896 # Registry may cache them so there isn't a lookup every time). 

897 records={"exposure": data.record}, 

898 ) 

899 # Now we expand the per-file (exposure+detector) data IDs. This time 

900 # we pass in the records we just retrieved from the exposure data ID 

901 # expansion. 

902 for file in data.files: 

903 for dataset in file.datasets: 

904 dataset.dataId = self.butler.registry.expandDataId( 

905 dataset.dataId, records=data.dataId.records 

906 ) 

907 return data 

908 

909 def prep( 

910 self, files: Iterable[ResourcePath], *, pool: Optional[PoolType] = None, processes: int = 1 

911 ) -> Tuple[Iterator[RawExposureData], List[ResourcePath]]: 

912 """Perform all non-database-updating ingest preprocessing steps. 

913 

914 Parameters 

915 ---------- 

916 files : iterable over `lsst.resources.ResourcePath` 

917 Paths to the files to be ingested. Will be made absolute 

918 if they are not already. 

919 pool : `multiprocessing.Pool`, optional 

920 If not `None`, a process pool with which to parallelize some 

921 operations. 

922 processes : `int`, optional 

923 The number of processes to use. Ignored if ``pool`` is not `None`. 

924 

925 Returns 

926 ------- 

927 exposures : `Iterator` [ `RawExposureData` ] 

928 Data structures containing dimension records, filenames, and data 

929 IDs to be ingested (one structure for each exposure). 

930 bad_files : `list` of `lsst.resources.ResourcePath` 

931 List of all the files that could not have metadata extracted. 

932 """ 

933 if pool is None and processes > 1: 

934 pool = Pool(processes) 

935 mapFunc = map if pool is None else pool.imap_unordered 

936 

937 def _partition_good_bad( 

938 file_data: Iterable[RawFileData], 

939 ) -> Tuple[List[RawFileData], List[ResourcePath]]: 

940 """Filter out bad files and return good with list of bad.""" 

941 good_files = [] 

942 bad_files = [] 

943 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata"): 

944 if not fileDatum.datasets: 

945 bad_files.append(fileDatum.filename) 

946 else: 

947 good_files.append(fileDatum) 

948 return good_files, bad_files 

949 

950 # Look for index files and read them. 

951 # There should be far fewer index files than data files. 

952 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

953 if bad_index_files: 

954 self.log.info("Failed to read the following explicitly requested index files:") 

955 for bad in sorted(bad_index_files): 

956 self.log.info("- %s", bad) 

957 

958 # Now convert all the index file entries to standard form for ingest. 

959 processed_bad_index_files: List[ResourcePath] = [] 

960 indexFileData = self.processIndexEntries(index_entries) 

961 if indexFileData: 

962 indexFileData, processed_bad_index_files = _partition_good_bad(indexFileData) 

963 self.log.info( 

964 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s", 

965 *_log_msg_counter(indexFileData), 

966 *_log_msg_counter(good_index_files), 

967 *_log_msg_counter(processed_bad_index_files), 

968 ) 

969 

970 # Extract metadata and build per-detector regions. 

971 # This could run in a subprocess so collect all output 

972 # before looking at failures. 

973 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

974 

975 # Filter out all the failed reads and store them for later 

976 # reporting. 

977 good_file_data, bad_files = _partition_good_bad(fileData) 

978 self.log.info( 

979 "Successfully extracted metadata from %d file%s with %d failure%s", 

980 *_log_msg_counter(good_file_data), 

981 *_log_msg_counter(bad_files), 

982 ) 

983 

984 # Combine with data from index files. 

985 good_file_data.extend(indexFileData) 

986 bad_files.extend(processed_bad_index_files) 

987 bad_files.extend(bad_index_files) 

988 

989 # Use that metadata to group files (and extracted metadata) by 

990 # exposure. Never parallelized because it's intrinsically a gather 

991 # step. 

992 exposureData: List[RawExposureData] = self.groupByExposure(good_file_data) 

993 

994 # The next operation operates on RawExposureData instances (one at 

995 # a time) in-place and then returns the modified instance. We call it 

996 # as a pass-through instead of relying on the arguments we pass in to 

997 # have been modified because in the parallel case those arguments are 

998 # going to be pickled and unpickled, and I'm not certain 

999 # multiprocessing is careful enough with that for output arguments to 

1000 # work. 

1001 

1002 # Expand the data IDs to include all dimension metadata; we need this 

1003 # because we may need to generate path templates that rely on that 

1004 # metadata. 

1005 # This is the first step that involves actual database calls (but just 

1006 # SELECTs), so if there's going to be a problem with connections vs. 

1007 # multiple processes, or lock contention (in SQLite) slowing things 

1008 # down, it'll happen here. 

1009 return mapFunc(self.expandDataIds, exposureData), bad_files 

1010 

1011 def ingestExposureDatasets( 

1012 self, 

1013 exposure: RawExposureData, 

1014 datasetType: DatasetType, 

1015 *, 

1016 run: Optional[str] = None, 

1017 skip_existing_exposures: bool = False, 

1018 track_file_attrs: bool = True, 

1019 ) -> List[FileDataset]: 

1020 """Ingest all raw files in one exposure. 

1021 

1022 Parameters 

1023 ---------- 

1024 exposure : `RawExposureData` 

1025 A structure containing information about the exposure to be 

1026 ingested. Must have `RawExposureData.record` populated and all 

1027 data ID attributes expanded. 

1028 datasetType : `DatasetType` 

1029 The dataset type associated with this exposure. 

1030 run : `str`, optional 

1031 Name of a RUN-type collection to write to, overriding 

1032 ``self.butler.run``. 

1033 skip_existing_exposures : `bool`, optional 

1034 If `True` (`False` is default), skip raws that have already been 

1035 ingested (i.e. raws for which we already have a dataset with the 

1036 same data ID in the target collection, even if from another file). 

1037 Note that this is much slower than just not passing 

1038 already-ingested files as inputs, because we still need to read and 

1039 process metadata to identify which exposures to search for. It 

1040 also will not work reliably if multiple processes are attempting to 

1041 ingest raws from the same exposure concurrently, in that different 

1042 processes may still attempt to ingest the same raw and conflict, 

1043 causing a failure that prevents other raws from the same exposure 

1044 from being ingested. 

1045 track_file_attrs : `bool`, optional 

1046 Control whether file attributes such as the size or checksum should 

1047 be tracked by the datastore. Whether this parameter is honored 

1048 depends on the specific datastore implementation. 

1049 

1050 Returns 

1051 ------- 

1052 datasets : `list` of `lsst.daf.butler.FileDataset` 

1053 Per-file structures identifying the files ingested and their 

1054 dataset representation in the data repository. 

1055 """ 

1056 if skip_existing_exposures: 

1057 existing = { 

1058 ref.dataId 

1059 for ref in self.butler.registry.queryDatasets( 

1060 datasetType, 

1061 collections=[run], 

1062 dataId=exposure.dataId, 

1063 ) 

1064 } 

1065 else: 

1066 existing = set() 

1067 datasets = [] 

1068 for file in exposure.files: 

1069 with warnings.catch_warnings(): 

1070 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

1071 refs = [DatasetRef(datasetType, d.dataId) for d in file.datasets if d.dataId not in existing] 

1072 if refs: 

1073 datasets.append( 

1074 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass) 

1075 ) 

1076 

1077 # Raw files are preferentially ingested using a UUID derived from 

1078 # the collection name and dataId. 

1079 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN): 

1080 mode = DatasetIdGenEnum.DATAID_TYPE_RUN 

1081 else: 

1082 mode = DatasetIdGenEnum.UNIQUE 

1083 self.butler.ingest( 

1084 *datasets, 

1085 transfer=self.config.transfer, 

1086 run=run, 

1087 idGenerationMode=mode, 

1088 record_validation_info=track_file_attrs, 

1089 ) 

1090 return datasets 

1091 

1092 def ingestFiles( 

1093 self, 

1094 files: Iterable[ResourcePath], 

1095 *, 

1096 pool: Optional[PoolType] = None, 

1097 processes: int = 1, 

1098 run: Optional[str] = None, 

1099 skip_existing_exposures: bool = False, 

1100 update_exposure_records: bool = False, 

1101 track_file_attrs: bool = True, 

1102 ) -> Tuple[List[DatasetRef], List[ResourcePath], int, int, int]: 

1103 """Ingest files into a Butler data repository. 

1104 

1105 This creates any new exposure or visit Dimension entries needed to 

1106 identify the ingested files, creates new Dataset entries in the 

1107 Registry and finally ingests the files themselves into the Datastore. 

1108 Any needed instrument, detector, and physical_filter Dimension entries 

1109 must exist in the Registry before `run` is called. 

1110 

1111 Parameters 

1112 ---------- 

1113 files : iterable over `lsst.resources.ResourcePath` 

1114 URIs to the files to be ingested. 

1115 pool : `multiprocessing.Pool`, optional 

1116 If not `None`, a process pool with which to parallelize some 

1117 operations. 

1118 processes : `int`, optional 

1119 The number of processes to use. Ignored if ``pool`` is not `None`. 

1120 run : `str`, optional 

1121 Name of a RUN-type collection to write to, overriding 

1122 the default derived from the instrument name. 

1123 skip_existing_exposures : `bool`, optional 

1124 If `True` (`False` is default), skip raws that have already been 

1125 ingested (i.e. raws for which we already have a dataset with the 

1126 same data ID in the target collection, even if from another file). 

1127 Note that this is much slower than just not passing 

1128 already-ingested files as inputs, because we still need to read and 

1129 process metadata to identify which exposures to search for. It 

1130 also will not work reliably if multiple processes are attempting to 

1131 ingest raws from the same exposure concurrently, in that different 

1132 processes may still attempt to ingest the same raw and conflict, 

1133 causing a failure that prevents other raws from the same exposure 

1134 from being ingested. 

1135 update_exposure_records : `bool`, optional 

1136 If `True` (`False` is default), update existing exposure records 

1137 that conflict with the new ones instead of rejecting them. THIS IS 

1138 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1139 KNOWN TO BE BAD. This should usually be combined with 

1140 ``skip_existing_exposures=True``. 

1141 track_file_attrs : `bool`, optional 

1142 Control whether file attributes such as the size or checksum should 

1143 be tracked by the datastore. Whether this parameter is honored 

1144 depends on the specific datastore implementation. 

1145 

1146 Returns 

1147 ------- 

1148 refs : `list` of `lsst.daf.butler.DatasetRef` 

1149 Dataset references for ingested raws. 

1150 bad_files : `list` of `ResourcePath` 

1151 Given paths that could not be ingested. 

1152 n_exposures : `int` 

1153 Number of exposures successfully ingested. 

1154 n_exposures_failed : `int` 

1155 Number of exposures that failed when inserting dimension data. 

1156 n_ingests_failed : `int` 

1157 Number of exposures that failed when ingesting raw datasets. 

1158 """ 

1159 

1160 exposureData, bad_files = self.prep(files, pool=pool, processes=processes) 

1161 

1162 # Up to this point, we haven't modified the data repository at all. 

1163 # Now we finally do that, with one transaction per exposure. This is 

1164 # not parallelized at present because the performance of this step is 

1165 # limited by the database server. That may or may not change in the 

1166 # future once we increase our usage of bulk inserts and reduce our 

1167 # usage of savepoints; we've tried to get everything but the database 

1168 # operations done in advance to reduce the time spent inside 

1169 # transactions. 

1170 refs = [] 

1171 runs = set() 

1172 datasetTypes: dict[str, DatasetType] = {} 

1173 n_exposures = 0 

1174 n_exposures_failed = 0 

1175 n_ingests_failed = 0 

1176 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"): 

1177 assert exposure.record is not None, "Should be guaranteed by prep()" 

1178 self.log.debug( 

1179 "Attempting to ingest %d file%s from exposure %s:%s", 

1180 *_log_msg_counter(exposure.files), 

1181 exposure.record.instrument, 

1182 exposure.record.obs_id, 

1183 ) 

1184 

1185 try: 

1186 for name, record in exposure.dependencyRecords.items(): 

1187 self.butler.registry.syncDimensionData(name, record, update=update_exposure_records) 

1188 inserted_or_updated = self.butler.registry.syncDimensionData( 

1189 "exposure", 

1190 exposure.record, 

1191 update=update_exposure_records, 

1192 ) 

1193 except Exception as e: 

1194 self._on_ingest_failure(exposure, e) 

1195 n_exposures_failed += 1 

1196 self.log.warning( 

1197 "Exposure %s:%s could not be registered: %s", 

1198 exposure.record.instrument, 

1199 exposure.record.obs_id, 

1200 e, 

1201 ) 

1202 if self.config.failFast: 

1203 raise e 

1204 continue 

1205 

1206 if isinstance(inserted_or_updated, dict): 

1207 # Exposure is in the registry and we updated it, so 

1208 # syncDimensionData returned a dict. 

1209 self.log.info( 

1210 "Exposure %s:%s was already present, but columns %s were updated.", 

1211 exposure.record.instrument, 

1212 exposure.record.obs_id, 

1213 str(list(inserted_or_updated.keys())), 

1214 ) 

1215 

1216 # Determine the instrument so we can work out the dataset type. 

1217 instrument = exposure.files[0].instrument 

1218 assert ( 

1219 instrument is not None 

1220 ), "file should have been removed from this list by prep if instrument could not be found" 

1221 

1222 if raw_definition := getattr(instrument, "raw_definition", None): 

1223 datasetTypeName, dimensions, storageClass = raw_definition 

1224 if not (datasetType := datasetTypes.get(datasetTypeName)): 

1225 datasetType = DatasetType( 

1226 datasetTypeName, dimensions, storageClass, universe=self.butler.registry.dimensions 

1227 ) 

1228 else: 

1229 datasetType = self.datasetType 

1230 if datasetType.name not in datasetTypes: 

1231 self.butler.registry.registerDatasetType(datasetType) 

1232 datasetTypes[datasetType.name] = datasetType 

1233 

1234 # Override default run if nothing specified explicitly. 

1235 if run is None: 

1236 this_run = instrument.makeDefaultRawIngestRunName() 

1237 else: 

1238 this_run = run 

1239 if this_run not in runs: 

1240 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

1241 runs.add(this_run) 

1242 try: 

1243 datasets_for_exposure = self.ingestExposureDatasets( 

1244 exposure, 

1245 datasetType=datasetType, 

1246 run=this_run, 

1247 skip_existing_exposures=skip_existing_exposures, 

1248 track_file_attrs=track_file_attrs, 

1249 ) 

1250 except Exception as e: 

1251 self._on_ingest_failure(exposure, e) 

1252 n_ingests_failed += 1 

1253 self.log.warning("Failed to ingest the following for reason: %s", e) 

1254 for f in exposure.files: 

1255 self.log.warning("- %s", f.filename) 

1256 if self.config.failFast: 

1257 raise e 

1258 continue 

1259 else: 

1260 self._on_success(datasets_for_exposure) 

1261 for dataset in datasets_for_exposure: 

1262 refs.extend(dataset.refs) 

1263 

1264 # Success for this exposure. 

1265 n_exposures += 1 

1266 self.log.info( 

1267 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id 

1268 ) 

1269 

1270 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

1271 

1272 @timeMethod 

1273 def run( 

1274 self, 

1275 files: Iterable[ResourcePathExpression], 

1276 *, 

1277 pool: Optional[PoolType] = None, 

1278 processes: int = 1, 

1279 run: Optional[str] = None, 

1280 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", 

1281 group_files: bool = True, 

1282 skip_existing_exposures: bool = False, 

1283 update_exposure_records: bool = False, 

1284 track_file_attrs: bool = True, 

1285 ) -> List[DatasetRef]: 

1286 """Ingest files into a Butler data repository. 

1287 

1288 This creates any new exposure or visit Dimension entries needed to 

1289 identify the ingested files, creates new Dataset entries in the 

1290 Registry and finally ingests the files themselves into the Datastore. 

1291 Any needed instrument, detector, and physical_filter Dimension entries 

1292 must exist in the Registry before `run` is called. 

1293 

1294 Parameters 

1295 ---------- 

1296 files : iterable of `lsst.resources.ResourcePath`, `str` or path-like 

1297 Paths to the files to be ingested. Can refer to directories. 

1298 Will be made absolute if they are not already. 

1299 pool : `multiprocessing.Pool`, optional 

1300 If not `None`, a process pool with which to parallelize some 

1301 operations. 

1302 processes : `int`, optional 

1303 The number of processes to use. Ignored if ``pool`` is not `None`. 

1304 run : `str`, optional 

1305 Name of a RUN-type collection to write to, overriding 

1306 the default derived from the instrument name. 

1307 file_filter : `str` or `re.Pattern`, optional 

1308 Pattern to use to discover files to ingest within directories. 

1309 The default is to search for FITS files. The regex applies to 

1310 files within the directory. 

1311 group_files : `bool`, optional 

1312 Group files by directory if they have been discovered in 

1313 directories. Will not affect files explicitly provided. 

1314 skip_existing_exposures : `bool`, optional 

1315 If `True` (`False` is default), skip raws that have already been 

1316 ingested (i.e. raws for which we already have a dataset with the 

1317 same data ID in the target collection, even if from another file). 

1318 Note that this is much slower than just not passing 

1319 already-ingested files as inputs, because we still need to read and 

1320 process metadata to identify which exposures to search for. It 

1321 also will not work reliably if multiple processes are attempting to 

1322 ingest raws from the same exposure concurrently, in that different 

1323 processes may still attempt to ingest the same raw and conflict, 

1324 causing a failure that prevents other raws from the same exposure 

1325 from being ingested. 

1326 update_exposure_records : `bool`, optional 

1327 If `True` (`False` is default), update existing exposure records 

1328 that conflict with the new ones instead of rejecting them. THIS IS 

1329 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1330 KNOWN TO BE BAD. This should usually be combined with 

1331 ``skip_existing_exposures=True``. 

1332 track_file_attrs : `bool`, optional 

1333 Control whether file attributes such as the size or checksum should 

1334 be tracked by the datastore. Whether this parameter is honored 

1335 depends on the specific datastore implementation. 

1336 

1337 Returns 

1338 ------- 

1339 refs : `list` of `lsst.daf.butler.DatasetRef` 

1340 Dataset references for ingested raws. 

1341 

1342 Notes 

1343 ----- 

1344 This method inserts all datasets for an exposure within a transaction, 

1345 guaranteeing that partial exposures are never ingested. The exposure 

1346 dimension record is inserted with `Registry.syncDimensionData` first 

1347 (in its own transaction), which inserts only if a record with the same 

1348 primary key does not already exist. This allows different files within 

1349 the same exposure to be ingested in different runs. 
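
        Examples
        --------
        A hedged sketch of ingesting all FITS files found under a directory
        into an explicit RUN collection, assuming ``task`` is an existing
        `RawIngestTask` (paths and collection name are illustrative):

        .. code-block:: python

            refs = task.run(
                ["/path/to/raw_data/"],
                run="MyCam/raw/all",
                file_filter=r"\.fits$",
            )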

1350 """ 

1351 

1352 refs = [] 

1353 bad_files = [] 

1354 n_exposures = 0 

1355 n_exposures_failed = 0 

1356 n_ingests_failed = 0 

1357 if group_files: 

1358 for group in ResourcePath.findFileResources(files, file_filter, group_files): 

1359 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles( 

1360 group, 

1361 pool=pool, 

1362 processes=processes, 

1363 run=run, 

1364 skip_existing_exposures=skip_existing_exposures, 

1365 update_exposure_records=update_exposure_records, 

1366 track_file_attrs=track_file_attrs, 

1367 ) 

1368 refs.extend(new_refs) 

1369 bad_files.extend(bad) 

1370 n_exposures += n_exp 

1371 n_exposures_failed += n_exp_fail 

1372 n_ingests_failed += n_ingest_fail 

1373 else: 

1374 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

1375 ResourcePath.findFileResources(files, file_filter, group_files), 

1376 pool=pool, 

1377 processes=processes, 

1378 run=run, 

1379 skip_existing_exposures=skip_existing_exposures, 

1380 update_exposure_records=update_exposure_records, 

track_file_attrs=track_file_attrs, 

1381 ) 

1382 

1383 had_failure = False 

1384 

1385 if bad_files: 

1386 had_failure = True 

1387 self.log.warning("Could not extract observation metadata from the following:") 

1388 for f in bad_files: 

1389 self.log.warning("- %s", f) 

1390 

1391 self.log.info( 

1392 "Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1393 " registration and %d failure%s from file ingest.", 

1394 *_log_msg_counter(n_exposures), 

1395 *_log_msg_counter(n_exposures_failed), 

1396 *_log_msg_counter(n_ingests_failed), 

1397 ) 

1398 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1399 had_failure = True 

1400 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1401 

1402 if had_failure: 

1403 raise RuntimeError("Some failures encountered during ingestion") 

1404 

1405 return refs