Coverage for python/lsst/obs/base/ingest.py: 16%

321 statements  

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import json 

26import re 

27from collections import defaultdict 

28from dataclasses import InitVar, dataclass 

29from multiprocessing import Pool 

30from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple, Type, Union 

31 

32from astro_metadata_translator import MetadataTranslator, ObservationInfo, merge_headers 

33from astro_metadata_translator.indexing import process_index_data, process_sidecar_data 

34from lsst.afw.fits import readMetadata 

35from lsst.daf.butler import ( 

36 Butler, 

37 CollectionType, 

38 DataCoordinate, 

39 DatasetIdGenEnum, 

40 DatasetRef, 

41 DatasetType, 

42 DimensionRecord, 

43 DimensionUniverse, 

44 FileDataset, 

45 Formatter, 

46 Progress, 

47) 

48from lsst.pex.config import ChoiceField, Config, Field 

49from lsst.pipe.base import Task 

50from lsst.resources import ResourcePath 

51from lsst.utils.timer import timeMethod 

52 

53from ._fitsRawFormatterBase import FitsRawFormatterBase 

54from ._instrument import Instrument, makeExposureRecordFromObsInfo 

55 

56 

57def _do_nothing(*args, **kwargs) -> None: 

58 """Do nothing. 

59 

60 This is a function that accepts anything and does nothing. 

61 For use as a default in callback arguments. 

62 """ 

63 pass 

64 

65 

66def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]: 

67 """Count the iterable and return the count and plural modifier. 

68 

69 Parameters 

70 ---------- 

71 noun : Iterable or `int` 

72 Thing to count. If given an integer it is assumed to be the count 

73 to use to calculate modifier. 

74 

75 Returns 

76 ------- 

77 num : `int` 

78 Number of items found in ``noun``. 

79 modifier : `str` 

80 Character to add to the end of a string referring to these items 

81 to indicate whether it was a single item or not. Returns empty 

82 string if there is one item or "s" otherwise. 

83 

84 Examples 

85 -------- 

86 

87 .. code-block:: python 

88 

89 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

90 """ 

91 if isinstance(noun, int): 

92 num = noun 

93 else: 

94 num = len(noun) 

95 return num, "" if num == 1 else "s" 

96 

97 

98@dataclass 

99class RawFileDatasetInfo: 

100 """Information about a single dataset within a raw file.""" 

101 

102 dataId: DataCoordinate 

103 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

104 

105 obsInfo: ObservationInfo 

106 """Standardized observation metadata extracted directly from the file 

107 headers (`astro_metadata_translator.ObservationInfo`). 

108 """ 

109 

110 

111@dataclass 

112class RawFileData: 

113 """Information about a single raw file, used during ingest.""" 

114 

115 datasets: List[RawFileDatasetInfo] 

116 """The information describing each dataset within this raw file. 

117 (`list` of `RawFileDatasetInfo`) 

118 """ 

119 

120 filename: ResourcePath 

121 """URI of the file this information was extracted from (`str`). 

122 

123 This is the path prior to ingest, not the path after ingest. 

124 """ 

125 

126 FormatterClass: Type[FitsRawFormatterBase] 

127 """Formatter class that should be used to ingest this file (`type`; as 

128 subclass of `FitsRawFormatterBase`). 

129 """ 

130 

131 instrument: Optional[Instrument] 

132 """The `Instrument` instance associated with this file. Can be `None` 

133 if ``datasets`` is an empty list.""" 

134 

135 

136@dataclass 

137class RawExposureData: 

138 """Information about a complete raw exposure, used during ingest.""" 

139 

140 dataId: DataCoordinate 

141 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

142 """ 

143 

144 files: List[RawFileData] 

145 """List of structures containing file-level information. 

146 """ 

147 

148 universe: InitVar[DimensionUniverse] 

149 """Set of all known dimensions. 

150 """ 

151 

152 record: Optional[DimensionRecord] = None 

153 """The exposure `DimensionRecord` that must be inserted into the 

154 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`). 

155 """ 

156 

157 def __post_init__(self, universe: DimensionUniverse): 

158 # We don't care which file or dataset we read metadata from, because 

159 # we're assuming they'll all be the same; just use the first ones. 

160 self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe) 

161 

162 

163def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"): 

164 """Create a Config field with options for transferring data between repos. 

165 

166 The allowed options for the field are exactly those supported by 

167 `lsst.daf.butler.Datastore.ingest`. 

168 

169 Parameters 

170 ---------- 

171 doc : `str` 

172 Documentation for the configuration field. 
default : `str`, optional 
    Default transfer mode for the created field. 

173 

174 Returns 

175 ------- 

176 field : `lsst.pex.config.ChoiceField` 

177 Configuration field. 
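
Examples
--------
A minimal sketch of how this helper is typically used inside a
`~lsst.pex.config.Config` subclass (the class name here is purely
illustrative):

.. code-block:: python

    from lsst.pex.config import Config

    class MyIngestConfig(Config):
        transfer = makeTransferChoiceField(default="copy")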

178 """ 

179 return ChoiceField( 

180 doc=doc, 

181 dtype=str, 

182 allowed={ 

183 "move": "move", 

184 "copy": "copy", 

185 "auto": "choice will depend on datastore", 

186 "direct": "use URI to ingested file directly in datastore", 

187 "link": "hard link falling back to symbolic link", 

188 "hardlink": "hard link", 

189 "symlink": "symbolic (soft) link", 

190 "relsymlink": "relative symbolic link", 

191 }, 

192 optional=True, 

193 default=default, 

194 ) 

195 

196 

197class RawIngestConfig(Config): 

198 """Configuration class for RawIngestTask.""" 

199 

200 transfer = makeTransferChoiceField() 

201 failFast = Field( 

202 dtype=bool, 

203 default=False, 

204 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

205 "Otherwise problems files will be skipped and logged and a report issued at completion.", 

206 ) 

207 

208 

209class RawIngestTask(Task): 

210 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

211 

212 Parameters 

213 ---------- 

214 config : `RawIngestConfig` 

215 Configuration for the task. 

216 butler : `~lsst.daf.butler.Butler` 

217 Writeable butler instance, with ``butler.run`` set to the appropriate 

218 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

219 datasets. 

220 on_success : `Callable`, optional 

221 A callback invoked when all of the raws associated with an exposure 

222 are ingested. Will be passed a list of `FileDataset` objects, each 

223 containing one or more resolved `DatasetRef` objects. If this callback 

224 raises it will interrupt the entire ingest process, even if 

225 `RawIngestConfig.failFast` is `False`. 

226 on_metadata_failure : `Callable`, optional 

227 A callback invoked when a failure occurs trying to translate the 

228 metadata for a file. Will be passed the URI and the exception, in 

229 that order, as positional arguments. Guaranteed to be called in an 

230 ``except`` block, allowing the callback to re-raise or replace (with 

231 ``raise ... from``) to override the task's usual error handling (before 

232 `RawIngestConfig.failFast` logic occurs). 

233 on_ingest_failure : `Callable`, optional 

234 A callback invoked when dimension record or dataset insertion into the 

235 database fails for an exposure. Will be passed a `RawExposureData` 

236 instance and the exception, in that order, as positional arguments. 

237 Guaranteed to be called in an ``except`` block, allowing the callback 

238 to re-raise or replace (with ``raise ... from``) to override the task's 

239 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

240 **kwargs 

241 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

242 constructor. 

243 

244 Notes 

245 ----- 

246 Each instance of `RawIngestTask` writes to the same Butler. Each 

247 invocation of `RawIngestTask.run` ingests a list of files. 
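
Examples
--------
A minimal usage sketch. The repository path, raw-file location, and the
callback are illustrative assumptions rather than required values:

.. code-block:: python

    from lsst.daf.butler import Butler
    from lsst.obs.base import RawIngestConfig, RawIngestTask

    def report_success(datasets):
        # Called once per ingested exposure with its FileDataset objects.
        print(f"Ingested {len(datasets)} file dataset(s)")

    butler = Butler("/path/to/repo", writeable=True)
    task = RawIngestTask(
        config=RawIngestConfig(),
        butler=butler,
        on_success=report_success,
    )
    refs = task.run(["/path/to/raw/files/"])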

248 """ 

249 

250 ConfigClass = RawIngestConfig 

251 

252 _DefaultName = "ingest" 

253 

254 def getDatasetType(self): 

255 """Return the DatasetType of the datasets ingested by this Task.""" 

256 return DatasetType( 

257 "raw", 

258 ("instrument", "detector", "exposure"), 

259 "Exposure", 

260 universe=self.butler.registry.dimensions, 

261 ) 

262 

263 def __init__( 

264 self, 

265 config: Optional[RawIngestConfig] = None, 

266 *, 

267 butler: Butler, 

268 on_success: Callable[[List[FileDataset]], Any] = _do_nothing, 

269 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing, 

270 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

271 **kwargs: Any, 

272 ): 

273 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

274 super().__init__(config, **kwargs) 

275 self.butler = butler 

276 self.universe = self.butler.registry.dimensions 

277 self.datasetType = self.getDatasetType() 

278 self._on_success = on_success 

279 self._on_metadata_failure = on_metadata_failure 

280 self._on_ingest_failure = on_ingest_failure 

281 self.progress = Progress("obs.base.RawIngestTask") 

282 

283 # Import all the instrument classes so that we ensure that we 

284 # have all the relevant metadata translators loaded. 

285 Instrument.importAll(self.butler.registry) 

286 

287 def _reduce_kwargs(self): 

288 # Add extra parameters to pickle. 

289 return dict( 

290 **super()._reduce_kwargs(), 

291 butler=self.butler, 

292 on_success=self._on_success, 

293 on_metadata_failure=self._on_metadata_failure, 

294 on_ingest_failure=self._on_ingest_failure, 

295 ) 

296 

297 def _determine_instrument_formatter(self, dataId, filename): 

298 """Determine the instrument and formatter class. 

299 

300 Parameters 

301 ---------- 

302 dataId : `lsst.daf.butler.DataCoordinate` 

303 The dataId associated with this dataset. 

304 filename : `lsst.resources.ResourcePath` 

305 URI of file used for error reporting. 

306 

307 Returns 

308 ------- 

309 instrument : `Instrument` or `None` 

310 Instance of the `Instrument` associated with this dataset. `None` 

311 indicates that the instrument could not be determined. 

312 formatterClass : `type` 

313 Class to be used as the formatter for this dataset. 

314 """ 

315 # The data model currently assumes that whilst multiple datasets 

316 # can be associated with a single file, they must all share the 

317 # same formatter. 

318 try: 

319 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) 

320 except LookupError as e: 

321 self._on_metadata_failure(filename, e) 

322 self.log.warning( 

323 "Instrument %s for file %s not known to registry", dataId["instrument"], filename 

324 ) 

325 if self.config.failFast: 

326 raise RuntimeError( 

327 f"Instrument {dataId['instrument']} for file {filename} not known to registry" 

328 ) from e 

329 FormatterClass = Formatter 

330 # Indicate that we could not work out the instrument. 

331 instrument = None 

332 else: 

333 FormatterClass = instrument.getRawFormatter(dataId) 

334 return instrument, FormatterClass 

335 

336 def extractMetadata(self, filename: ResourcePath) -> RawFileData: 

337 """Extract and process metadata from a single raw file. 

338 

339 Parameters 

340 ---------- 

341 filename : `lsst.resources.ResourcePath` 

342 URI to the file. 

343 

344 Returns 

345 ------- 

346 data : `RawFileData` 

347 A structure containing the metadata extracted from the file, 

348 as well as the original filename. All fields will be populated, 

349 but the data ID of each entry in `RawFileData.datasets` will be a 

350 minimal (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The 

351 ``instrument`` field will be `None` if there is a problem 

352 with metadata extraction. 

353 

354 Notes 

355 ----- 

356 Assumes that there is a single dataset associated with the given 

357 file. Instruments using a single file to store multiple datasets 

358 must implement their own version of this method. 

359 

360 By default the method will catch all exceptions unless the ``failFast`` 

361 configuration item is `True`. If an error is encountered the 

362 `_on_metadata_failure()` method will be called. If an error was 

363 encountered but no exception was raised, the returned object will 

364 have `instrument` set to `None` and an empty list of datasets. 

365 

366 This method supports sidecar JSON files which can be used to 

367 extract metadata without having to read the data file itself. 

368 The sidecar file is always used if found. 
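
Examples
--------
An illustrative sketch, assuming ``task`` is an already-constructed
`RawIngestTask` and the file path is a placeholder:

.. code-block:: python

    from lsst.resources import ResourcePath

    data = task.extractMetadata(ResourcePath("file:///data/raw/exp_000001.fits"))
    if not data.datasets:
        print(f"Metadata extraction failed for {data.filename}")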

369 """ 

370 sidecar_fail_msg = "" # Requires prepended space when set. 

371 try: 

372 sidecar_file = filename.updatedExtension(".json") 

373 if sidecar_file.exists(): 

374 content = json.loads(sidecar_file.read()) 

375 headers = [process_sidecar_data(content)] 

376 sidecar_fail_msg = " (via sidecar)" 

377 else: 

378 # Read the metadata from the data file itself. 

379 

380 # For remote files download the entire file to get the 

381 # header. This is very inefficient and it would be better 

382 # to have some way of knowing where in the file the headers 

383 # are and to only download those parts of the file. 

384 with filename.as_local() as local_file: 

385 # Read the primary. This might be sufficient. 

386 header = readMetadata(local_file.ospath, 0) 

387 

388 try: 

389 # Try to work out a translator class early. 

390 translator_class = MetadataTranslator.determine_translator(header, filename=filename) 

391 except ValueError: 

392 # Primary header was not sufficient (maybe this file 

393 # has been compressed or is a MEF with minimal 

394 # primary). Read second header and merge with primary. 

395 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite") 

396 

397 # Try again to work out a translator class, letting this 

398 # fail. 

399 translator_class = MetadataTranslator.determine_translator(header, filename=filename) 

400 

401 # Request the headers to use for ingest 

402 headers = translator_class.determine_translatable_headers(filename.ospath, header) 

403 

404 # Add each header to the dataset list 

405 datasets = [self._calculate_dataset_info(h, filename) for h in headers] 

406 

407 except Exception as e: 

408 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

409 # Indicate to the caller that we failed to read. 

410 datasets = [] 

411 formatterClass = Formatter 

412 instrument = None 

413 self._on_metadata_failure(filename, e) 

414 if self.config.failFast: 

415 raise RuntimeError( 

416 f"Problem extracting metadata for file {filename}{sidecar_fail_msg}" 

417 ) from e 

418 else: 

419 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

420 # The data model currently assumes that whilst multiple datasets 

421 # can be associated with a single file, they must all share the 

422 # same formatter. 

423 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

424 if instrument is None: 

425 datasets = [] 

426 

427 return RawFileData( 

428 datasets=datasets, filename=filename, FormatterClass=formatterClass, instrument=instrument 

429 ) 

430 

431 def _calculate_dataset_info(self, header, filename): 

432 """Calculate a RawFileDatasetInfo from the supplied information. 

433 

434 Parameters 

435 ---------- 

436 header : Mapping or `astro_metadata_translator.ObservationInfo` 

437 Header from the dataset or previously-translated content. 

438 filename : `lsst.resources.ResourcePath` 

439 Filename to use for error messages. 

440 

441 Returns 

442 ------- 

443 dataset : `RawFileDatasetInfo` 

444 The dataId, and observation information associated with this 

445 dataset. 

446 """ 

447 # To ensure we aren't slowed down for no reason, explicitly 

448 # list here the properties we need for the schema. 

449 # Use a dict with values a boolean where True indicates 

450 # that it is required that we calculate this property. 

451 ingest_subset = { 

452 "altaz_begin": False, 

453 "boresight_rotation_coord": False, 

454 "boresight_rotation_angle": False, 

455 "dark_time": False, 

456 "datetime_begin": True, 

457 "datetime_end": True, 

458 "detector_num": True, 

459 "exposure_group": False, 

460 "exposure_id": True, 

461 "exposure_time": True, 

462 "instrument": True, 

463 "tracking_radec": False, 

464 "object": False, 

465 "observation_counter": False, 

466 "observation_id": True, 

467 "observation_reason": False, 

468 "observation_type": True, 

469 "observing_day": False, 

470 "physical_filter": True, 

471 "science_program": False, 

472 "visit_id": False, 

473 } 

474 

475 if isinstance(header, ObservationInfo): 

476 obsInfo = header 

477 missing = [] 

478 # Need to check the required properties are present. 

479 for property, required in ingest_subset.items(): 

480 if not required: 

481 continue 

482 # getattr does not need to be protected because it is using 

483 # the defined list above containing properties that must exist. 

484 value = getattr(obsInfo, property) 

485 if value is None: 

486 missing.append(property) 

487 if missing: 

488 raise ValueError( 

489 f"Requested required properties are missing from file {filename}:" 

490 f" {missing} (via JSON)" 

491 ) 

492 

493 else: 

494 obsInfo = ObservationInfo( 

495 header, 

496 pedantic=False, 

497 filename=str(filename), 

498 required={k for k in ingest_subset if ingest_subset[k]}, 

499 subset=set(ingest_subset), 

500 ) 

501 

502 dataId = DataCoordinate.standardize( 

503 instrument=obsInfo.instrument, 

504 exposure=obsInfo.exposure_id, 

505 detector=obsInfo.detector_num, 

506 universe=self.universe, 

507 ) 

508 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

509 

510 def locateAndReadIndexFiles(self, files): 

511 """Given a list of files, look for index files and read them. 

512 

513 Index files can either be explicitly in the list of files to 

514 ingest, or else located in the same directory as a file to ingest. 

515 Index entries are always used if present. 

516 

517 Parameters 

518 ---------- 

519 files : iterable over `lsst.resources.ResourcePath` 

520 URIs to the files to be ingested. 

521 

522 Returns 

523 ------- 

524 index : `dict` [`str`, Any] 

525 Merged contents of all relevant index files found. These can 

526 be explicitly specified index files or ones found in the 

527 directory alongside a data file to be ingested. 

528 updated_files : iterable of `str` 

529 Updated list of the input files with entries removed that were 

530 found listed in an index file. Order is not guaranteed to 

531 match the order of the files given to this routine. 

good_index_files : `set` [`str`] 
    Index files that were successfully read and used. 

532 bad_index_files : `set` [`str`] 

533 Files that looked like index files but failed to read properly. 
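
Examples
--------
A sketch of how this method and `processIndexEntries` fit together
(mirroring their use in `prep`; ``task`` and ``files`` are assumed to
already exist):

.. code-block:: python

    index_entries, files, good_index, bad_index = task.locateAndReadIndexFiles(files)
    index_file_data = task.processIndexEntries(index_entries)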

534 """ 

535 # Convert the paths to absolute for easy comparison with index content. 

536 # Do not convert to real paths since we have to assume that index 

537 # files are in this location and not the location to which they link. 

538 files = tuple(f.abspath() for f in files) 

539 

540 # Index files must be named this. 

541 index_root_file = "_index.json" 

542 

543 # Group the files by directory. 

544 files_by_directory = defaultdict(set) 

545 

546 for path in files: 

547 directory, file_in_dir = path.split() 

548 files_by_directory[directory].add(file_in_dir) 

549 

550 # All the metadata read from index files with keys of full path. 

551 index_entries = {} 

552 

553 # Index files we failed to read. 

554 bad_index_files = set() 

555 

556 # Any good index files that were found and used. 

557 good_index_files = set() 

558 

559 # Look for index files in those directories. 

560 for directory, files_in_directory in files_by_directory.items(): 

561 possible_index_file = directory.join(index_root_file) 

562 if possible_index_file.exists(): 

563 # If we are explicitly requesting an index file the 

564 # messages should be different. 

565 index_msg = "inferred" 

566 is_implied = True 

567 if index_root_file in files_in_directory: 

568 index_msg = "explicit" 

569 is_implied = False 

570 

571 # Try to read the index file and catch and report any 

572 # problems. 

573 try: 

574 content = json.loads(possible_index_file.read()) 

575 index = process_index_data(content, force_dict=True) 

576 except Exception as e: 

577 # Only trigger the callback if the index file 

578 # was asked for explicitly. Triggering on implied file 

579 # might be surprising. 

580 if not is_implied: 

581 self._on_metadata_failure(possible_index_file, e) 

582 if self.config.failFast: 

583 raise RuntimeError( 

584 f"Problem reading index file from {index_msg} location {possible_index_file}" 

585 ) from e 

586 bad_index_files.add(possible_index_file) 

587 continue 

588 

589 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

590 good_index_files.add(possible_index_file) 

591 

592 # Go through the index adding entries for files. 

593 # If we have non-index files in this directory marked for 

594 # ingest we should only get index information for those. 

595 # If the index file was explicit we use all entries. 

596 if is_implied: 

597 files_to_ingest = files_in_directory 

598 else: 

599 files_to_ingest = set(index) 

600 

601 # Copy relevant metadata into a single dict for all index 

602 # entries. 

603 for file_in_dir in files_to_ingest: 

604 # Skip an explicitly specified index file. 

605 # This should never happen because an explicit index 

606 # file will force ingest of all files in the index 

607 # and not use the explicit file list. If somehow 

608 # this is not true we continue. Raising an exception 

609 # seems like the wrong thing to do since this is harmless. 

610 if file_in_dir == index_root_file: 

611 self.log.info( 

612 "Logic error found scanning directory %s. Please file ticket.", directory 

613 ) 

614 continue 

615 if file_in_dir in index: 

616 file = directory.join(file_in_dir) 

617 if file in index_entries: 

618 # ObservationInfo overrides raw metadata 

619 if isinstance(index[file_in_dir], ObservationInfo) and not isinstance( 

620 index_entries[file], ObservationInfo 

621 ): 

622 self.log.warning( 

623 "File %s already specified in an index file but overriding" 

624 " with ObservationInfo content from %s", 

625 file, 

626 possible_index_file, 

627 ) 

628 else: 

629 self.log.warning( 

630 "File %s already specified in an index file, ignoring content from %s", 

631 file, 

632 possible_index_file, 

633 ) 

634 # Do nothing in this case 

635 continue 

636 

637 index_entries[file] = index[file_in_dir] 

638 

639 # Remove files from list that have index entries and also 

640 # any files that we determined to be explicit index files 

641 # or any index files that we failed to read. 

642 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

643 

644 # The filtered list loses the initial order. Retaining the order 

645 # is good for testing but does have a cost if there are many 

646 # files when copying the good values out. A dict would have faster 

647 # lookups (using the files as keys) but use more memory. 

648 ordered = [f for f in files if f in filtered] 

649 

650 return index_entries, ordered, good_index_files, bad_index_files 

651 

652 def processIndexEntries(self, index_entries): 

653 """Convert index entries to RawFileData. 

654 

655 Parameters 

656 ---------- 

657 index_entries : `dict` [`str`, Any] 

658 Dict indexed by name of file to ingest and with keys either 

659 raw metadata or translated 

660 `~astro_metadata_translator.ObservationInfo`. 

661 

662 Returns 

663 ------- 

664 data : `list` of `RawFileData` 

665 Structures containing the metadata extracted from each file, 

666 as well as the original filenames. All fields will be populated, 

667 but the data ID of each entry in `RawFileData.datasets` will be a 

668 minimal (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. 

669 """ 

670 fileData = [] 

671 for filename, metadata in index_entries.items(): 

672 try: 

673 datasets = [self._calculate_dataset_info(metadata, filename)] 

674 except Exception as e: 

675 self.log.debug("Problem extracting metadata for file %s found in index file: %s", filename, e) 

676 datasets = [] 

677 formatterClass = Formatter 

678 instrument = None 

679 self._on_metadata_failure(filename, e) 

680 if self.config.failFast: 

681 raise RuntimeError( 

682 f"Problem extracting metadata for file {filename} found in index file" 

683 ) from e 

684 else: 

685 instrument, formatterClass = self._determine_instrument_formatter( 

686 datasets[0].dataId, filename 

687 ) 

688 if instrument is None: 

689 datasets = [] 

690 fileData.append( 

691 RawFileData( 

692 datasets=datasets, filename=filename, FormatterClass=formatterClass, instrument=instrument 

693 ) 

694 ) 

695 return fileData 

696 

697 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]: 

698 """Group an iterable of `RawFileData` by exposure. 

699 

700 Parameters 

701 ---------- 

702 files : iterable of `RawFileData` 

703 File-level information to group. 

704 

705 Returns 

706 ------- 

707 exposures : `list` of `RawExposureData` 

708 A list of structures that group the file-level information by 

709 exposure. All fields will be populated. The 

710 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

711 `~lsst.daf.butler.DataCoordinate` instances. 

712 """ 

713 exposureDimensions = self.universe["exposure"].graph 

714 byExposure = defaultdict(list) 

715 for f in files: 

716 # Assume that the first dataset is representative for the file. 

717 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

718 

719 return [ 

720 RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe) 

721 for dataId, exposureFiles in byExposure.items() 

722 ] 

723 

724 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

725 """Expand the data IDs associated with a raw exposure. 

726 

727 This adds the metadata records. 

728 

729 Parameters 

730 ---------- 

731 data : `RawExposureData` 

732 A structure containing information about the exposure to be 

733 ingested. Must have `RawExposureData.record` populated. Should 

734 be considered consumed upon return. 

735 

736 Returns 

737 ------- 

738 exposure : `RawExposureData` 

739 An updated version of the input structure, with 

740 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

741 updated to data IDs for which 

742 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

743 """ 

744 # We start by expanding the exposure-level data ID; we won't use that 

745 # directly in file ingest, but this lets us do some database lookups 

746 # once per exposure instead of once per file later. 

747 data.dataId = self.butler.registry.expandDataId( 

748 data.dataId, 

749 # We pass in the records we'll be inserting shortly so they aren't 

750 # looked up from the database. We do expect instrument and filter 

751 # records to be retrieved from the database here (though the 

752 # Registry may cache them so there isn't a lookup every time). 

753 records={ 

754 self.butler.registry.dimensions["exposure"]: data.record, 

755 }, 

756 ) 

757 # Now we expand the per-file (exposure+detector) data IDs. This time 

758 # we pass in the records we just retrieved from the exposure data ID 

759 # expansion. 

760 for file in data.files: 

761 for dataset in file.datasets: 

762 dataset.dataId = self.butler.registry.expandDataId( 

763 dataset.dataId, records=dict(data.dataId.records) 

764 ) 

765 return data 

766 

767 def prep( 

768 self, files, *, pool: Optional[Pool] = None, processes: int = 1 

769 ) -> Tuple[Iterator[RawExposureData], List[str]]: 

770 """Perform all non-database-updating ingest preprocessing steps. 

771 

772 Parameters 

773 ---------- 

774 files : iterable over `str` or path-like objects 

775 Paths to the files to be ingested. Will be made absolute 

776 if they are not already. 

777 pool : `multiprocessing.Pool`, optional 

778 If not `None`, a process pool with which to parallelize some 

779 operations. 

780 processes : `int`, optional 

781 The number of processes to use. Ignored if ``pool`` is not `None`. 

782 

783 Returns 

784 ------- 

785 exposures : `Iterator` [ `RawExposureData` ] 

786 Data structures containing dimension records, filenames, and data 

787 IDs to be ingested (one structure for each exposure). 

788 bad_files : `list` of `str` 

789 List of all the files that could not have metadata extracted. 

790 """ 

791 if pool is None and processes > 1: 

792 pool = Pool(processes) 

793 mapFunc = map if pool is None else pool.imap_unordered 

794 

795 def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]: 

796 """Filter out bad files and return good with list of bad.""" 

797 good_files = [] 

798 bad_files = [] 

799 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata", total=len(files)): 

800 if not fileDatum.datasets: 

801 bad_files.append(fileDatum.filename) 

802 else: 

803 good_files.append(fileDatum) 

804 return good_files, bad_files 

805 

806 # Look for index files and read them. 

807 # There should be far fewer index files than data files. 

808 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

809 if bad_index_files: 

810 self.log.info("Failed to read the following explicitly requested index files:"), 

811 for bad in sorted(bad_index_files): 

812 self.log.info("- %s", bad) 

813 

814 # Now convert all the index file entries to standard form for ingest. 

815 bad_index_file_data = [] 

816 indexFileData = self.processIndexEntries(index_entries) 

817 if indexFileData: 

818 indexFileData, bad_index_file_data = _partition_good_bad(indexFileData) 

819 self.log.info( 

820 "Successfully extracted metadata for %d file%s found in %d index file%s with %d failure%s", 

821 *_log_msg_counter(indexFileData), 

822 *_log_msg_counter(good_index_files), 

823 *_log_msg_counter(bad_index_file_data), 

824 ) 

825 

826 # Extract metadata and build per-detector regions. 

827 # This could run in a subprocess so collect all output 

828 # before looking at failures. 

829 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

830 

831 # Filter out all the failed reads and store them for later 

832 # reporting. 

833 fileData, bad_files = _partition_good_bad(fileData) 

834 self.log.info( 

835 "Successfully extracted metadata from %d file%s with %d failure%s", 

836 *_log_msg_counter(fileData), 

837 *_log_msg_counter(bad_files), 

838 ) 

839 

840 # Combine with data from index files. 

841 fileData.extend(indexFileData) 

842 bad_files.extend(bad_index_file_data) 

843 bad_files.extend(bad_index_files) 

844 

845 # Use that metadata to group files (and extracted metadata) by 

846 # exposure. Never parallelized because it's intrinsically a gather 

847 # step. 

848 exposureData: List[RawExposureData] = self.groupByExposure(fileData) 

849 

850 # The next operation operates on RawExposureData instances (one at 

851 # a time) in-place and then returns the modified instance. We call it 

852 # as a pass-through instead of relying on the arguments we pass in to 

853 # have been modified because in the parallel case those arguments are 

854 # going to be pickled and unpickled, and I'm not certain 

855 # multiprocessing is careful enough with that for output arguments to 

856 # work. 

857 

858 # Expand the data IDs to include all dimension metadata; we need this 

859 # because we may need to generate path templates that rely on that 

860 # metadata. 

861 # This is the first step that involves actual database calls (but just 

862 # SELECTs), so if there's going to be a problem with connections vs. 

863 # multiple processes, or lock contention (in SQLite) slowing things 

864 # down, it'll happen here. 

865 return mapFunc(self.expandDataIds, exposureData), bad_files 

866 

867 def ingestExposureDatasets( 

868 self, 

869 exposure: RawExposureData, 

870 *, 

871 run: Optional[str] = None, 

872 skip_existing_exposures: bool = False, 

873 track_file_attrs: bool = True, 

874 ) -> List[FileDataset]: 

875 """Ingest all raw files in one exposure. 

876 

877 Parameters 

878 ---------- 

879 exposure : `RawExposureData` 

880 A structure containing information about the exposure to be 

881 ingested. Must have `RawExposureData.record` populated and all 

882 data ID attributes expanded. 

883 run : `str`, optional 

884 Name of a RUN-type collection to write to, overriding 

885 ``self.butler.run``. 

886 skip_existing_exposures : `bool`, optional 

887 If `True` (`False` is default), skip raws that have already been 

888 ingested (i.e. raws for which we already have a dataset with the 

889 same data ID in the target collection, even if from another file). 

890 Note that this is much slower than just not passing 

891 already-ingested files as inputs, because we still need to read and 

892 process metadata to identify which exposures to search for. It 

893 also will not work reliably if multiple processes are attempting to 

894 ingest raws from the same exposure concurrently, in that different 

895 processes may still attempt to ingest the same raw and conflict, 

896 causing a failure that prevents other raws from the same exposure 

897 from being ingested. 

898 track_file_attrs : `bool`, optional 

899 Control whether file attributes such as the size or checksum should 

900 be tracked by the datastore. Whether this parameter is honored 

901 depends on the specific datastore implementation. 

902 

903 Returns 

904 ------- 

905 datasets : `list` of `lsst.daf.butler.FileDataset` 

906 Per-file structures identifying the files ingested and their 

907 dataset representation in the data repository. 

908 """ 

909 if skip_existing_exposures: 

910 existing = { 

911 ref.dataId 

912 for ref in self.butler.registry.queryDatasets( 

913 self.datasetType, 

914 collections=[run], 

915 dataId=exposure.dataId, 

916 ) 

917 } 

918 else: 

919 existing = set() 

920 datasets = [] 

921 for file in exposure.files: 

922 refs = [DatasetRef(self.datasetType, d.dataId) for d in file.datasets if d.dataId not in existing] 

923 if refs: 

924 datasets.append( 

925 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass) 

926 ) 

927 

928 # Raw files are preferentially ingested using a UUID derived from 

929 # the collection name and dataId. 

930 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN): 

931 mode = DatasetIdGenEnum.DATAID_TYPE_RUN 

932 else: 

933 mode = DatasetIdGenEnum.UNIQUE 

934 self.butler.ingest( 

935 *datasets, 

936 transfer=self.config.transfer, 

937 run=run, 

938 idGenerationMode=mode, 

939 record_validation_info=track_file_attrs, 

940 ) 

941 return datasets 

942 

943 def ingestFiles( 

944 self, 

945 files, 

946 *, 

947 pool: Optional[Pool] = None, 

948 processes: int = 1, 

949 run: Optional[str] = None, 

950 skip_existing_exposures: bool = False, 

951 update_exposure_records: bool = False, 

952 track_file_attrs: bool = True, 

953 ): 

954 """Ingest files into a Butler data repository. 

955 

956 This creates any new exposure or visit Dimension entries needed to 

957 identify the ingested files, creates new Dataset entries in the 

958 Registry and finally ingests the files themselves into the Datastore. 

959 Any needed instrument, detector, and physical_filter Dimension entries 

960 must exist in the Registry before `run` is called. 

961 

962 Parameters 

963 ---------- 

964 files : iterable over `lsst.resources.ResourcePath` 

965 URIs to the files to be ingested. 

966 pool : `multiprocessing.Pool`, optional 

967 If not `None`, a process pool with which to parallelize some 

968 operations. 

969 processes : `int`, optional 

970 The number of processes to use. Ignored if ``pool`` is not `None`. 

971 run : `str`, optional 

972 Name of a RUN-type collection to write to, overriding 

973 the default derived from the instrument name. 

974 skip_existing_exposures : `bool`, optional 

975 If `True` (`False` is default), skip raws that have already been 

976 ingested (i.e. raws for which we already have a dataset with the 

977 same data ID in the target collection, even if from another file). 

978 Note that this is much slower than just not passing 

979 already-ingested files as inputs, because we still need to read and 

980 process metadata to identify which exposures to search for. It 

981 also will not work reliably if multiple processes are attempting to 

982 ingest raws from the same exposure concurrently, in that different 

983 processes may still attempt to ingest the same raw and conflict, 

984 causing a failure that prevents other raws from the same exposure 

985 from being ingested. 

986 update_exposure_records : `bool`, optional 

987 If `True` (`False` is default), update existing exposure records 

988 that conflict with the new ones instead of rejecting them. THIS IS 

989 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

990 KNOWN TO BE BAD. This should usually be combined with 

991 ``skip_existing_exposures=True``. 

992 track_file_attrs : `bool`, optional 

993 Control whether file attributes such as the size or checksum should 

994 be tracked by the datastore. Whether this parameter is honored 

995 depends on the specific datastore implementation. 

996 

997 Returns 

998 ------- 

999 refs : `list` of `lsst.daf.butler.DatasetRef` 

1000 Dataset references for ingested raws. 
bad_files : `list` of `str` 
    Paths for which metadata extraction failed, including any 
    unreadable index files. 
n_exposures : `int` 
    Number of exposures successfully ingested. 
n_exposures_failed : `int` 
    Number of exposures whose dimension records could not be 
    registered. 
n_ingests_failed : `int` 
    Number of exposures whose raw datasets could not be ingested. 

1001 """ 

1002 

1003 exposureData, bad_files = self.prep(files, pool=pool, processes=processes) 

1004 

1005 # Up to this point, we haven't modified the data repository at all. 

1006 # Now we finally do that, with one transaction per exposure. This is 

1007 # not parallelized at present because the performance of this step is 

1008 # limited by the database server. That may or may not change in the 

1009 # future once we increase our usage of bulk inserts and reduce our 

1010 # usage of savepoints; we've tried to get everything but the database 

1011 # operations done in advance to reduce the time spent inside 

1012 # transactions. 

1013 self.butler.registry.registerDatasetType(self.datasetType) 

1014 

1015 refs = [] 

1016 runs = set() 

1017 n_exposures = 0 

1018 n_exposures_failed = 0 

1019 n_ingests_failed = 0 

1020 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"): 

1021 

1022 self.log.debug( 

1023 "Attempting to ingest %d file%s from exposure %s:%s", 

1024 *_log_msg_counter(exposure.files), 

1025 exposure.record.instrument, 

1026 exposure.record.obs_id, 

1027 ) 

1028 

1029 try: 

1030 inserted_or_updated = self.butler.registry.syncDimensionData( 

1031 "exposure", 

1032 exposure.record, 

1033 update=update_exposure_records, 

1034 ) 

1035 except Exception as e: 

1036 self._on_ingest_failure(exposure, e) 

1037 n_exposures_failed += 1 

1038 self.log.warning( 

1039 "Exposure %s:%s could not be registered: %s", 

1040 exposure.record.instrument, 

1041 exposure.record.obs_id, 

1042 e, 

1043 ) 

1044 if self.config.failFast: 

1045 raise e 

1046 continue 

1047 

1048 if isinstance(inserted_or_updated, dict): 

1049 # Exposure is in the registry and we updated it, so 

1050 # syncDimensionData returned a dict. 

1051 self.log.info( 

1052 "Exposure %s:%s was already present, but columns %s were updated.", 

1053 exposure.record.instrument, 

1054 exposure.record.obs_id, 

1055 str(list(inserted_or_updated.keys())), 

1056 ) 

1057 

1058 # Override default run if nothing specified explicitly. 

1059 if run is None: 

1060 instrument = exposure.files[0].instrument 

1061 this_run = instrument.makeDefaultRawIngestRunName() 

1062 else: 

1063 this_run = run 

1064 if this_run not in runs: 

1065 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

1066 runs.add(this_run) 

1067 try: 

1068 datasets_for_exposure = self.ingestExposureDatasets( 

1069 exposure, 

1070 run=this_run, 

1071 skip_existing_exposures=skip_existing_exposures, 

1072 track_file_attrs=track_file_attrs, 

1073 ) 

1074 except Exception as e: 

1075 self._on_ingest_failure(exposure, e) 

1076 n_ingests_failed += 1 

1077 self.log.warning("Failed to ingest the following for reason: %s", e) 

1078 for f in exposure.files: 

1079 self.log.warning("- %s", f.filename) 

1080 if self.config.failFast: 

1081 raise e 

1082 continue 

1083 else: 

1084 self._on_success(datasets_for_exposure) 

1085 for dataset in datasets_for_exposure: 

1086 refs.extend(dataset.refs) 

1087 

1088 # Success for this exposure. 

1089 n_exposures += 1 

1090 self.log.info( 

1091 "Exposure %s:%s ingested successfully", exposure.record.instrument, exposure.record.obs_id 

1092 ) 

1093 

1094 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

1095 

1096 @timeMethod 

1097 def run( 

1098 self, 

1099 files, 

1100 *, 

1101 pool: Optional[Pool] = None, 

1102 processes: int = 1, 

1103 run: Optional[str] = None, 

1104 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", 

1105 group_files: bool = True, 

1106 skip_existing_exposures: bool = False, 

1107 update_exposure_records: bool = False, 

1108 track_file_attrs: bool = True, 

1109 ): 

1110 """Ingest files into a Butler data repository. 

1111 

1112 This creates any new exposure or visit Dimension entries needed to 

1113 identify the ingested files, creates new Dataset entries in the 

1114 Registry and finally ingests the files themselves into the Datastore. 

1115 Any needed instrument, detector, and physical_filter Dimension entries 

1116 must exist in the Registry before `run` is called. 

1117 

1118 Parameters 

1119 ---------- 

1120 files : iterable `lsst.resources.ResourcePath`, `str` or path-like 

1121 Paths to the files to be ingested. Can refer to directories. 

1122 Will be made absolute if they are not already. 

1123 pool : `multiprocessing.Pool`, optional 

1124 If not `None`, a process pool with which to parallelize some 

1125 operations. 

1126 processes : `int`, optional 

1127 The number of processes to use. Ignored if ``pool`` is not `None`. 

1128 run : `str`, optional 

1129 Name of a RUN-type collection to write to, overriding 

1130 the default derived from the instrument name. 

1131 file_filter : `str` or `re.Pattern`, optional 

1132 Pattern to use to discover files to ingest within directories. 

1133 The default is to search for FITS files. The regex applies to 

1134 files within the directory. 

1135 group_files : `bool`, optional 

1136 Group files by directory if they have been discovered in 

1137 directories. Will not affect files explicitly provided. 

1138 skip_existing_exposures : `bool`, optional 

1139 If `True` (`False` is default), skip raws that have already been 

1140 ingested (i.e. raws for which we already have a dataset with the 

1141 same data ID in the target collection, even if from another file). 

1142 Note that this is much slower than just not passing 

1143 already-ingested files as inputs, because we still need to read and 

1144 process metadata to identify which exposures to search for. It 

1145 also will not work reliably if multiple processes are attempting to 

1146 ingest raws from the same exposure concurrently, in that different 

1147 processes may still attempt to ingest the same raw and conflict, 

1148 causing a failure that prevents other raws from the same exposure 

1149 from being ingested. 

1150 update_exposure_records : `bool`, optional 

1151 If `True` (`False` is default), update existing exposure records 

1152 that conflict with the new ones instead of rejecting them. THIS IS 

1153 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1154 KNOWN TO BE BAD. This should usually be combined with 

1155 ``skip_existing_exposures=True``. 

1156 track_file_attrs : `bool`, optional 

1157 Control whether file attributes such as the size or checksum should 

1158 be tracked by the datastore. Whether this parameter is honored 

1159 depends on the specific datastore implementation. 

1160 

1161 Returns 

1162 ------- 

1163 refs : `list` of `lsst.daf.butler.DatasetRef` 

1164 Dataset references for ingested raws. 

1165 

1166 Notes 

1167 ----- 

1168 This method inserts all datasets for an exposure within a transaction, 

1169 guaranteeing that partial exposures are never ingested. The exposure 

1170 dimension record is inserted with `Registry.syncDimensionData` first 

1171 (in its own transaction), which inserts only if a record with the same 

1172 primary key does not already exist. This allows different files within 

1173 the same exposure to be ingested in different runs. 
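
Examples
--------
An illustrative call ingesting every FITS file found under a directory
using four processes; the directory path and filter pattern are
assumptions for the sketch:

.. code-block:: python

    refs = task.run(
        ["/data/raws/2022-01-01/"],
        processes=4,
        file_filter=r"\.fits$",
    )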

1174 """ 

1175 

1176 refs = [] 

1177 bad_files = [] 

1178 n_exposures = 0 

1179 n_exposures_failed = 0 

1180 n_ingests_failed = 0 

1181 if group_files: 

1182 for group in ResourcePath.findFileResources(files, file_filter, group_files): 

1183 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles( 

1184 group, 

1185 pool=pool, 

1186 processes=processes, 

1187 run=run, 

1188 skip_existing_exposures=skip_existing_exposures, 

1189 update_exposure_records=update_exposure_records, 

1190 track_file_attrs=track_file_attrs, 

1191 ) 

1192 refs.extend(new_refs) 

1193 bad_files.extend(bad) 

1194 n_exposures += n_exp 

1195 n_exposures_failed += n_exp_fail 

1196 n_ingests_failed += n_ingest_fail 

1197 else: 

1198 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

1199 ResourcePath.findFileResources(files, file_filter, group_files), 

1200 pool=pool, 

1201 processes=processes, 

1202 run=run, 

1203 skip_existing_exposures=skip_existing_exposures, 

1204 update_exposure_records=update_exposure_records, 
track_file_attrs=track_file_attrs, 

1205 ) 

1206 

1207 had_failure = False 

1208 

1209 if bad_files: 

1210 had_failure = True 

1211 self.log.warning("Could not extract observation metadata from the following:") 

1212 for f in bad_files: 

1213 self.log.warning("- %s", f) 

1214 

1215 self.log.info( 

1216 "Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1217 " registration and %d failure%s from file ingest.", 

1218 *_log_msg_counter(n_exposures), 

1219 *_log_msg_counter(n_exposures_failed), 

1220 *_log_msg_counter(n_ingests_failed), 

1221 ) 

1222 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1223 had_failure = True 

1224 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1225 

1226 if had_failure: 

1227 raise RuntimeError("Some failures encountered during ingestion") 

1228 

1229 return refs