Coverage for python/lsst/obs/base/ingest.py: 15%

320 statements  

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import json 

26import re 

27from dataclasses import dataclass, InitVar 

28from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union 

29from collections import defaultdict 

30from multiprocessing import Pool 

31 

32from astro_metadata_translator import ObservationInfo, merge_headers, MetadataTranslator 

33from astro_metadata_translator.indexing import process_sidecar_data, process_index_data 

34from lsst.afw.fits import readMetadata 

35from lsst.daf.butler import ( 

36 Butler, 

37 ButlerURI, 

38 CollectionType, 

39 DataCoordinate, 

40 DatasetIdGenEnum, 

41 DatasetRef, 

42 DatasetType, 

43 DimensionRecord, 

44 DimensionUniverse, 

45 FileDataset, 

46 Formatter, 

47 Progress, 

48) 

49from lsst.pex.config import Config, ChoiceField, Field 

50from lsst.pipe.base import Task 

51from lsst.utils.timer import timeMethod 

52 

53from ._instrument import Instrument, makeExposureRecordFromObsInfo 

54from ._fitsRawFormatterBase import FitsRawFormatterBase 

55 

56 

57def _do_nothing(*args, **kwargs) -> None: 

58 """Do nothing. 

59 

60 This is a function that accepts anything and does nothing. 

61 For use as a default in callback arguments. 

62 """ 

63 pass 

64 

65 

66def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]: 

67 """Count the iterable and return the count and plural modifier. 

68 

69 Parameters 

70 ---------- 

71 noun : Iterable or `int` 

72 Thing to count. If given an integer it is assumed to be the count 

73 to use to calculate modifier. 

74 

75 Returns 

76 ------- 

77 num : `int` 

78 Number of items found in ``noun``. 

79 modifier : `str` 

80 Character to add to the end of a string referring to these items 

81 to indicate whether it was a single item or not. Returns empty 

82 string if there is one item or "s" otherwise. 

83 

84 Examples 

85 -------- 

86 

87 .. code-block:: python 

88 

89 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

90 """ 

91 if isinstance(noun, int): 

92 num = noun 

93 else: 

94 num = len(noun) 

95 return num, "" if num == 1 else "s" 

96 

97 

98@dataclass 

99class RawFileDatasetInfo: 

100 """Information about a single dataset within a raw file.""" 

101 

102 dataId: DataCoordinate 

103 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

104 

105 obsInfo: ObservationInfo 

106 """Standardized observation metadata extracted directly from the file 

107 headers (`astro_metadata_translator.ObservationInfo`). 

108 """ 

109 

110 

111@dataclass 

112class RawFileData: 

113 """Information about a single raw file, used during ingest.""" 

114 

115 datasets: List[RawFileDatasetInfo] 

116 """The information describing each dataset within this raw file. 

117 (`list` of `RawFileDatasetInfo`) 

118 """ 

119 

120 filename: ButlerURI 

121 """URI of the file this information was extracted from (`ButlerURI`).

122 

123 This is the path prior to ingest, not the path after ingest. 

124 """ 

125 

126 FormatterClass: Type[FitsRawFormatterBase] 

127 """Formatter class that should be used to ingest this file (`type`; as 

128 subclass of `FitsRawFormatterBase`). 

129 """ 

130 

131 instrument: Optional[Instrument] 

132 """The `Instrument` instance associated with this file. Can be `None` 

133 if ``datasets`` is an empty list.""" 

134 

135 

136@dataclass 

137class RawExposureData: 

138 """Information about a complete raw exposure, used during ingest.""" 

139 

140 dataId: DataCoordinate 

141 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

142 """ 

143 

144 files: List[RawFileData] 

145 """List of structures containing file-level information. 

146 """ 

147 

148 universe: InitVar[DimensionUniverse] 

149 """Set of all known dimensions. 

150 """ 

151 

152 record: Optional[DimensionRecord] = None 

153 """The exposure `DimensionRecord` that must be inserted into the 

154 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`). 

155 """ 

156 

157 def __post_init__(self, universe: DimensionUniverse): 

158 # We don't care which file or dataset we read metadata from, because 

159 # we're assuming they'll all be the same; just use the first ones. 

160 self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe) 

161 

162 

163def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"): 

164 """Create a Config field with options for transferring data between repos. 

165 

166 The allowed options for the field are exactly those supported by 

167 `lsst.daf.butler.Datastore.ingest`. 

168 

169 Parameters 

170 ---------- 

171 doc : `str` 

172 Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field ("auto" if not given).

173 

174 Returns 

175 ------- 

176 field : `lsst.pex.config.ChoiceField` 

177 Configuration field. 
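    Examples
    --------
    A minimal sketch of using this factory in a config class; the class
    name here is purely illustrative:

    .. code-block:: python

        from lsst.pex.config import Config

        class MyIngestLikeConfig(Config):
            # Same choices as RawIngestConfig.transfer, but defaulting
            # to "copy" instead of "auto".
            transfer = makeTransferChoiceField(default="copy")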

178 """ 

179 return ChoiceField( 

180 doc=doc, 

181 dtype=str, 

182 allowed={"move": "move", 

183 "copy": "copy", 

184 "auto": "choice will depend on datastore", 

185 "direct": "use URI to ingested file directly in datastore", 

186 "link": "hard link falling back to symbolic link", 

187 "hardlink": "hard link", 

188 "symlink": "symbolic (soft) link", 

189 "relsymlink": "relative symbolic link", 

190 }, 

191 optional=True, 

192 default=default 

193 ) 

194 

195 

196class RawIngestConfig(Config): 

197 """Configuration class for RawIngestTask.""" 

198 

199 transfer = makeTransferChoiceField() 

200 failFast = Field( 

201 dtype=bool, 

202 default=False, 

203 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

204 "Otherwise problem files will be skipped and logged and a report issued at completion.",

205 ) 

206 

207 

208class RawIngestTask(Task): 

209 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

210 

211 Parameters 

212 ---------- 

213 config : `RawIngestConfig` 

214 Configuration for the task. 

215 butler : `~lsst.daf.butler.Butler` 

216 Writeable butler instance, with ``butler.run`` set to the appropriate 

217 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

218 datasets. 

219 on_success : `Callable`, optional 

220 A callback invoked when all of the raws associated with an exposure 

221 are ingested. Will be passed a list of `FileDataset` objects, each 

222 containing one or more resolved `DatasetRef` objects. If this callback 

223 raises it will interrupt the entire ingest process, even if 

224 `RawIngestConfig.failFast` is `False`. 

225 on_metadata_failure : `Callable`, optional 

226 A callback invoked when a failure occurs trying to translate the 

227 metadata for a file. Will be passed the URI and the exception, in 

228 that order, as positional arguments. Guaranteed to be called in an 

229 ``except`` block, allowing the callback to re-raise or replace (with 

230 ``raise ... from``) to override the task's usual error handling (before 

231 `RawIngestConfig.failFast` logic occurs). 

232 on_ingest_failure : `Callable`, optional 

233 A callback invoked when dimension record or dataset insertion into the 

234 database fails for an exposure. Will be passed a `RawExposureData` 

235 instance and the exception, in that order, as positional arguments. 

236 Guaranteed to be called in an ``except`` block, allowing the callback 

237 to re-raise or replace (with ``raise ... from``) to override the task's 

238 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

239 **kwargs 

240 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

241 constructor. 

242 

243 Notes 

244 ----- 

245 Each instance of `RawIngestTask` writes to the same Butler. Each 

246 invocation of `RawIngestTask.run` ingests a list of files. 
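    Examples
    --------
    A minimal construction sketch; the repository path is illustrative and
    the repository is assumed to already contain the relevant instrument
    registration:

    .. code-block:: python

        from lsst.daf.butler import Butler

        butler = Butler("/path/to/repo", writeable=True)
        task = RawIngestTask(config=RawIngestConfig(), butler=butler)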

247 """ 

248 

249 ConfigClass = RawIngestConfig 

250 

251 _DefaultName = "ingest" 

252 

253 def getDatasetType(self): 

254 """Return the DatasetType of the datasets ingested by this Task.""" 

255 return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure", 

256 universe=self.butler.registry.dimensions) 

257 

258 def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, 

259 on_success: Callable[[List[FileDataset]], Any] = _do_nothing, 

260 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing, 

261 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

262 **kwargs: Any): 

263 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

264 super().__init__(config, **kwargs) 

265 self.butler = butler 

266 self.universe = self.butler.registry.dimensions 

267 self.datasetType = self.getDatasetType() 

268 self._on_success = on_success 

269 self._on_metadata_failure = on_metadata_failure 

270 self._on_ingest_failure = on_ingest_failure 

271 self.progress = Progress("obs.base.RawIngestTask") 

272 

273 # Import all the instrument classes so that we ensure that we 

274 # have all the relevant metadata translators loaded. 

275 Instrument.importAll(self.butler.registry) 

276 

277 def _reduce_kwargs(self): 

278 # Add extra parameters to pickle. 

279 return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success, 

280 on_metadata_failure=self._on_metadata_failure, on_ingest_failure=self._on_ingest_failure) 

281 

282 def _determine_instrument_formatter(self, dataId, filename): 

283 """Determine the instrument and formatter class. 

284 

285 Parameters 

286 ---------- 

287 dataId : `lsst.daf.butler.DataCoordinate` 

288 The dataId associated with this dataset. 

289 filename : `ButlerURI` 

290 URI of file used for error reporting. 

291 

292 Returns 

293 ------- 

294 instrument : `Instrument` or `None` 

295 Instance of the `Instrument` associated with this dataset. `None` 

296 indicates that the instrument could not be determined. 

297 formatterClass : `type` 

298 Class to be used as the formatter for this dataset. 

299 """ 

300 # The data model currently assumes that whilst multiple datasets 

301 # can be associated with a single file, they must all share the 

302 # same formatter. 

303 try: 

304 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) 

305 except LookupError as e: 

306 self._on_metadata_failure(filename, e) 

307 self.log.warning("Instrument %s for file %s not known to registry", 

308 dataId["instrument"], filename) 

309 if self.config.failFast: 

310 raise RuntimeError(f"Instrument {dataId['instrument']} for" 

311 f" file {filename} not known to registry") from e 

312 FormatterClass = Formatter 

313 # Indicate that we could not work out the instrument. 

314 instrument = None 

315 else: 

316 FormatterClass = instrument.getRawFormatter(dataId) 

317 return instrument, FormatterClass 

318 

319 def extractMetadata(self, filename: ButlerURI) -> RawFileData: 

320 """Extract and process metadata from a single raw file. 

321 

322 Parameters 

323 ---------- 

324 filename : `ButlerURI` 

325 URI to the file. 

326 

327 Returns 

328 ------- 

329 data : `RawFileData` 

330 A structure containing the metadata extracted from the file, 

331 as well as the original filename. All fields will be populated, 

332 but the data IDs in `RawFileData.datasets` will be minimal
333 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. The

334 ``instrument`` field will be `None` if there is a problem 

335 with metadata extraction. 

336 

337 Notes 

338 ----- 

339 Assumes that there is a single dataset associated with the given 

340 file. Instruments using a single file to store multiple datasets 

341 must implement their own version of this method. 

342 

343 By default the method will catch all exceptions unless the ``failFast``
344 configuration item is `True`. If an error is encountered the
345 `_on_metadata_failure()` callback will be called. If no exception
346 is raised but an error was encountered, the returned object will have
347 no datasets and a `None` instrument.

348 

349 This method supports sidecar JSON files which can be used to 

350 extract metadata without having to read the data file itself. 

351 The sidecar file is always used if found. 
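        Examples
        --------
        A minimal sketch; the path is illustrative and ``task`` is assumed
        to be a constructed `RawIngestTask`:

        .. code-block:: python

            file_data = task.extractMetadata(ButlerURI("/data/raw/exposure.fits"))
            if not file_data.datasets:
                # Metadata extraction failed (and failFast is disabled).
                print("Could not read", file_data.filename)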

352 """ 

353 sidecar_fail_msg = "" # Requires prepended space when set. 

354 try: 

355 sidecar_file = filename.updatedExtension(".json") 

356 if sidecar_file.exists(): 

357 content = json.loads(sidecar_file.read()) 

358 headers = [process_sidecar_data(content)] 

359 sidecar_fail_msg = " (via sidecar)" 

360 else: 

361 # Read the metadata from the data file itself. 

362 

363 # For remote files download the entire file to get the 

364 # header. This is very inefficient and it would be better 

365 # to have some way of knowing where in the file the headers 

366 # are and to only download those parts of the file. 

367 with filename.as_local() as local_file: 

368 # Read the primary. This might be sufficient. 

369 header = readMetadata(local_file.ospath, 0) 

370 

371 try: 

372 # Try to work out a translator class early. 

373 translator_class = MetadataTranslator.determine_translator(header, filename=filename) 

374 except ValueError: 

375 # Primary header was not sufficient (maybe this file 

376 # has been compressed or is a MEF with minimal 

377 # primary). Read second header and merge with primary. 

378 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite") 

379 

380 # Try again to work out a translator class, letting this 

381 # fail. 

382 translator_class = MetadataTranslator.determine_translator(header, filename=filename) 

383 

384 # Request the headers to use for ingest 

385 headers = translator_class.determine_translatable_headers(filename.ospath, header) 

386 

387 # Add each header to the dataset list 

388 datasets = [self._calculate_dataset_info(h, filename) for h in headers] 

389 

390 except Exception as e: 

391 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

392 # Indicate to the caller that we failed to read. 

393 datasets = [] 

394 formatterClass = Formatter 

395 instrument = None 

396 self._on_metadata_failure(filename, e) 

397 if self.config.failFast: 

398 raise RuntimeError("Problem extracting metadata for file " 

399 f"{filename}{sidecar_fail_msg}") from e 

400 else: 

401 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

402 # The data model currently assumes that whilst multiple datasets 

403 # can be associated with a single file, they must all share the 

404 # same formatter. 

405 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

406 if instrument is None: 

407 datasets = [] 

408 

409 return RawFileData(datasets=datasets, filename=filename, 

410 FormatterClass=formatterClass, 

411 instrument=instrument) 

412 

413 def _calculate_dataset_info(self, header, filename): 

414 """Calculate a RawFileDatasetInfo from the supplied information. 

415 

416 Parameters 

417 ---------- 

418 header : Mapping or `astro_metadata_translator.ObservationInfo` 

419 Header from the dataset or previously-translated content. 

420 filename : `ButlerURI` 

421 Filename to use for error messages. 

422 

423 Returns 

424 ------- 

425 dataset : `RawFileDatasetInfo` 

426 The dataId, and observation information associated with this 

427 dataset. 

428 """ 

429 # To ensure we aren't slowed down for no reason, explicitly 

430 # list here the properties we need for the schema. 

431 # Use a dict with values a boolean where True indicates 

432 # that it is required that we calculate this property. 

433 ingest_subset = { 

434 "altaz_begin": False, 

435 "boresight_rotation_coord": False, 

436 "boresight_rotation_angle": False, 

437 "dark_time": False, 

438 "datetime_begin": True, 

439 "datetime_end": True, 

440 "detector_num": True, 

441 "exposure_group": False, 

442 "exposure_id": True, 

443 "exposure_time": True, 

444 "instrument": True, 

445 "tracking_radec": False, 

446 "object": False, 

447 "observation_counter": False, 

448 "observation_id": True, 

449 "observation_reason": False, 

450 "observation_type": True, 

451 "observing_day": False, 

452 "physical_filter": True, 

453 "science_program": False, 

454 "visit_id": False, 

455 } 

456 

457 if isinstance(header, ObservationInfo): 

458 obsInfo = header 

459 missing = [] 

460 # Need to check the required properties are present. 

461 for property, required in ingest_subset.items(): 

462 if not required: 

463 continue 

464 # getattr does not need to be protected because it is using 

465 # the defined list above containing properties that must exist. 

466 value = getattr(obsInfo, property) 

467 if value is None: 

468 missing.append(property) 

469 if missing: 

470 raise ValueError(f"Requested required properties are missing from file {filename}:" 

471 f" {missing} (via JSON)") 

472 

473 else: 

474 obsInfo = ObservationInfo(header, pedantic=False, filename=str(filename), 

475 required={k for k in ingest_subset if ingest_subset[k]}, 

476 subset=set(ingest_subset)) 

477 

478 dataId = DataCoordinate.standardize(instrument=obsInfo.instrument, 

479 exposure=obsInfo.exposure_id, 

480 detector=obsInfo.detector_num, 

481 universe=self.universe) 

482 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

483 

484 def locateAndReadIndexFiles(self, files): 

485 """Given a list of files, look for index files and read them. 

486 

487 Index files can either be explicitly in the list of files to 

488 ingest, or else located in the same directory as a file to ingest. 

489 Index entries are always used if present. 

490 

491 Parameters 

492 ---------- 

493 files : iterable over `ButlerURI` 

494 URIs to the files to be ingested. 

495 

496 Returns 

497 ------- 

498 index : `dict` [`str`, Any] 

499 Merged contents of all relevant index files found. These can 

500 be explicitly specified index files or ones found in the 

501 directory alongside a data file to be ingested. 

502 updated_files : iterable of `str` 

503 Updated list of the input files with entries removed that were 

504 found listed in an index file. Order is not guaranteed to 

505 match the order of the files given to this routine. 

good_index_files : `set` [`str`]
    Index files that were successfully read.
506 bad_index_files : `set` [`str`]

507 Files that looked like index files but failed to read properly. 
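        Examples
        --------
        A minimal sketch; ``task`` is assumed to be a constructed
        `RawIngestTask` and ``uris`` an iterable of `ButlerURI` objects:

        .. code-block:: python

            index_entries, remaining, good, bad = task.locateAndReadIndexFiles(uris)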

508 """ 

509 # Convert the paths to absolute for easy comparison with index content. 

510 # Do not convert to real paths since we have to assume that index 

511 # files are in this location and not the location which it links to. 

512 files = tuple(f.abspath() for f in files) 

513 

514 # Index files must be named this. 

515 index_root_file = "_index.json" 

516 

517 # Group the files by directory. 

518 files_by_directory = defaultdict(set) 

519 

520 for path in files: 

521 directory, file_in_dir = path.split() 

522 files_by_directory[directory].add(file_in_dir) 

523 

524 # All the metadata read from index files with keys of full path. 

525 index_entries = {} 

526 

527 # Index files we failed to read. 

528 bad_index_files = set() 

529 

530 # Any good index files that were found and used. 

531 good_index_files = set() 

532 

533 # Look for index files in those directories. 

534 for directory, files_in_directory in files_by_directory.items(): 

535 possible_index_file = directory.join(index_root_file) 

536 if possible_index_file.exists(): 

537 # If we are explicitly requesting an index file the 

538 # messages should be different. 

539 index_msg = "inferred" 

540 is_implied = True 

541 if index_root_file in files_in_directory: 

542 index_msg = "explicit" 

543 is_implied = False 

544 

545 # Try to read the index file and catch and report any 

546 # problems. 

547 try: 

548 content = json.loads(possible_index_file.read()) 

549 index = process_index_data(content, force_dict=True) 

550 except Exception as e: 

551 # Only trigger the callback if the index file 

552 # was asked for explicitly. Triggering on implied file 

553 # might be surprising. 

554 if not is_implied: 

555 self._on_metadata_failure(possible_index_file, e) 

556 if self.config.failFast: 

557 raise RuntimeError(f"Problem reading index file from {index_msg} " 

558 f"location {possible_index_file}") from e 

559 bad_index_files.add(possible_index_file) 

560 continue 

561 

562 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

563 good_index_files.add(possible_index_file) 

564 

565 # Go through the index adding entries for files. 

566 # If we have non-index files in this directory marked for 

567 # ingest we should only get index information for those. 

568 # If the index file was explicit we use all entries. 

569 if is_implied: 

570 files_to_ingest = files_in_directory 

571 else: 

572 files_to_ingest = set(index) 

573 

574 # Copy relevant metadata into a single dict for all index 

575 # entries. 

576 for file_in_dir in files_to_ingest: 

577 # Skip an explicitly specified index file. 

578 # This should never happen because an explicit index 

579 # file will force ingest of all files in the index 

580 # and not use the explicit file list. If somehow 

581 # this is not true we continue. Raising an exception 

582 # seems like the wrong thing to do since this is harmless. 

583 if file_in_dir == index_root_file: 

584 self.log.info("Logic error found scanning directory %s. Please file ticket.", 

585 directory) 

586 continue 

587 if file_in_dir in index: 

588 file = directory.join(file_in_dir) 

589 if file in index_entries: 

590 # ObservationInfo overrides raw metadata 

591 if isinstance(index[file_in_dir], ObservationInfo) \ 

592 and not isinstance(index_entries[file], ObservationInfo): 

593 self.log.warning("File %s already specified in an index file but overriding" 

594 " with ObservationInfo content from %s", 

595 file, possible_index_file) 

596 else: 

597 self.log.warning("File %s already specified in an index file, " 

598 "ignoring content from %s", file, possible_index_file) 

599 # Do nothing in this case 

600 continue 

601 

602 index_entries[file] = index[file_in_dir] 

603 

604 # Remove files from list that have index entries and also 

605 # any files that we determined to be explicit index files 

606 # or any index files that we failed to read. 

607 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

608 

609 # The filtered list loses the initial order. Retaining the order 

610 # is good for testing but does have a cost if there are many 

611 # files when copying the good values out. A dict would have faster 

612 # lookups (using the files as keys) but use more memory. 

613 ordered = [f for f in files if f in filtered]

614 

615 return index_entries, ordered, good_index_files, bad_index_files 

616 

617 def processIndexEntries(self, index_entries): 

618 """Convert index entries to RawFileData. 

619 

620 Parameters 

621 ---------- 

622 index_entries : `dict` [`str`, Any] 

623 Dict indexed by name of file to ingest and with keys either 

624 raw metadata or translated 

625 `~astro_metadata_translator.ObservationInfo`. 

626 

627 Returns 

628 ------- 

629 data : `list` of `RawFileData`
630     A list of structures containing the metadata extracted from each
631     file, as well as the original filenames. All fields will be populated,
632     but the data IDs in `RawFileData.datasets` will be minimal
633     (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.

634 """ 

635 fileData = [] 

636 for filename, metadata in index_entries.items(): 

637 try: 

638 datasets = [self._calculate_dataset_info(metadata, filename)] 

639 except Exception as e: 

640 self.log.debug("Problem extracting metadata for file %s found in index file: %s", 

641 filename, e) 

642 datasets = [] 

643 formatterClass = Formatter 

644 instrument = None 

645 self._on_metadata_failure(filename, e) 

646 if self.config.failFast: 

647 raise RuntimeError(f"Problem extracting metadata for file {filename} " 

648 "found in index file") from e 

649 else: 

650 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, 

651 filename) 

652 if instrument is None: 

653 datasets = [] 

654 fileData.append(RawFileData(datasets=datasets, filename=filename, 

655 FormatterClass=formatterClass, instrument=instrument)) 

656 return fileData 

657 

658 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]: 

659 """Group an iterable of `RawFileData` by exposure. 

660 

661 Parameters 

662 ---------- 

663 files : iterable of `RawFileData` 

664 File-level information to group. 

665 

666 Returns 

667 ------- 

668 exposures : `list` of `RawExposureData` 

669 A list of structures that group the file-level information by 

670 exposure. All fields will be populated. The 

671 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

672 `~lsst.daf.butler.DataCoordinate` instances. 

673 """ 

674 exposureDimensions = self.universe["exposure"].graph 

675 byExposure = defaultdict(list) 

676 for f in files: 

677 # Assume that the first dataset is representative for the file. 

678 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

679 

680 return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe) 

681 for dataId, exposureFiles in byExposure.items()] 

682 

683 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

684 """Expand the data IDs associated with a raw exposure. 

685 

686 This adds the metadata records. 

687 

688 Parameters 

689 ---------- 

690 data : `RawExposureData`
691     A structure containing information about the exposure to be
692     ingested. Must have `RawExposureData.record` populated. Should

693 be considered consumed upon return. 

694 

695 Returns 

696 ------- 

697 exposure : `RawExposureData` 

698 An updated version of the input structure, with 

699 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

700 updated to data IDs for which 

701 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

702 """ 

703 # We start by expanding the exposure-level data ID; we won't use that

704 # directly in file ingest, but this lets us do some database lookups 

705 # once per exposure instead of once per file later. 

706 data.dataId = self.butler.registry.expandDataId( 

707 data.dataId, 

708 # We pass in the records we'll be inserting shortly so they aren't 

709 # looked up from the database. We do expect instrument and filter 

710 # records to be retrieved from the database here (though the 

711 # Registry may cache them so there isn't a lookup every time). 

712 records={ 

713 self.butler.registry.dimensions["exposure"]: data.record, 

714 } 

715 ) 

716 # Now we expand the per-file (exposure+detector) data IDs. This time 

717 # we pass in the records we just retrieved from the exposure data ID 

718 # expansion. 

719 for file in data.files: 

720 for dataset in file.datasets: 

721 dataset.dataId = self.butler.registry.expandDataId( 

722 dataset.dataId, 

723 records=dict(data.dataId.records) 

724 ) 

725 return data 

726 

727 def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1 

728 ) -> Tuple[Iterator[RawExposureData], List[str]]: 

729 """Perform all non-database-updating ingest preprocessing steps. 

730 

731 Parameters 

732 ---------- 

733 files : iterable over `str` or path-like objects 

734 Paths to the files to be ingested. Will be made absolute 

735 if they are not already. 

736 pool : `multiprocessing.Pool`, optional 

737 If not `None`, a process pool with which to parallelize some 

738 operations. 

739 processes : `int`, optional 

740 The number of processes to use. Ignored if ``pool`` is not `None`. 

741 

742 Returns 

743 ------- 

744 exposures : `Iterator` [ `RawExposureData` ] 

745 Data structures containing dimension records, filenames, and data 

746 IDs to be ingested (one structure for each exposure). 

747 bad_files : `list` of `str` 

748 List of all the files that could not have metadata extracted. 
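        Examples
        --------
        A minimal sketch; ``task`` is assumed to be a constructed
        `RawIngestTask` and ``uris`` an iterable of `ButlerURI` objects:

        .. code-block:: python

            exposures, bad_files = task.prep(uris, processes=4)
            for exposure in exposures:
                # Each item is a RawExposureData with expanded data IDs.
                ...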

749 """ 

750 if pool is None and processes > 1: 

751 pool = Pool(processes) 

752 mapFunc = map if pool is None else pool.imap_unordered 

753 

754 def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]: 

755 """Filter out bad files and return good with list of bad.""" 

756 good_files = [] 

757 bad_files = [] 

758 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata", total=len(files)): 

759 if not fileDatum.datasets: 

760 bad_files.append(fileDatum.filename) 

761 else: 

762 good_files.append(fileDatum) 

763 return good_files, bad_files 

764 

765 # Look for index files and read them. 

766 # There should be far fewer index files than data files. 

767 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

768 if bad_index_files: 

769 self.log.info("Failed to read the following explicitly requested index files:")

770 for bad in sorted(bad_index_files): 

771 self.log.info("- %s", bad) 

772 

773 # Now convert all the index file entries to standard form for ingest. 

774 bad_index_file_data = [] 

775 indexFileData = self.processIndexEntries(index_entries) 

776 if indexFileData: 

777 indexFileData, bad_index_file_data = _partition_good_bad(indexFileData) 

778 self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s" 

779 " with %d failure%s", 

780 *_log_msg_counter(indexFileData), 

781 *_log_msg_counter(good_index_files), 

782 *_log_msg_counter(bad_index_file_data)) 

783 

784 # Extract metadata and build per-detector regions. 

785 # This could run in a subprocess so collect all output 

786 # before looking at failures. 

787 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

788 

789 # Filter out all the failed reads and store them for later 

790 # reporting. 

791 fileData, bad_files = _partition_good_bad(fileData) 

792 self.log.info("Successfully extracted metadata from %d file%s with %d failure%s", 

793 *_log_msg_counter(fileData), 

794 *_log_msg_counter(bad_files)) 

795 

796 # Combine with data from index files. 

797 fileData.extend(indexFileData) 

798 bad_files.extend(bad_index_file_data) 

799 bad_files.extend(bad_index_files) 

800 

801 # Use that metadata to group files (and extracted metadata) by 

802 # exposure. Never parallelized because it's intrinsically a gather 

803 # step. 

804 exposureData: List[RawExposureData] = self.groupByExposure(fileData) 

805 

806 # The next operation operates on RawExposureData instances (one at 

807 # a time) in-place and then returns the modified instance. We call it 

808 # as a pass-through instead of relying on the arguments we pass in to 

809 # have been modified because in the parallel case those arguments are 

810 # going to be pickled and unpickled, and I'm not certain 

811 # multiprocessing is careful enough with that for output arguments to 

812 # work. 

813 

814 # Expand the data IDs to include all dimension metadata; we need this 

815 # because we may need to generate path templates that rely on that 

816 # metadata. 

817 # This is the first step that involves actual database calls (but just 

818 # SELECTs), so if there's going to be a problem with connections vs. 

819 # multiple processes, or lock contention (in SQLite) slowing things 

820 # down, it'll happen here. 

821 return mapFunc(self.expandDataIds, exposureData), bad_files 

822 

823 def ingestExposureDatasets( 

824 self, 

825 exposure: RawExposureData, 

826 *, 

827 run: Optional[str] = None, 

828 skip_existing_exposures: bool = False, 

829 ) -> List[FileDataset]: 

830 """Ingest all raw files in one exposure. 

831 

832 Parameters 

833 ---------- 

834 exposure : `RawExposureData` 

835 A structure containing information about the exposure to be 

836 ingested. Must have `RawExposureData.records` populated and all 

837 data ID attributes expanded. 

838 run : `str`, optional 

839 Name of a RUN-type collection to write to, overriding 

840 ``self.butler.run``. 

841 skip_existing_exposures : `bool`, optional 

842 If `True` (`False` is default), skip raws that have already been 

843 ingested (i.e. raws for which we already have a dataset with the 

844 same data ID in the target collection, even if from another file). 

845 Note that this is much slower than just not passing 

846 already-ingested files as inputs, because we still need to read and 

847 process metadata to identify which exposures to search for. It 

848 also will not work reliably if multiple processes are attempting to 

849 ingest raws from the same exposure concurrently, in that different 

850 processes may still attempt to ingest the same raw and conflict, 

851 causing a failure that prevents other raws from the same exposure 

852 from being ingested. 

853 

854 Returns 

855 ------- 

856 datasets : `list` of `lsst.daf.butler.FileDataset` 

857 Per-file structures identifying the files ingested and their 

858 dataset representation in the data repository. 
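        Examples
        --------
        A minimal sketch; ``exposure_data`` is assumed to be a
        `RawExposureData` with expanded data IDs (e.g. from `prep`) and the
        run collection name is illustrative:

        .. code-block:: python

            datasets = task.ingestExposureDatasets(exposure_data, run="raw/all")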

859 """ 

860 if skip_existing_exposures: 

861 existing = { 

862 ref.dataId for ref in self.butler.registry.queryDatasets( 

863 self.datasetType, 

864 collections=[run], 

865 dataId=exposure.dataId, 

866 ) 

867 } 

868 else: 

869 existing = set() 

870 datasets = [] 

871 for file in exposure.files: 

872 refs = [ 

873 DatasetRef(self.datasetType, d.dataId) 

874 for d in file.datasets 

875 if d.dataId not in existing 

876 ] 

877 if refs: 

878 datasets.append( 

879 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass) 

880 ) 

881 

882 # Raw files are preferentially ingested using a UUID derived from 

883 # the collection name and dataId. 

884 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN): 

885 mode = DatasetIdGenEnum.DATAID_TYPE_RUN 

886 else: 

887 mode = DatasetIdGenEnum.UNIQUE 

888 self.butler.ingest(*datasets, transfer=self.config.transfer, run=run, idGenerationMode=mode) 

889 return datasets 

890 

891 def ingestFiles(self, files, *, pool: Optional[Pool] = None, processes: int = 1, 

892 run: Optional[str] = None, 

893 skip_existing_exposures: bool = False, 

894 update_exposure_records: bool = False): 

895 """Ingest files into a Butler data repository. 

896 

897 This creates any new exposure or visit Dimension entries needed to 

898 identify the ingested files, creates new Dataset entries in the 

899 Registry and finally ingests the files themselves into the Datastore. 

900 Any needed instrument, detector, and physical_filter Dimension entries 

901 must exist in the Registry before `run` is called. 

902 

903 Parameters 

904 ---------- 

905 files : iterable over `ButlerURI` 

906 URIs to the files to be ingested. 

907 pool : `multiprocessing.Pool`, optional 

908 If not `None`, a process pool with which to parallelize some 

909 operations. 

910 processes : `int`, optional 

911 The number of processes to use. Ignored if ``pool`` is not `None`. 

912 run : `str`, optional 

913 Name of a RUN-type collection to write to, overriding 

914 the default derived from the instrument name. 

915 skip_existing_exposures : `bool`, optional 

916 If `True` (`False` is default), skip raws that have already been 

917 ingested (i.e. raws for which we already have a dataset with the 

918 same data ID in the target collection, even if from another file). 

919 Note that this is much slower than just not passing 

920 already-ingested files as inputs, because we still need to read and 

921 process metadata to identify which exposures to search for. It 

922 also will not work reliably if multiple processes are attempting to 

923 ingest raws from the same exposure concurrently, in that different 

924 processes may still attempt to ingest the same raw and conflict, 

925 causing a failure that prevents other raws from the same exposure 

926 from being ingested. 

927 update_exposure_records : `bool`, optional 

928 If `True` (`False` is default), update existing exposure records 

929 that conflict with the new ones instead of rejecting them. THIS IS 

930 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

931 KNOWN TO BE BAD. This should usually be combined with 

932 ``skip_existing_exposures=True``. 

933 

934 Returns 

935 ------- 

936 refs : `list` of `lsst.daf.butler.DatasetRef`
937     Dataset references for ingested raws.
        bad_files : `list` of `ButlerURI`
            Files for which metadata extraction failed and which were
            therefore not ingested.
        n_exposures : `int`
            Number of exposures successfully ingested.
        n_exposures_failed : `int`
            Number of exposures that could not be registered.
        n_ingests_failed : `int`
            Number of exposures whose files could not be ingested.
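        Examples
        --------
        A minimal sketch; ``uris`` and the run collection name are
        illustrative:

        .. code-block:: python

            refs, bad, n_ok, n_reg_fail, n_ingest_fail = task.ingestFiles(
                uris, run="MyCam/raw/all", processes=4
            )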

938 """ 

939 

940 exposureData, bad_files = self.prep(files, pool=pool, processes=processes) 

941 

942 # Up to this point, we haven't modified the data repository at all. 

943 # Now we finally do that, with one transaction per exposure. This is 

944 # not parallelized at present because the performance of this step is 

945 # limited by the database server. That may or may not change in the 

946 # future once we increase our usage of bulk inserts and reduce our 

947 # usage of savepoints; we've tried to get everything but the database 

948 # operations done in advance to reduce the time spent inside 

949 # transactions. 

950 self.butler.registry.registerDatasetType(self.datasetType) 

951 

952 refs = [] 

953 runs = set() 

954 n_exposures = 0 

955 n_exposures_failed = 0 

956 n_ingests_failed = 0 

957 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"): 

958 

959 self.log.debug("Attempting to ingest %d file%s from exposure %s:%s", 

960 *_log_msg_counter(exposure.files), 

961 exposure.record.instrument, exposure.record.obs_id) 

962 

963 try: 

964 inserted_or_updated = self.butler.registry.syncDimensionData( 

965 "exposure", 

966 exposure.record, 

967 update=update_exposure_records, 

968 ) 

969 except Exception as e: 

970 self._on_ingest_failure(exposure, e) 

971 n_exposures_failed += 1 

972 self.log.warning("Exposure %s:%s could not be registered: %s", 

973 exposure.record.instrument, exposure.record.obs_id, e) 

974 if self.config.failFast: 

975 raise e 

976 continue 

977 

978 if isinstance(inserted_or_updated, dict): 

979 # Exposure is in the registry and we updated it, so 

980 # syncDimensionData returned a dict. 

981 self.log.info( 

982 "Exposure %s:%s was already present, but columns %s were updated.", 

983 exposure.record.instrument, 

984 exposure.record.obs_id, 

985 str(list(inserted_or_updated.keys())) 

986 ) 

987 

988 # Override default run if nothing specified explicitly. 

989 if run is None: 

990 instrument = exposure.files[0].instrument 

991 this_run = instrument.makeDefaultRawIngestRunName() 

992 else: 

993 this_run = run 

994 if this_run not in runs: 

995 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

996 runs.add(this_run) 

997 try: 

998 datasets_for_exposure = self.ingestExposureDatasets( 

999 exposure, 

1000 run=this_run, 

1001 skip_existing_exposures=skip_existing_exposures, 

1002 ) 

1003 except Exception as e: 

1004 self._on_ingest_failure(exposure, e) 

1005 n_ingests_failed += 1 

1006 self.log.warning("Failed to ingest the following for reason: %s", e) 

1007 for f in exposure.files: 

1008 self.log.warning("- %s", f.filename) 

1009 if self.config.failFast: 

1010 raise e 

1011 continue 

1012 else: 

1013 self._on_success(datasets_for_exposure) 

1014 for dataset in datasets_for_exposure: 

1015 refs.extend(dataset.refs) 

1016 

1017 # Success for this exposure. 

1018 n_exposures += 1 

1019 self.log.info("Exposure %s:%s ingested successfully", 

1020 exposure.record.instrument, exposure.record.obs_id) 

1021 

1022 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

1023 

1024 @timeMethod 

1025 def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None, 

1026 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", group_files: bool = True, 

1027 skip_existing_exposures: bool = False, update_exposure_records: bool = False): 

1028 """Ingest files into a Butler data repository. 

1029 

1030 This creates any new exposure or visit Dimension entries needed to 

1031 identify the ingested files, creates new Dataset entries in the 

1032 Registry and finally ingests the files themselves into the Datastore. 

1033 Any needed instrument, detector, and physical_filter Dimension entries 

1034 must exist in the Registry before `run` is called. 

1035 

1036 Parameters 

1037 ---------- 

1038 files : iterable over `ButlerURI`, `str` or path-like objects 

1039 Paths to the files to be ingested. Can refer to directories. 

1040 Will be made absolute if they are not already. 

1041 pool : `multiprocessing.Pool`, optional 

1042 If not `None`, a process pool with which to parallelize some 

1043 operations. 

1044 processes : `int`, optional 

1045 The number of processes to use. Ignored if ``pool`` is not `None`. 

1046 run : `str`, optional 

1047 Name of a RUN-type collection to write to, overriding 

1048 the default derived from the instrument name. 

1049 file_filter : `str` or `re.Pattern`, optional 

1050 Pattern to use to discover files to ingest within directories. 

1051 The default is to search for FITS files. The regex applies to 

1052 files within the directory. 

1053 group_files : `bool`, optional 

1054 Group files by directory if they have been discovered in 

1055 directories. Will not affect files explicitly provided. 

1056 skip_existing_exposures : `bool`, optional 

1057 If `True` (`False` is default), skip raws that have already been 

1058 ingested (i.e. raws for which we already have a dataset with the 

1059 same data ID in the target collection, even if from another file). 

1060 Note that this is much slower than just not passing 

1061 already-ingested files as inputs, because we still need to read and 

1062 process metadata to identify which exposures to search for. It 

1063 also will not work reliably if multiple processes are attempting to 

1064 ingest raws from the same exposure concurrently, in that different 

1065 processes may still attempt to ingest the same raw and conflict, 

1066 causing a failure that prevents other raws from the same exposure 

1067 from being ingested. 

1068 update_exposure_records : `bool`, optional 

1069 If `True` (`False` is default), update existing exposure records 

1070 that conflict with the new ones instead of rejecting them. THIS IS 

1071 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1072 KNOWN TO BE BAD. This should usually be combined with 

1073 ``skip_existing_exposures=True``. 

1074 

1075 Returns 

1076 ------- 

1077 refs : `list` of `lsst.daf.butler.DatasetRef` 

1078 Dataset references for ingested raws. 

1079 

1080 Notes 

1081 ----- 

1082 This method inserts all datasets for an exposure within a transaction, 

1083 guaranteeing that partial exposures are never ingested. The exposure 

1084 dimension record is inserted with `Registry.syncDimensionData` first 

1085 (in its own transaction), which inserts only if a record with the same 

1086 primary key does not already exist. This allows different files within 

1087 the same exposure to be ingested in different runs. 
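        Examples
        --------
        A minimal sketch; the directory and run collection name are
        illustrative:

        .. code-block:: python

            refs = task.run(["/data/raw/2021-05-12/"], run="MyCam/raw/all",
                            processes=4)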

1088 """ 

1089 

1090 refs = [] 

1091 bad_files = [] 

1092 n_exposures = 0 

1093 n_exposures_failed = 0 

1094 n_ingests_failed = 0 

1095 if group_files: 

1096 for group in ButlerURI.findFileResources(files, file_filter, group_files): 

1097 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles( 

1098 group, 

1099 pool=pool, 

1100 processes=processes, 

1101 run=run, 

1102 skip_existing_exposures=skip_existing_exposures, 

1103 update_exposure_records=update_exposure_records, 

1104 ) 

1105 refs.extend(new_refs) 

1106 bad_files.extend(bad) 

1107 n_exposures += n_exp 

1108 n_exposures_failed += n_exp_fail 

1109 n_ingests_failed += n_ingest_fail 

1110 else: 

1111 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

1112 ButlerURI.findFileResources(files, file_filter, group_files), 

1113 pool=pool, 

1114 processes=processes, 

1115 run=run, 

1116 skip_existing_exposures=skip_existing_exposures, 

1117 update_exposure_records=update_exposure_records, 

1118 ) 

1119 

1120 had_failure = False 

1121 

1122 if bad_files: 

1123 had_failure = True 

1124 self.log.warning("Could not extract observation metadata from the following:") 

1125 for f in bad_files: 

1126 self.log.warning("- %s", f) 

1127 

1128 self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1129 " registration and %d failure%s from file ingest.", 

1130 *_log_msg_counter(n_exposures), 

1131 *_log_msg_counter(n_exposures_failed), 

1132 *_log_msg_counter(n_ingests_failed)) 

1133 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1134 had_failure = True 

1135 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1136 

1137 if had_failure: 

1138 raise RuntimeError("Some failures encountered during ingestion") 

1139 

1140 return refs