1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import os.path 

26from dataclasses import dataclass, InitVar 

27from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union 

28from collections import defaultdict 

29from multiprocessing import Pool 

30 

31from astro_metadata_translator import ObservationInfo, merge_headers 

32from astro_metadata_translator.indexing import read_sidecar, read_index 

33from lsst.afw.fits import readMetadata 

34from lsst.daf.butler import ( 

35 Butler, 

36 CollectionType, 

37 DataCoordinate, 

38 DatasetRef, 

39 DatasetType, 

40 DimensionRecord, 

41 DimensionUniverse, 

42 FileDataset, 

43 Formatter, 

44) 

45from lsst.pex.config import Config, ChoiceField, Field 

46from lsst.pipe.base import Task, timeMethod 

47 

48from ._instrument import Instrument, makeExposureRecordFromObsInfo 

49from ._fitsRawFormatterBase import FitsRawFormatterBase 

50 

51 

52def _do_nothing(*args, **kwargs) -> None: 

53 """Do nothing. 

54 

55 This is a function that accepts anything and does nothing. 

56 For use as a default in callback arguments. 

57 """ 

58 pass 

59 

60 

61def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]: 

62 """Count the iterable and return the count and plural modifier. 

63 

64 Parameters 

65 ---------- 

66 noun : Iterable or `int` 

67 Thing to count. If given an integer it is assumed to be the count 

68 to use when calculating the modifier.

69 

70 Returns 

71 ------- 

72 num : `int` 

73 Number of items found in ``noun``. 

74 modifier : `str` 

75 Character to add to the end of a string referring to these items 

76 to indicate whether it was a single item or not. Returns empty 

77 string if there is one item or "s" otherwise. 

78 

79 Examples 

80 -------- 

81 

82 .. code-block:: python 

83 

84 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

85 """ 

86 if isinstance(noun, int): 

87 num = noun 

88 else: 

89 num = len(noun) 

90 return num, "" if num == 1 else "s" 

91 

92 

93@dataclass 

94class RawFileDatasetInfo: 

95 """Information about a single dataset within a raw file.""" 

96 

97 dataId: DataCoordinate 

98 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

99 

100 obsInfo: ObservationInfo 

101 """Standardized observation metadata extracted directly from the file 

102 headers (`astro_metadata_translator.ObservationInfo`). 

103 """ 

104 

105 

106@dataclass 

107class RawFileData: 

108 """Information about a single raw file, used during ingest.""" 

109 

110 datasets: List[RawFileDatasetInfo] 

111 """The information describing each dataset within this raw file. 

112 (`list` of `RawFileDatasetInfo`) 

113 """ 

114 

115 filename: str 

116 """Name of the file this information was extracted from (`str`). 

117 

118 This is the path prior to ingest, not the path after ingest. 

119 """ 

120 

121 FormatterClass: Type[FitsRawFormatterBase] 

122 """Formatter class that should be used to ingest this file (`type`; as 

123 subclass of `FitsRawFormatterBase`). 

124 """ 

125 

126 instrumentClass: Optional[Type[Instrument]] 

127 """The `Instrument` class associated with this file. Can be `None` 

128 if ``datasets`` is an empty list.""" 

129 

130 

131@dataclass 

132class RawExposureData: 

133 """Information about a complete raw exposure, used during ingest.""" 

134 

135 dataId: DataCoordinate 

136 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

137 """ 

138 

139 files: List[RawFileData] 

140 """List of structures containing file-level information. 

141 """ 

142 

143 universe: InitVar[DimensionUniverse] 

144 """Set of all known dimensions. 

145 """ 

146 

147 record: Optional[DimensionRecord] = None 

148 """The exposure `DimensionRecord` that must be inserted into the 

149 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`). 

150 """ 

151 

152 def __post_init__(self, universe: DimensionUniverse): 

153 # We don't care which file or dataset we read metadata from, because 

154 # we're assuming they'll all be the same; just use the first ones. 

155 self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe) 

156 

157 

158def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"): 

159 """Create a Config field with options for transferring data between repos. 

160 

161 The allowed options for the field are exactly those supported by 

162 `lsst.daf.butler.Datastore.ingest`. 

163 

164 Parameters 

165 ---------- 

166 doc : `str`

167 Documentation for the configuration field.

default : `str`, optional

Default transfer mode for the created field ("auto" if not given).

168 

169 Returns 

170 ------- 

171 field : `lsst.pex.config.ChoiceField` 

172 Configuration field. 
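
Examples
--------

A minimal sketch of using this helper inside a config class, mirroring
`RawIngestConfig` below (the class name ``MyIngestConfig`` is illustrative
only and the package-level import is assumed):

.. code-block:: python

    from lsst.obs.base import makeTransferChoiceField
    from lsst.pex.config import Config

    class MyIngestConfig(Config):
        # A ChoiceField restricted to the datastore ingest transfer modes.
        transfer = makeTransferChoiceField(default="symlink")

    config = MyIngestConfig()
    config.transfer = "copy"  # any allowed choice, or None for no transfer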

173 """ 

174 return ChoiceField( 

175 doc=doc, 

176 dtype=str, 

177 allowed={"move": "move", 

178 "copy": "copy", 

179 "auto": "choice will depend on datastore", 

180 "direct": "use URI to ingested file directly in datastore", 

181 "link": "hard link falling back to symbolic link", 

182 "hardlink": "hard link", 

183 "symlink": "symbolic (soft) link", 

184 "relsymlink": "relative symbolic link", 

185 }, 

186 optional=True, 

187 default=default 

188 ) 

189 

190 

191class RawIngestConfig(Config): 

192 """Configuration class for RawIngestTask.""" 

193 

194 transfer = makeTransferChoiceField() 

195 failFast = Field( 

196 dtype=bool, 

197 default=False, 

198 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

199 "Otherwise problem files will be skipped and logged and a report issued at completion.",

200 ) 

201 

202 

203class RawIngestTask(Task): 

204 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

205 

206 Parameters 

207 ---------- 

208 config : `RawIngestConfig` 

209 Configuration for the task. 

210 butler : `~lsst.daf.butler.Butler` 

211 Writeable butler instance, with ``butler.run`` set to the appropriate 

212 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

213 datasets. 

214 on_success : `Callable`, optional 

215 A callback invoked when all of the raws associated with an exposure 

216 are ingested. Will be passed a list of `FileDataset` objects, each 

217 containing one or more resolved `DatasetRef` objects. If this callback 

218 raises an exception, it will interrupt the entire ingest process, even if

219 `RawIngestConfig.failFast` is `False`. 

220 on_metadata_failure : `Callable`, optional 

221 A callback invoked when a failure occurs trying to translate the 

222 metadata for a file. Will be passed the filename and the exception, in 

223 that order, as positional arguments. Guaranteed to be called in an 

224 ``except`` block, allowing the callback to re-raise or replace (with 

225 ``raise ... from``) to override the task's usual error handling (before 

226 `RawIngestConfig.failFast` logic occurs). 

227 on_ingest_failure : `Callable`, optional 

228 A callback invoked when dimension record or dataset insertion into the 

229 database fails for an exposure. Will be passed a `RawExposureData` 

230 instance and the exception, in that order, as positional arguments. 

231 Guaranteed to be called in an ``except`` block, allowing the callback 

232 to re-raise or replace (with ``raise ... from``) to override the task's 

233 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

234 **kwargs 

235 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

236 constructor. 

237 

238 Notes 

239 ----- 

240 Each instance of `RawIngestTask` writes to a single Butler repository. Each

241 invocation of `RawIngestTask.run` ingests a list of files.
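
Examples
--------

A minimal sketch of driving the task directly; the repository path, file
names and imports are placeholders, and a writeable butler plus
pre-registered instrument, detector, and physical_filter dimension records
are assumed:

.. code-block:: python

    from lsst.daf.butler import Butler
    from lsst.obs.base import RawIngestConfig, RawIngestTask

    butler = Butler("/path/to/repo", writeable=True)
    config = RawIngestConfig()
    config.transfer = "symlink"
    task = RawIngestTask(config=config, butler=butler)
    refs = task.run(["raw_0001.fits", "raw_0002.fits"])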

242 """ 

243 

244 ConfigClass = RawIngestConfig 

245 

246 _DefaultName = "ingest" 

247 

248 def getDatasetType(self): 

249 """Return the DatasetType of the datasets ingested by this Task.""" 

250 return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure", 

251 universe=self.butler.registry.dimensions) 

252 

253 def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, 

254 on_success: Callable[[List[FileDataset]], Any] = _do_nothing, 

255 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing, 

256 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

257 **kwargs: Any): 

258 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

259 super().__init__(config, **kwargs) 

260 self.butler = butler 

261 self.universe = self.butler.registry.dimensions 

262 self.datasetType = self.getDatasetType() 

263 self._on_success = on_success 

264 self._on_metadata_failure = on_metadata_failure 

265 self._on_ingest_failure = on_ingest_failure 

266 

267 # Import all the instrument classes so that we ensure that we 

268 # have all the relevant metadata translators loaded. 

269 Instrument.importAll(self.butler.registry) 

270 

271 def _reduce_kwargs(self): 

272 # Add extra parameters to pickle. 

273 return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success, 

274 on_metadata_failure=self._on_metadata_failure, on_ingest_failure=self._on_ingest_failure) 

275 

276 def _determine_instrument_formatter(self, dataId, filename): 

277 """Determine the instrument and formatter class. 

278 

279 Parameters 

280 ---------- 

281 dataId : `lsst.daf.butler.DataCoordinate` 

282 The dataId associated with this dataset. 

283 filename : `str` 

284 Filename used for error reporting. 

285 

286 Returns 

287 ------- 

288 instrument : `Instrument` or `None` 

289 Instance of the `Instrument` associated with this dataset. `None` 

290 indicates that the instrument could not be determined. 

291 formatterClass : `type` 

292 Class to be used as the formatter for this dataset. 

293 """ 

294 # The data model currently assumes that whilst multiple datasets 

295 # can be associated with a single file, they must all share the 

296 # same formatter. 

297 try: 

298 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) 

299 except LookupError as e: 

300 self._on_metadata_failure(filename, e) 

301 self.log.warning("Instrument %s for file %s not known to registry", 

302 dataId["instrument"], filename) 

303 if self.config.failFast: 

304 raise RuntimeError(f"Instrument {dataId['instrument']} for" 

305 f" file {filename} not known to registry") from e 

306 FormatterClass = Formatter 

307 # Indicate that we could not work out the instrument. 

308 instrument = None 

309 else: 

310 FormatterClass = instrument.getRawFormatter(dataId) 

311 return instrument, FormatterClass 

312 

313 def extractMetadata(self, filename: str) -> RawFileData: 

314 """Extract and process metadata from a single raw file. 

315 

316 Parameters 

317 ---------- 

318 filename : `str` 

319 Path to the file. 

320 

321 Returns 

322 ------- 

323 data : `RawFileData` 

324 A structure containing the metadata extracted from the file, 

325 as well as the original filename. All fields will be populated, 

326 but the data IDs in ``RawFileData.datasets`` will be minimal

327 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. The

328 ``instrumentClass`` field will be `None` if there is a problem 

329 with metadata extraction. 

330 

331 Notes 

332 ----- 

333 Assumes that there is a single dataset associated with the given 

334 file. Instruments using a single file to store multiple datasets 

335 must implement their own version of this method. 

336 

337 By default the method will catch all exceptions unless the ``failFast`` 

338 configuration item is `True`. If an error is encountered the 

339 `_on_metadata_failure()` method will be called. If an error was

340 encountered but no exception was raised, the returned object will have

341 a `None` instrument class and no datasets.

342 

343 This method supports sidecar JSON files which can be used to 

344 extract metadata without having to read the data file itself. 

345 The sidecar file is always used if found. 
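
Examples
--------

The sidecar lookup simply swaps the file extension for ``.json``; a sketch
of that check (paths are illustrative):

.. code-block:: python

    import os.path

    filename = "/data/raws/raw_0001.fits"
    root, ext = os.path.splitext(filename)
    sidecar_file = root + ".json"  # "/data/raws/raw_0001.json"
    use_sidecar = os.path.exists(sidecar_file)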

346 """ 

347 sidecar_fail_msg = "" # Requires prepended space when set. 

348 try: 

349 root, ext = os.path.splitext(filename) 

350 sidecar_file = root + ".json" 

351 if os.path.exists(sidecar_file): 

352 header = read_sidecar(sidecar_file) 

353 sidecar_fail_msg = " (via sidecar)" 

354 else: 

355 # Read the metadata from the data file itself. 

356 # Manually merge the primary and "first data" headers here 

357 # because we do not know in general if an input file has 

358 # set INHERIT=T. 

359 phdu = readMetadata(filename, 0) 

360 header = merge_headers([phdu, readMetadata(filename)], mode="overwrite") 

361 datasets = [self._calculate_dataset_info(header, filename)] 

362 except Exception as e: 

363 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

364 # Indicate to the caller that we failed to read. 

365 datasets = [] 

366 formatterClass = Formatter 

367 instrument = None 

368 self._on_metadata_failure(filename, e) 

369 if self.config.failFast: 

370 raise RuntimeError("Problem extracting metadata for file " 

371 f"{filename}{sidecar_fail_msg}") from e 

372 else: 

373 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

374 # The data model currently assumes that whilst multiple datasets 

375 # can be associated with a single file, they must all share the 

376 # same formatter. 

377 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

378 if instrument is None: 

379 datasets = [] 

380 

381 return RawFileData(datasets=datasets, filename=filename, 

382 FormatterClass=formatterClass, 

383 instrumentClass=instrument) 

384 

385 def _calculate_dataset_info(self, header, filename): 

386 """Calculate a RawFileDatasetInfo from the supplied information. 

387 

388 Parameters 

389 ---------- 

390 header : Mapping or `astro_metadata_translator.ObservationInfo` 

391 Header from the dataset or previously-translated content. 

392 filename : `str` 

393 Filename to use for error messages. 

394 

395 Returns 

396 ------- 

397 dataset : `RawFileDatasetInfo` 

398 The dataId, and observation information associated with this 

399 dataset. 

400 """ 

401 # To ensure we aren't slowed down for no reason, explicitly 

402 # list here the properties we need for the schema. 

403 # Use a dict with values a boolean where True indicates 

404 # that it is required that we calculate this property. 

405 ingest_subset = { 

406 "altaz_begin": False, 

407 "boresight_rotation_coord": False, 

408 "boresight_rotation_angle": False, 

409 "dark_time": False, 

410 "datetime_begin": True, 

411 "datetime_end": True, 

412 "detector_num": True, 

413 "exposure_group": False, 

414 "exposure_id": True, 

415 "exposure_time": True, 

416 "instrument": True, 

417 "tracking_radec": False, 

418 "object": False, 

419 "observation_counter": False, 

420 "observation_id": True, 

421 "observation_reason": False, 

422 "observation_type": True, 

423 "observing_day": False, 

424 "physical_filter": True, 

425 "science_program": False, 

426 "visit_id": False, 

427 } 

428 

429 if isinstance(header, ObservationInfo): 

430 obsInfo = header 

431 missing = [] 

432 # Need to check the required properties are present. 

433 for property, required in ingest_subset.items(): 

434 if not required: 

435 continue 

436 # getattr does not need to be protected because it is using 

437 # the defined list above containing properties that must exist. 

438 value = getattr(obsInfo, property) 

439 if value is None: 

440 missing.append(property) 

441 if missing: 

442 raise ValueError(f"Requested required properties are missing from file {filename}:" 

443 f" {missing} (via JSON)") 

444 

445 else: 

446 obsInfo = ObservationInfo(header, pedantic=False, filename=filename, 

447 required={k for k in ingest_subset if ingest_subset[k]}, 

448 subset=set(ingest_subset)) 

449 

450 dataId = DataCoordinate.standardize(instrument=obsInfo.instrument, 

451 exposure=obsInfo.exposure_id, 

452 detector=obsInfo.detector_num, 

453 universe=self.universe) 

454 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

455 

456 def locateAndReadIndexFiles(self, files): 

457 """Given a list of files, look for index files and read them. 

458 

459 Index files can either be explicitly in the list of files to 

460 ingest, or else located in the same directory as a file to ingest. 

461 Index entries are always used if present. 

462 

463 Parameters 

464 ---------- 

465 files : iterable over `str` or path-like objects 

466 Paths to the files to be ingested. Will be made absolute 

467 if they are not already. 

468 

469 Returns 

470 ------- 

471 index : `dict` [`str`, Any] 

472 Merged contents of all relevant index files found. These can 

473 be explicitly specified index files or ones found in the 

474 directory alongside a data file to be ingested. 

475 updated_files : iterable of `str` 

476 Updated list of the input files with entries removed that were 

477 found listed in an index file. Order is not guaranteed to 

478 match the order of the files given to this routine. 

good_index_files : `set` [`str`]

Index files that were successfully read and used.

479 bad_index_files : `set` [`str`]

480 Files that looked like index files but failed to read properly.
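
Examples
--------

A sketch of the expected call pattern, mirroring how `prep` uses this
method (``task`` is an existing `RawIngestTask`; paths are placeholders):

.. code-block:: python

    files = ["/data/raws/_index.json", "/data/raws/raw_0001.fits"]
    index_entries, files, good, bad = task.locateAndReadIndexFiles(files)
    for failed in sorted(bad):
        print("Could not read index file:", failed)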

481 """ 

482 # Convert the paths to absolute for easy comparison with index content. 

483 # Do not convert to real paths since we have to assume that index 

484 # files are in this location and not the location which it links to. 

485 files = tuple(os.path.abspath(f) for f in files) 

486 

487 # Index files must be named this. 

488 index_root_file = "_index.json" 

489 

490 # Group the files by directory. 

491 files_by_directory = defaultdict(set) 

492 

493 for path in files: 

494 directory, file_in_dir = os.path.split(path) 

495 files_by_directory[directory].add(file_in_dir) 

496 

497 # All the metadata read from index files with keys of full path. 

498 index_entries = {} 

499 

500 # Index files we failed to read. 

501 bad_index_files = set() 

502 

503 # Any good index files that were found and used. 

504 good_index_files = set() 

505 

506 # Look for index files in those directories. 

507 for directory, files_in_directory in files_by_directory.items(): 

508 possible_index_file = os.path.join(directory, index_root_file) 

509 if os.path.exists(possible_index_file): 

510 # If we are explicitly requesting an index file the 

511 # messages should be different. 

512 index_msg = "inferred" 

513 is_implied = True 

514 if index_root_file in files_in_directory: 

515 index_msg = "explicit" 

516 is_implied = False 

517 

518 # Try to read the index file and catch and report any 

519 # problems. 

520 try: 

521 index = read_index(possible_index_file, force_dict=True) 

522 except Exception as e: 

523 # Only trigger the callback if the index file 

524 # was asked for explicitly. Triggering on implied file 

525 # might be surprising. 

526 if not is_implied: 

527 self._on_metadata_failure(possible_index_file, e) 

528 if self.config.failFast: 

529 raise RuntimeError(f"Problem reading index file from {index_msg} " 

530 f"location {possible_index_file}") from e 

531 bad_index_files.add(possible_index_file) 

532 continue 

533 

534 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

535 good_index_files.add(possible_index_file) 

536 

537 # Go through the index adding entries for files. 

538 # If we have non-index files in this directory marked for 

539 # ingest we should only get index information for those. 

540 # If the index file was explicit we use all entries. 

541 if is_implied: 

542 files_to_ingest = files_in_directory 

543 else: 

544 files_to_ingest = set(index) 

545 

546 # Copy relevant metadata into a single dict for all index 

547 # entries. 

548 for file_in_dir in files_to_ingest: 

549 # Skip an explicitly specified index file. 

550 # This should never happen because an explicit index 

551 # file will force ingest of all files in the index 

552 # and not use the explicit file list. If somehow 

553 # this is not true we continue. Raising an exception 

554 # seems like the wrong thing to do since this is harmless. 

555 if file_in_dir == index_root_file: 

556 self.log.info("Logic error found scanning directory %s. Please file ticket.", 

557 directory) 

558 continue 

559 if file_in_dir in index: 

560 file = os.path.abspath(os.path.join(directory, file_in_dir)) 

561 if file in index_entries: 

562 # ObservationInfo overrides raw metadata 

563 if isinstance(index[file_in_dir], ObservationInfo) \ 

564 and not isinstance(index_entries[file], ObservationInfo): 

565 self.log.warning("File %s already specified in an index file but overriding" 

566 " with ObservationInfo content from %s", 

567 file, possible_index_file) 

568 else: 

569 self.log.warning("File %s already specified in an index file, " 

570 "ignoring content from %s", file, possible_index_file) 

571 # Do nothing in this case 

572 continue 

573 

574 index_entries[file] = index[file_in_dir] 

575 

576 # Remove files from list that have index entries and also 

577 # any files that we determined to be explicit index files 

578 # or any index files that we failed to read. 

579 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

580 

581 # The filtered list loses the initial order. Retaining the order 

582 # is good for testing but does have a cost if there are many 

583 # files when copying the good values out. A dict would have faster 

584 # lookups (using the files as keys) but use more memory. 

585 ordered = [f for f in filtered if f in files] 

586 

587 return index_entries, ordered, good_index_files, bad_index_files 

588 

589 def processIndexEntries(self, index_entries): 

590 """Convert index entries to RawFileData. 

591 

592 Parameters 

593 ---------- 

594 index_entries : `dict` [`str`, Any] 

595 Dict indexed by name of file to ingest and with values either

596 raw metadata or translated 

597 `~astro_metadata_translator.ObservationInfo`. 

598 

599 Returns 

600 ------- 

601 data : `list` of `RawFileData`

602 Structures containing the metadata extracted from each file,

603 as well as the original filenames. All fields will be populated,

604 but the data IDs in ``RawFileData.datasets`` will be minimal

605 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
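
Examples
--------

A sketch of combining this method with `locateAndReadIndexFiles`, as
`prep` does (``task`` is an existing `RawIngestTask`):

.. code-block:: python

    index_entries, files, good, bad = task.locateAndReadIndexFiles(files)
    index_file_data = task.processIndexEntries(index_entries)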

606 """ 

607 fileData = [] 

608 for filename, metadata in index_entries.items(): 

609 try: 

610 datasets = [self._calculate_dataset_info(metadata, filename)] 

611 except Exception as e: 

612 self.log.debug("Problem extracting metadata for file %s found in index file: %s", 

613 filename, e) 

614 datasets = [] 

615 formatterClass = Formatter 

616 instrument = None 

617 self._on_metadata_failure(filename, e) 

618 if self.config.failFast: 

619 raise RuntimeError(f"Problem extracting metadata for file {filename} " 

620 "found in index file") from e 

621 else: 

622 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, 

623 filename) 

624 if instrument is None: 

625 datasets = [] 

626 fileData.append(RawFileData(datasets=datasets, filename=filename, 

627 FormatterClass=formatterClass, instrumentClass=instrument)) 

628 return fileData 

629 

630 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]: 

631 """Group an iterable of `RawFileData` by exposure. 

632 

633 Parameters 

634 ---------- 

635 files : iterable of `RawFileData` 

636 File-level information to group. 

637 

638 Returns 

639 ------- 

640 exposures : `list` of `RawExposureData` 

641 A list of structures that group the file-level information by 

642 exposure. All fields will be populated. The 

643 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

644 `~lsst.daf.butler.DataCoordinate` instances. 
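
Examples
--------

A sketch of the data flow from metadata extraction to exposure grouping,
as done in `prep` (``task`` is an existing `RawIngestTask`; file names are
placeholders):

.. code-block:: python

    file_data = [task.extractMetadata(f)
                 for f in ("raw_0001.fits", "raw_0002.fits")]
    exposures = task.groupByExposure(file_data)
    for exposure in exposures:
        print(exposure.dataId, len(exposure.files))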

645 """ 

646 exposureDimensions = self.universe["exposure"].graph 

647 byExposure = defaultdict(list) 

648 for f in files: 

649 # Assume that the first dataset is representative for the file. 

650 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

651 

652 return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe) 

653 for dataId, exposureFiles in byExposure.items()] 

654 

655 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

656 """Expand the data IDs associated with a raw exposure. 

657 

658 This adds the metadata records. 

659 

660 Parameters 

661 ---------- 

662 data : `RawExposureData`

663 A structure containing information about the exposure to be

664 ingested. Must have `RawExposureData.record` populated. Should

665 be considered consumed upon return. 

666 

667 Returns 

668 ------- 

669 exposure : `RawExposureData` 

670 An updated version of the input structure, with 

671 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

672 updated to data IDs for which 

673 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 
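
Examples
--------

A sketch of expanding every grouped exposure in turn, equivalent to the
``mapFunc`` call in `prep`:

.. code-block:: python

    exposures = task.groupByExposure(file_data)
    exposures = [task.expandDataIds(exposure) for exposure in exposures]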

674 """ 

675 # We start by expanding the exposure-level data ID; we won't use that

676 # directly in file ingest, but this lets us do some database lookups 

677 # once per exposure instead of once per file later. 

678 data.dataId = self.butler.registry.expandDataId( 

679 data.dataId, 

680 # We pass in the records we'll be inserting shortly so they aren't 

681 # looked up from the database. We do expect instrument and filter 

682 # records to be retrieved from the database here (though the 

683 # Registry may cache them so there isn't a lookup every time). 

684 records={ 

685 self.butler.registry.dimensions["exposure"]: data.record, 

686 } 

687 ) 

688 # Now we expand the per-file (exposure+detector) data IDs. This time 

689 # we pass in the records we just retrieved from the exposure data ID 

690 # expansion. 

691 for file in data.files: 

692 for dataset in file.datasets: 

693 dataset.dataId = self.butler.registry.expandDataId( 

694 dataset.dataId, 

695 records=dict(data.dataId.records) 

696 ) 

697 return data 

698 

699 def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1 

700 ) -> Tuple[Iterator[RawExposureData], List[str]]: 

701 """Perform all non-database-updating ingest preprocessing steps. 

702 

703 Parameters 

704 ---------- 

705 files : iterable over `str` or path-like objects 

706 Paths to the files to be ingested. Will be made absolute 

707 if they are not already. 

708 pool : `multiprocessing.Pool`, optional 

709 If not `None`, a process pool with which to parallelize some 

710 operations. 

711 processes : `int`, optional 

712 The number of processes to use. Ignored if ``pool`` is not `None`. 

713 

714 Returns 

715 ------- 

716 exposures : `Iterator` [ `RawExposureData` ] 

717 Data structures containing dimension records, filenames, and data 

718 IDs to be ingested (one structure for each exposure). 

719 bad_files : `list` of `str` 

720 List of all the files that could not have metadata extracted. 
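
Examples
--------

A sketch of consuming the returned iterator, similar to what `run` does
internally (``task`` is an existing `RawIngestTask`):

.. code-block:: python

    exposure_data, bad_files = task.prep(files, processes=4)
    for exposure in exposure_data:
        # Data IDs are already expanded and the exposure record populated.
        print(exposure.record.obs_id, len(exposure.files))
    if bad_files:
        print("Metadata extraction failed for:", bad_files)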

721 """ 

722 if pool is None and processes > 1: 

723 pool = Pool(processes) 

724 mapFunc = map if pool is None else pool.imap_unordered 

725 

726 def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]: 

727 """Filter out bad files and return good with list of bad.""" 

728 good_files = [] 

729 bad_files = [] 

730 for fileDatum in file_data: 

731 if not fileDatum.datasets: 

732 bad_files.append(fileDatum.filename) 

733 else: 

734 good_files.append(fileDatum) 

735 return good_files, bad_files 

736 

737 # Look for index files and read them. 

738 # There should be far fewer index files than data files. 

739 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

740 if bad_index_files: 

741 self.log.info("Failed to read the following explicitly requested index files:")

742 for bad in sorted(bad_index_files): 

743 self.log.info("- %s", bad) 

744 

745 # Now convert all the index file entries to standard form for ingest. 

746 bad_index_file_data = [] 

747 indexFileData = self.processIndexEntries(index_entries) 

748 if indexFileData: 

749 indexFileData, bad_index_file_data = _partition_good_bad(indexFileData) 

750 self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s" 

751 " with %d failure%s", 

752 *_log_msg_counter(indexFileData), 

753 *_log_msg_counter(good_index_files), 

754 *_log_msg_counter(bad_index_file_data)) 

755 

756 # Extract metadata and build per-detector regions. 

757 # This could run in a subprocess so collect all output 

758 # before looking at failures. 

759 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

760 

761 # Filter out all the failed reads and store them for later 

762 # reporting. 

763 fileData, bad_files = _partition_good_bad(fileData) 

764 self.log.info("Successfully extracted metadata from %d file%s with %d failure%s", 

765 *_log_msg_counter(fileData), 

766 *_log_msg_counter(bad_files)) 

767 

768 # Combine with data from index files. 

769 fileData.extend(indexFileData) 

770 bad_files.extend(bad_index_file_data) 

771 bad_files.extend(bad_index_files) 

772 

773 # Use that metadata to group files (and extracted metadata) by 

774 # exposure. Never parallelized because it's intrinsically a gather 

775 # step. 

776 exposureData: List[RawExposureData] = self.groupByExposure(fileData) 

777 

778 # The next operation operates on RawExposureData instances (one at 

779 # a time) in-place and then returns the modified instance. We call it 

780 # as a pass-through instead of relying on the arguments we pass in to 

781 # have been modified because in the parallel case those arguments are 

782 # going to be pickled and unpickled, and I'm not certain 

783 # multiprocessing is careful enough with that for output arguments to 

784 # work. 

785 

786 # Expand the data IDs to include all dimension metadata; we need this 

787 # because we may need to generate path templates that rely on that 

788 # metadata. 

789 # This is the first step that involves actual database calls (but just 

790 # SELECTs), so if there's going to be a problem with connections vs. 

791 # multiple processes, or lock contention (in SQLite) slowing things 

792 # down, it'll happen here. 

793 return mapFunc(self.expandDataIds, exposureData), bad_files 

794 

795 def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None 

796 ) -> List[FileDataset]: 

797 """Ingest all raw files in one exposure. 

798 

799 Parameters 

800 ---------- 

801 exposure : `RawExposureData` 

802 A structure containing information about the exposure to be 

803 ingested. Must have `RawExposureData.record` populated and all

804 data ID attributes expanded. 

805 run : `str`, optional 

806 Name of a RUN-type collection to write to, overriding 

807 ``self.butler.run``. 

808 

809 Returns 

810 ------- 

811 datasets : `list` of `lsst.daf.butler.FileDataset` 

812 Per-file structures identifying the files ingested and their 

813 dataset representation in the data repository. 
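
Examples
--------

A sketch of ingesting a single prepared exposure into an explicit RUN
collection, as `run` does for each exposure (the run name is a
placeholder):

.. code-block:: python

    from lsst.daf.butler import CollectionType

    this_run = "MyCam/raw/all"
    task.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
    with task.butler.transaction():
        datasets = task.ingestExposureDatasets(exposure, run=this_run)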

814 """ 

815 datasets = [FileDataset(path=os.path.abspath(file.filename), 

816 refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets], 

817 formatter=file.FormatterClass) 

818 for file in exposure.files] 

819 self.butler.ingest(*datasets, transfer=self.config.transfer, run=run) 

820 return datasets 

821 

822 @timeMethod 

823 def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None): 

824 """Ingest files into a Butler data repository. 

825 

826 This creates any new exposure or visit Dimension entries needed to 

827 identify the ingested files, creates new Dataset entries in the 

828 Registry and finally ingests the files themselves into the Datastore. 

829 Any needed instrument, detector, and physical_filter Dimension entries 

830 must exist in the Registry before `run` is called. 

831 

832 Parameters 

833 ---------- 

834 files : iterable over `str` or path-like objects 

835 Paths to the files to be ingested. Will be made absolute 

836 if they are not already. 

837 pool : `multiprocessing.Pool`, optional 

838 If not `None`, a process pool with which to parallelize some 

839 operations. 

840 processes : `int`, optional 

841 The number of processes to use. Ignored if ``pool`` is not `None`. 

842 run : `str`, optional 

843 Name of a RUN-type collection to write to, overriding 

844 the default derived from the instrument name. 

845 

846 Returns 

847 ------- 

848 refs : `list` of `lsst.daf.butler.DatasetRef` 

849 Dataset references for ingested raws. 

850 

851 Notes 

852 ----- 

853 This method inserts all datasets for an exposure within a transaction, 

854 guaranteeing that partial exposures are never ingested. The exposure 

855 dimension record is inserted with `Registry.syncDimensionData` first 

856 (in its own transaction), which inserts only if a record with the same 

857 primary key does not already exist. This allows different files within 

858 the same exposure to be ingested in different runs.
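
Examples
--------

A sketch of a typical invocation (paths and run name are placeholders;
``task`` is a `RawIngestTask` constructed with a writeable butler):

.. code-block:: python

    import glob

    files = sorted(glob.glob("/data/raws/*.fits"))
    # Raises RuntimeError at the end if anything failed to ingest;
    # exposures that succeeded have still been committed.
    refs = task.run(files, processes=4, run="MyCam/raw/all")
    print(f"Ingested {len(refs)} datasets")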

859 """ 

860 exposureData, bad_files = self.prep(files, pool=pool, processes=processes) 

861 # Up to this point, we haven't modified the data repository at all. 

862 # Now we finally do that, with one transaction per exposure. This is 

863 # not parallelized at present because the performance of this step is 

864 # limited by the database server. That may or may not change in the 

865 # future once we increase our usage of bulk inserts and reduce our 

866 # usage of savepoints; we've tried to get everything but the database 

867 # operations done in advance to reduce the time spent inside 

868 # transactions. 

869 self.butler.registry.registerDatasetType(self.datasetType) 

870 refs = [] 

871 runs = set() 

872 n_exposures = 0 

873 n_exposures_failed = 0 

874 n_ingests_failed = 0 

875 for exposure in exposureData: 

876 

877 self.log.debug("Attempting to ingest %d file%s from exposure %s:%s", 

878 *_log_msg_counter(exposure.files), 

879 exposure.record.instrument, exposure.record.obs_id) 

880 

881 try: 

882 self.butler.registry.syncDimensionData("exposure", exposure.record) 

883 except Exception as e: 

884 self._on_ingest_failure(exposure, e) 

885 n_exposures_failed += 1 

886 self.log.warning("Exposure %s:%s could not be registered: %s", 

887 exposure.record.instrument, exposure.record.obs_id, e) 

888 if self.config.failFast: 

889 raise e 

890 continue 

891 

892 # Override default run if nothing specified explicitly. 

893 if run is None: 

894 instrumentClass = exposure.files[0].instrumentClass 

895 this_run = instrumentClass.makeDefaultRawIngestRunName() 

896 else: 

897 this_run = run 

898 if this_run not in runs: 

899 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

900 runs.add(this_run) 

901 try: 

902 with self.butler.transaction(): 

903 datasets_for_exposure = self.ingestExposureDatasets(exposure, run=this_run) 

904 except Exception as e: 

905 self._on_ingest_failure(exposure, e) 

906 n_ingests_failed += 1 

907 self.log.warning("Failed to ingest the following for reason: %s", e) 

908 for f in exposure.files: 

909 self.log.warning("- %s", f.filename) 

910 if self.config.failFast: 

911 raise e 

912 continue 

913 else: 

914 self._on_success(datasets_for_exposure) 

915 for dataset in datasets_for_exposure: 

916 refs.extend(dataset.refs) 

917 

918 # Success for this exposure. 

919 n_exposures += 1 

920 self.log.info("Exposure %s:%s ingested successfully", 

921 exposure.record.instrument, exposure.record.obs_id) 

922 

923 had_failure = False 

924 

925 if bad_files: 

926 had_failure = True 

927 self.log.warning("Could not extract observation metadata from the following:") 

928 for f in bad_files: 

929 self.log.warning("- %s", f) 

930 

931 self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure" 

932 " registration and %d failure%s from file ingest.", 

933 *_log_msg_counter(n_exposures), 

934 *_log_msg_counter(n_exposures_failed), 

935 *_log_msg_counter(n_ingests_failed)) 

936 if n_exposures_failed > 0 or n_ingests_failed > 0: 

937 had_failure = True 

938 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

939 

940 if had_failure: 

941 raise RuntimeError("Some failures encountered during ingestion") 

942 

943 return refs