1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField") 

24 

25import json 

26import re 

27from dataclasses import dataclass, InitVar 

28from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union 

29from collections import defaultdict 

30from multiprocessing import Pool 

31 

32from astro_metadata_translator import ObservationInfo, merge_headers, MetadataTranslator 

33from astro_metadata_translator.indexing import process_sidecar_data, process_index_data 

34from lsst.afw.fits import readMetadata 

35from lsst.daf.butler import ( 

36 Butler, 

37 ButlerURI, 

38 CollectionType, 

39 DataCoordinate, 

40 DatasetIdGenEnum, 

41 DatasetRef, 

42 DatasetType, 

43 DimensionRecord, 

44 DimensionUniverse, 

45 FileDataset, 

46 Formatter, 

47 Progress, 

48) 

49from lsst.pex.config import Config, ChoiceField, Field 

50from lsst.pipe.base import Task, timeMethod 

51 

52from ._instrument import Instrument, makeExposureRecordFromObsInfo 

53from ._fitsRawFormatterBase import FitsRawFormatterBase 

54 

55 

56def _do_nothing(*args, **kwargs) -> None: 

57 """Do nothing. 

58 

59 This is a function that accepts anything and does nothing. 

60 For use as a default in callback arguments. 

61 """ 

62 pass 

63 

64 

65def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]: 

66 """Count the iterable and return the count and plural modifier. 

67 

68 Parameters 

69 ---------- 

70 noun : Iterable or `int` 

71 Thing to count. If given an integer, it is assumed to be the count

72 to use when calculating the modifier.

73 

74 Returns 

75 ------- 

76 num : `int` 

77 Number of items found in ``noun``. 

78 modifier : `str` 

79 Character to add to the end of a string referring to these items 

80 to indicate whether it was a single item or not. Returns empty 

81 string if there is one item or "s" otherwise. 

82 

83 Examples 

84 -------- 

85 

86 .. code-block:: python 

87 

88 log.warning("Found %d file%s", *_log_msg_counter(nfiles)) 

89 """ 

90 if isinstance(noun, int): 

91 num = noun 

92 else: 

93 num = len(noun) 

94 return num, "" if num == 1 else "s" 

95 

96 

97@dataclass 

98class RawFileDatasetInfo: 

99 """Information about a single dataset within a raw file.""" 

100 

101 dataId: DataCoordinate 

102 """Data ID for this file (`lsst.daf.butler.DataCoordinate`).""" 

103 

104 obsInfo: ObservationInfo 

105 """Standardized observation metadata extracted directly from the file 

106 headers (`astro_metadata_translator.ObservationInfo`). 

107 """ 

108 

109 

110@dataclass 

111class RawFileData: 

112 """Information about a single raw file, used during ingest.""" 

113 

114 datasets: List[RawFileDatasetInfo] 

115 """The information describing each dataset within this raw file. 

116 (`list` of `RawFileDatasetInfo`) 

117 """ 

118 

119 filename: ButlerURI 

120 """URI of the file this information was extracted from (`str`). 

121 

122 This is the path prior to ingest, not the path after ingest. 

123 """ 

124 

125 FormatterClass: Type[FitsRawFormatterBase] 

126 """Formatter class that should be used to ingest this file (`type`; as 

127 subclass of `FitsRawFormatterBase`). 

128 """ 

129 

130 instrument: Optional[Instrument] 

131 """The `Instrument` instance associated with this file. Can be `None` 

132 if ``datasets`` is an empty list.""" 

133 

134 

135@dataclass 

136class RawExposureData: 

137 """Information about a complete raw exposure, used during ingest.""" 

138 

139 dataId: DataCoordinate 

140 """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`). 

141 """ 

142 

143 files: List[RawFileData] 

144 """List of structures containing file-level information. 

145 """ 

146 

147 universe: InitVar[DimensionUniverse] 

148 """Set of all known dimensions. 

149 """ 

150 

151 record: Optional[DimensionRecord] = None 

152 """The exposure `DimensionRecord` that must be inserted into the 

153 `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`). 

154 """ 

155 

156 def __post_init__(self, universe: DimensionUniverse): 

157 # We don't care which file or dataset we read metadata from, because 

158 # we're assuming they'll all be the same; just use the first ones. 

159 self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe) 

160 

161 

162def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"): 

163 """Create a Config field with options for transferring data between repos. 

164 

165 The allowed options for the field are exactly those supported by 

166 `lsst.daf.butler.Datastore.ingest`. 

167 

168 Parameters 

169 ---------- 

170 doc : `str`, optional

171 Documentation for the configuration field.

default : `str`, optional

    Default transfer mode for the created field.

172 

173 Returns 

174 ------- 

175 field : `lsst.pex.config.ChoiceField` 

176 Configuration field. 
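
Examples
--------

A minimal sketch of using this field in a `lsst.pex.config.Config`
subclass; ``MyIngestConfig`` is a hypothetical name used only for
illustration, and the ``lsst.obs.base`` import path is assumed:

.. code-block:: python

    from lsst.pex.config import Config
    from lsst.obs.base import makeTransferChoiceField

    class MyIngestConfig(Config):
        transfer = makeTransferChoiceField(default="symlink")

    config = MyIngestConfig()
    assert config.transfer == "symlink"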

177 """ 

178 return ChoiceField( 

179 doc=doc, 

180 dtype=str, 

181 allowed={"move": "move", 

182 "copy": "copy", 

183 "auto": "choice will depend on datastore", 

184 "direct": "use URI to ingested file directly in datastore", 

185 "link": "hard link falling back to symbolic link", 

186 "hardlink": "hard link", 

187 "symlink": "symbolic (soft) link", 

188 "relsymlink": "relative symbolic link", 

189 }, 

190 optional=True, 

191 default=default 

192 ) 

193 

194 

195class RawIngestConfig(Config): 

196 """Configuration class for RawIngestTask.""" 

197 

198 transfer = makeTransferChoiceField() 

199 failFast = Field( 

200 dtype=bool, 

201 default=False, 

202 doc="If True, stop ingest as soon as any problem is encountered with any file. " 

203 "Otherwise problems files will be skipped and logged and a report issued at completion.", 

204 ) 

205 

206 

207class RawIngestTask(Task): 

208 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

209 

210 Parameters 

211 ---------- 

212 config : `RawIngestConfig` 

213 Configuration for the task. 

214 butler : `~lsst.daf.butler.Butler` 

215 Writeable butler instance, with ``butler.run`` set to the appropriate 

216 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

217 datasets. 

218 on_success : `Callable`, optional 

219 A callback invoked when all of the raws associated with an exposure 

220 are ingested. Will be passed a list of `FileDataset` objects, each 

221 containing one or more resolved `DatasetRef` objects. If this callback 

222 raises an exception, it will interrupt the entire ingest process, even if 

223 `RawIngestConfig.failFast` is `False`. 

224 on_metadata_failure : `Callable`, optional 

225 A callback invoked when a failure occurs trying to translate the 

226 metadata for a file. Will be passed the URI and the exception, in 

227 that order, as positional arguments. Guaranteed to be called in an 

228 ``except`` block, allowing the callback to re-raise or replace (with 

229 ``raise ... from``) to override the task's usual error handling (before 

230 `RawIngestConfig.failFast` logic occurs). 

231 on_ingest_failure : `Callable`, optional 

232 A callback invoked when dimension record or dataset insertion into the 

233 database fails for an exposure. Will be passed a `RawExposureData` 

234 instance and the exception, in that order, as positional arguments. 

235 Guaranteed to be called in an ``except`` block, allowing the callback 

236 to re-raise or replace (with ``raise ... from``) to override the task's 

237 usual error handling (before `RawIngestConfig.failFast` logic occurs). 

238 **kwargs 

239 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

240 constructor. 

241 

242 Notes 

243 ----- 

244 Each instance of `RawIngestTask` writes to the same Butler. Each 

245 invocation of `RawIngestTask.run` ingests a list of files. 
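
Examples
--------

A minimal sketch of constructing and running the task; the repository
path, RUN collection name, and input path are hypothetical
placeholders:

.. code-block:: python

    from lsst.daf.butler import Butler
    from lsst.obs.base import RawIngestTask, RawIngestConfig

    butler = Butler("/path/to/repo", writeable=True, run="DECam/raw/all")

    config = RawIngestConfig()
    config.transfer = "direct"

    def report_failure(filename, exc):
        # Called for each file whose metadata could not be translated.
        print(f"Could not translate {filename}: {exc}")

    task = RawIngestTask(config=config, butler=butler,
                         on_metadata_failure=report_failure)
    refs = task.run(["/path/to/raw/files/"])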

246 """ 

247 

248 ConfigClass = RawIngestConfig 

249 

250 _DefaultName = "ingest" 

251 

252 def getDatasetType(self): 

253 """Return the DatasetType of the datasets ingested by this Task.""" 

254 return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure", 

255 universe=self.butler.registry.dimensions) 

256 

257 def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, 

258 on_success: Callable[[List[FileDataset]], Any] = _do_nothing, 

259 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing, 

260 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing, 

261 **kwargs: Any): 

262 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

263 super().__init__(config, **kwargs) 

264 self.butler = butler 

265 self.universe = self.butler.registry.dimensions 

266 self.datasetType = self.getDatasetType() 

267 self._on_success = on_success 

268 self._on_metadata_failure = on_metadata_failure 

269 self._on_ingest_failure = on_ingest_failure 

270 self.progress = Progress("obs.base.RawIngestTask") 

271 

272 # Import all the instrument classes so that we ensure that we 

273 # have all the relevant metadata translators loaded. 

274 Instrument.importAll(self.butler.registry) 

275 

276 def _reduce_kwargs(self): 

277 # Add extra parameters to pickle. 

278 return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success, 

279 on_metadata_failure=self._on_metadata_failure, on_ingest_failure=self._on_ingest_failure) 

280 

281 def _determine_instrument_formatter(self, dataId, filename): 

282 """Determine the instrument and formatter class. 

283 

284 Parameters 

285 ---------- 

286 dataId : `lsst.daf.butler.DataCoordinate` 

287 The dataId associated with this dataset. 

288 filename : `ButlerURI` 

289 URI of file used for error reporting. 

290 

291 Returns 

292 ------- 

293 instrument : `Instrument` or `None` 

294 Instance of the `Instrument` associated with this dataset. `None` 

295 indicates that the instrument could not be determined. 

296 formatterClass : `type` 

297 Class to be used as the formatter for this dataset. 

298 """ 

299 # The data model currently assumes that whilst multiple datasets 

300 # can be associated with a single file, they must all share the 

301 # same formatter. 

302 try: 

303 instrument = Instrument.fromName(dataId["instrument"], self.butler.registry) 

304 except LookupError as e: 

305 self._on_metadata_failure(filename, e) 

306 self.log.warning("Instrument %s for file %s not known to registry", 

307 dataId["instrument"], filename) 

308 if self.config.failFast: 

309 raise RuntimeError(f"Instrument {dataId['instrument']} for" 

310 f" file {filename} not known to registry") from e 

311 FormatterClass = Formatter 

312 # Indicate that we could not work out the instrument. 

313 instrument = None 

314 else: 

315 FormatterClass = instrument.getRawFormatter(dataId) 

316 return instrument, FormatterClass 

317 

318 def extractMetadata(self, filename: ButlerURI) -> RawFileData: 

319 """Extract and process metadata from a single raw file. 

320 

321 Parameters 

322 ---------- 

323 filename : `ButlerURI` 

324 URI to the file. 

325 

326 Returns 

327 ------- 

328 data : `RawFileData` 

329 A structure containing the metadata extracted from the file, 

330 as well as the original filename. All fields will be populated, 

331 but the data IDs in ``RawFileData.datasets`` will be minimal 

332 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. The 

333 ``instrument`` field will be `None` if there is a problem 

334 with metadata extraction. 

335 

336 Notes 

337 ----- 

338 Assumes that there is a single dataset associated with the given 

339 file. Instruments using a single file to store multiple datasets 

340 must implement their own version of this method. 

341 

342 By default the method will catch all exceptions unless the ``failFast`` 

343 configuration item is `True`. If an error is encountered the 

344 `_on_metadata_failure()` callback will be called. If no exception is 

345 raised as a result and an error was encountered, the returned object 

346 will have a `None` instrument and no datasets. 

347 

348 This method supports sidecar JSON files which can be used to 

349 extract metadata without having to read the data file itself. 

350 The sidecar file is always used if found. 
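
For example, given a raw file at the hypothetical location below, the
sidecar with the same name but a ``.json`` extension is read in
preference to the FITS headers:

.. code-block:: python

    from lsst.daf.butler import ButlerURI

    uri = ButlerURI("/data/raws/AT_O_20200101_000001.fits")
    # extractMetadata will read this sidecar, if it exists, instead of
    # the FITS headers:
    sidecar = uri.updatedExtension(".json")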

351 """ 

352 sidecar_fail_msg = "" # Requires prepended space when set. 

353 try: 

354 sidecar_file = filename.updatedExtension(".json") 

355 if sidecar_file.exists(): 

356 content = json.loads(sidecar_file.read()) 

357 headers = [process_sidecar_data(content)] 

358 sidecar_fail_msg = " (via sidecar)" 

359 else: 

360 # Read the metadata from the data file itself. 

361 

362 # For remote files download the entire file to get the 

363 # header. This is very inefficient and it would be better 

364 # to have some way of knowing where in the file the headers 

365 # are and to only download those parts of the file. 

366 with filename.as_local() as local_file: 

367 # Read the primary. This might be sufficient. 

368 header = readMetadata(local_file.ospath, 0) 

369 

370 try: 

371 # Try to work out a translator class early. 

372 translator_class = MetadataTranslator.determine_translator(header, filename=filename) 

373 except ValueError: 

374 # Primary header was not sufficient (maybe this file 

375 # has been compressed or is a MEF with minimal 

376 # primary). Read second header and merge with primary. 

377 header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite") 

378 

379 # Try again to work out a translator class, letting this 

380 # fail. 

381 translator_class = MetadataTranslator.determine_translator(header, filename=filename) 

382 

383 # Request the headers to use for ingest 

384 headers = translator_class.determine_translatable_headers(filename.ospath, header) 

385 

386 # Add each header to the dataset list 

387 datasets = [self._calculate_dataset_info(h, filename) for h in headers] 

388 

389 except Exception as e: 

390 self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e) 

391 # Indicate to the caller that we failed to read. 

392 datasets = [] 

393 formatterClass = Formatter 

394 instrument = None 

395 self._on_metadata_failure(filename, e) 

396 if self.config.failFast: 

397 raise RuntimeError("Problem extracting metadata for file " 

398 f"{filename}{sidecar_fail_msg}") from e 

399 else: 

400 self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg) 

401 # The data model currently assumes that whilst multiple datasets 

402 # can be associated with a single file, they must all share the 

403 # same formatter. 

404 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename) 

405 if instrument is None: 

406 datasets = [] 

407 

408 return RawFileData(datasets=datasets, filename=filename, 

409 FormatterClass=formatterClass, 

410 instrument=instrument) 

411 

412 def _calculate_dataset_info(self, header, filename): 

413 """Calculate a RawFileDatasetInfo from the supplied information. 

414 

415 Parameters 

416 ---------- 

417 header : Mapping or `astro_metadata_translator.ObservationInfo` 

418 Header from the dataset or previously-translated content. 

419 filename : `ButlerURI` 

420 Filename to use for error messages. 

421 

422 Returns 

423 ------- 

424 dataset : `RawFileDatasetInfo` 

425 The dataId, and observation information associated with this 

426 dataset. 

427 """ 

428 # To ensure we aren't slowed down for no reason, explicitly 

429 # list here the properties we need for the schema. 

430 # Use a dict whose values are booleans, where True indicates 

431 # that the property is required for ingest. 

432 ingest_subset = { 

433 "altaz_begin": False, 

434 "boresight_rotation_coord": False, 

435 "boresight_rotation_angle": False, 

436 "dark_time": False, 

437 "datetime_begin": True, 

438 "datetime_end": True, 

439 "detector_num": True, 

440 "exposure_group": False, 

441 "exposure_id": True, 

442 "exposure_time": True, 

443 "instrument": True, 

444 "tracking_radec": False, 

445 "object": False, 

446 "observation_counter": False, 

447 "observation_id": True, 

448 "observation_reason": False, 

449 "observation_type": True, 

450 "observing_day": False, 

451 "physical_filter": True, 

452 "science_program": False, 

453 "visit_id": False, 

454 } 

455 

456 if isinstance(header, ObservationInfo): 

457 obsInfo = header 

458 missing = [] 

459 # Need to check the required properties are present. 

460 for property, required in ingest_subset.items(): 

461 if not required: 

462 continue 

463 # getattr does not need to be protected because it is using 

464 # the defined list above containing properties that must exist. 

465 value = getattr(obsInfo, property) 

466 if value is None: 

467 missing.append(property) 

468 if missing: 

469 raise ValueError(f"Requested required properties are missing from file {filename}:" 

470 f" {missing} (via JSON)") 

471 

472 else: 

473 obsInfo = ObservationInfo(header, pedantic=False, filename=str(filename), 

474 required={k for k in ingest_subset if ingest_subset[k]}, 

475 subset=set(ingest_subset)) 

476 

477 dataId = DataCoordinate.standardize(instrument=obsInfo.instrument, 

478 exposure=obsInfo.exposure_id, 

479 detector=obsInfo.detector_num, 

480 universe=self.universe) 

481 return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId) 

482 

483 def locateAndReadIndexFiles(self, files): 

484 """Given a list of files, look for index files and read them. 

485 

486 Index files can either be explicitly in the list of files to 

487 ingest, or else located in the same directory as a file to ingest. 

488 Index entries are always used if present. 

489 

490 Parameters 

491 ---------- 

492 files : iterable over `ButlerURI` 

493 URIs to the files to be ingested. 

494 

495 Returns 

496 ------- 

497 index : `dict` [`str`, Any] 

498 Merged contents of all relevant index files found. These can 

499 be explicitly specified index files or ones found in the 

500 directory alongside a data file to be ingested. 

501 updated_files : iterable of `str` 

502 Updated list of the input files with entries removed that were 

503 found listed in an index file. Order is not guaranteed to 

504 match the order of the files given to this routine. 

good_index_files : `set` [`str`]

    Index files that were successfully read and used.

505 bad_index_files : `set` [`str`] 

506 Files that looked like index files but failed to read properly. 

507 """ 

508 # Convert the paths to absolute for easy comparison with index content. 

509 # Do not convert to real paths since we have to assume that index 

510 # files are in this location and not the location to which they link. 

511 files = tuple(f.abspath() for f in files) 

512 

513 # Index files must be named this. 

514 index_root_file = "_index.json" 

515 

516 # Group the files by directory. 

517 files_by_directory = defaultdict(set) 

518 

519 for path in files: 

520 directory, file_in_dir = path.split() 

521 files_by_directory[directory].add(file_in_dir) 

522 

523 # All the metadata read from index files with keys of full path. 

524 index_entries = {} 

525 

526 # Index files we failed to read. 

527 bad_index_files = set() 

528 

529 # Any good index files that were found and used. 

530 good_index_files = set() 

531 

532 # Look for index files in those directories. 

533 for directory, files_in_directory in files_by_directory.items(): 

534 possible_index_file = directory.join(index_root_file) 

535 if possible_index_file.exists(): 

536 # If we are explicitly requesting an index file the 

537 # messages should be different. 

538 index_msg = "inferred" 

539 is_implied = True 

540 if index_root_file in files_in_directory: 

541 index_msg = "explicit" 

542 is_implied = False 

543 

544 # Try to read the index file and catch and report any 

545 # problems. 

546 try: 

547 content = json.loads(possible_index_file.read()) 

548 index = process_index_data(content, force_dict=True) 

549 except Exception as e: 

550 # Only trigger the callback if the index file 

551 # was asked for explicitly. Triggering on implied file 

552 # might be surprising. 

553 if not is_implied: 

554 self._on_metadata_failure(possible_index_file, e) 

555 if self.config.failFast: 

556 raise RuntimeError(f"Problem reading index file from {index_msg} " 

557 f"location {possible_index_file}") from e 

558 bad_index_files.add(possible_index_file) 

559 continue 

560 

561 self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file) 

562 good_index_files.add(possible_index_file) 

563 

564 # Go through the index adding entries for files. 

565 # If we have non-index files in this directory marked for 

566 # ingest we should only get index information for those. 

567 # If the index file was explicit we use all entries. 

568 if is_implied: 

569 files_to_ingest = files_in_directory 

570 else: 

571 files_to_ingest = set(index) 

572 

573 # Copy relevant metadata into a single dict for all index 

574 # entries. 

575 for file_in_dir in files_to_ingest: 

576 # Skip an explicitly specified index file. 

577 # This should never happen because an explicit index 

578 # file will force ingest of all files in the index 

579 # and not use the explicit file list. If somehow 

580 # this is not true we continue. Raising an exception 

581 # seems like the wrong thing to do since this is harmless. 

582 if file_in_dir == index_root_file: 

583 self.log.info("Logic error found scanning directory %s. Please file ticket.", 

584 directory) 

585 continue 

586 if file_in_dir in index: 

587 file = directory.join(file_in_dir) 

588 if file in index_entries: 

589 # ObservationInfo overrides raw metadata 

590 if isinstance(index[file_in_dir], ObservationInfo) \ 

591 and not isinstance(index_entries[file], ObservationInfo): 

592 self.log.warning("File %s already specified in an index file but overriding" 

593 " with ObservationInfo content from %s", 

594 file, possible_index_file) 

595 else: 

596 self.log.warning("File %s already specified in an index file, " 

597 "ignoring content from %s", file, possible_index_file) 

598 # Do nothing in this case 

599 continue 

600 

601 index_entries[file] = index[file_in_dir] 

602 

603 # Remove files from list that have index entries and also 

604 # any files that we determined to be explicit index files 

605 # or any index files that we failed to read. 

606 filtered = set(files) - set(index_entries) - good_index_files - bad_index_files 

607 

608 # The filtered list loses the initial order. Retaining the order 

609 # is good for testing but does have a cost if there are many 

610 # files when copying the good values out. A dict would have faster 

611 # lookups (using the files as keys) but use more memory. 

612 ordered = [f for f in filtered if f in files] 

613 

614 return index_entries, ordered, good_index_files, bad_index_files 

615 

616 def processIndexEntries(self, index_entries): 

617 """Convert index entries to RawFileData. 

618 

619 Parameters 

620 ---------- 

621 index_entries : `dict` [`str`, Any] 

622 Dict indexed by name of file to ingest and with values either 

623 raw metadata or translated 

624 `~astro_metadata_translator.ObservationInfo`. 

625 

626 Returns 

627 ------- 

628 data : `list` [`RawFileData`] 

629 Structures containing the metadata extracted from each file, 

630 as well as the original filenames. All fields will be populated, 

631 but the data IDs in ``RawFileData.datasets`` will be minimal 

632 (unexpanded) `~lsst.daf.butler.DataCoordinate` instances. 

633 """ 

634 fileData = [] 

635 for filename, metadata in index_entries.items(): 

636 try: 

637 datasets = [self._calculate_dataset_info(metadata, filename)] 

638 except Exception as e: 

639 self.log.debug("Problem extracting metadata for file %s found in index file: %s", 

640 filename, e) 

641 datasets = [] 

642 formatterClass = Formatter 

643 instrument = None 

644 self._on_metadata_failure(filename, e) 

645 if self.config.failFast: 

646 raise RuntimeError(f"Problem extracting metadata for file {filename} " 

647 "found in index file") from e 

648 else: 

649 instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, 

650 filename) 

651 if instrument is None: 

652 datasets = [] 

653 fileData.append(RawFileData(datasets=datasets, filename=filename, 

654 FormatterClass=formatterClass, instrument=instrument)) 

655 return fileData 

656 

657 def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]: 

658 """Group an iterable of `RawFileData` by exposure. 

659 

660 Parameters 

661 ---------- 

662 files : iterable of `RawFileData` 

663 File-level information to group. 

664 

665 Returns 

666 ------- 

667 exposures : `list` of `RawExposureData` 

668 A list of structures that group the file-level information by 

669 exposure. All fields will be populated. The 

670 `RawExposureData.dataId` attributes will be minimal (unexpanded) 

671 `~lsst.daf.butler.DataCoordinate` instances. 

672 """ 

673 exposureDimensions = self.universe["exposure"].graph 

674 byExposure = defaultdict(list) 

675 for f in files: 

676 # Assume that the first dataset is representative for the file. 

677 byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f) 

678 

679 return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe) 

680 for dataId, exposureFiles in byExposure.items()] 

681 

682 def expandDataIds(self, data: RawExposureData) -> RawExposureData: 

683 """Expand the data IDs associated with a raw exposure. 

684 

685 This adds the metadata records. 

686 

687 Parameters 

688 ---------- 

689 data : `RawExposureData` 

690 A structure containing information about the exposure to be 

691 ingested. Must have `RawExposureData.record` populated. Should 

692 be considered consumed upon return. 

693 

694 Returns 

695 ------- 

696 exposure : `RawExposureData` 

697 An updated version of the input structure, with 

698 `RawExposureData.dataId` and nested `RawFileData.dataId` attributes 

699 updated to data IDs for which 

700 `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`. 

701 """ 

702 # We start by expanding the exposure-level data ID; we won't use that 

703 # directly in file ingest, but this lets us do some database lookups 

704 # once per exposure instead of once per file later. 

705 data.dataId = self.butler.registry.expandDataId( 

706 data.dataId, 

707 # We pass in the records we'll be inserting shortly so they aren't 

708 # looked up from the database. We do expect instrument and filter 

709 # records to be retrieved from the database here (though the 

710 # Registry may cache them so there isn't a lookup every time). 

711 records={ 

712 self.butler.registry.dimensions["exposure"]: data.record, 

713 } 

714 ) 

715 # Now we expand the per-file (exposure+detector) data IDs. This time 

716 # we pass in the records we just retrieved from the exposure data ID 

717 # expansion. 

718 for file in data.files: 

719 for dataset in file.datasets: 

720 dataset.dataId = self.butler.registry.expandDataId( 

721 dataset.dataId, 

722 records=dict(data.dataId.records) 

723 ) 

724 return data 

725 

726 def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1 

727 ) -> Tuple[Iterator[RawExposureData], List[str]]: 

728 """Perform all non-database-updating ingest preprocessing steps. 

729 

730 Parameters 

731 ---------- 

732 files : iterable over `ButlerURI` 

733 URIs to the files to be ingested. Will be made absolute 

734 if they are not already. 

735 pool : `multiprocessing.Pool`, optional 

736 If not `None`, a process pool with which to parallelize some 

737 operations. 

738 processes : `int`, optional 

739 The number of processes to use. Ignored if ``pool`` is not `None`. 

740 

741 Returns 

742 ------- 

743 exposures : `Iterator` [ `RawExposureData` ] 

744 Data structures containing dimension records, filenames, and data 

745 IDs to be ingested (one structure for each exposure). 

746 bad_files : `list` of `str` 

747 List of all the files from which metadata could not be extracted. 

748 """ 

749 if pool is None and processes > 1: 

750 pool = Pool(processes) 

751 mapFunc = map if pool is None else pool.imap_unordered 

752 

753 def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]: 

754 """Filter out bad files and return good with list of bad.""" 

755 good_files = [] 

756 bad_files = [] 

757 for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata", total=len(files)): 

758 if not fileDatum.datasets: 

759 bad_files.append(fileDatum.filename) 

760 else: 

761 good_files.append(fileDatum) 

762 return good_files, bad_files 

763 

764 # Look for index files and read them. 

765 # There should be far fewer index files than data files. 

766 index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files) 

767 if bad_index_files: 

768 self.log.info("Failed to read the following explicitly requested index files:"), 

769 for bad in sorted(bad_index_files): 

770 self.log.info("- %s", bad) 

771 

772 # Now convert all the index file entries to standard form for ingest. 

773 bad_index_file_data = [] 

774 indexFileData = self.processIndexEntries(index_entries) 

775 if indexFileData: 

776 indexFileData, bad_index_file_data = _partition_good_bad(indexFileData) 

777 self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s" 

778 " with %d failure%s", 

779 *_log_msg_counter(indexFileData), 

780 *_log_msg_counter(good_index_files), 

781 *_log_msg_counter(bad_index_file_data)) 

782 

783 # Extract metadata and build per-detector regions. 

784 # This could run in a subprocess so collect all output 

785 # before looking at failures. 

786 fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files) 

787 

788 # Filter out all the failed reads and store them for later 

789 # reporting. 

790 fileData, bad_files = _partition_good_bad(fileData) 

791 self.log.info("Successfully extracted metadata from %d file%s with %d failure%s", 

792 *_log_msg_counter(fileData), 

793 *_log_msg_counter(bad_files)) 

794 

795 # Combine with data from index files. 

796 fileData.extend(indexFileData) 

797 bad_files.extend(bad_index_file_data) 

798 bad_files.extend(bad_index_files) 

799 

800 # Use that metadata to group files (and extracted metadata) by 

801 # exposure. Never parallelized because it's intrinsically a gather 

802 # step. 

803 exposureData: List[RawExposureData] = self.groupByExposure(fileData) 

804 

805 # The next operation operates on RawExposureData instances (one at 

806 # a time) in-place and then returns the modified instance. We call it 

807 # as a pass-through instead of relying on the arguments we pass in to 

808 # have been modified because in the parallel case those arguments are 

809 # going to be pickled and unpickled, and I'm not certain 

810 # multiprocessing is careful enough with that for output arguments to 

811 # work. 

812 

813 # Expand the data IDs to include all dimension metadata; we need this 

814 # because we may need to generate path templates that rely on that 

815 # metadata. 

816 # This is the first step that involves actual database calls (but just 

817 # SELECTs), so if there's going to be a problem with connections vs. 

818 # multiple processes, or lock contention (in SQLite) slowing things 

819 # down, it'll happen here. 

820 return mapFunc(self.expandDataIds, exposureData), bad_files 

821 

822 def ingestExposureDatasets( 

823 self, 

824 exposure: RawExposureData, 

825 *, 

826 run: Optional[str] = None, 

827 skip_existing_exposures: bool = False, 

828 ) -> List[FileDataset]: 

829 """Ingest all raw files in one exposure. 

830 

831 Parameters 

832 ---------- 

833 exposure : `RawExposureData` 

834 A structure containing information about the exposure to be 

835 ingested. Must have `RawExposureData.record` populated and all 

836 data ID attributes expanded. 

837 run : `str`, optional 

838 Name of a RUN-type collection to write to, overriding 

839 ``self.butler.run``. 

840 skip_existing_exposures : `bool`, optional 

841 If `True` (`False` is default), skip raws that have already been 

842 ingested (i.e. raws for which we already have a dataset with the 

843 same data ID in the target collection, even if from another file). 

844 Note that this is much slower than just not passing 

845 already-ingested files as inputs, because we still need to read and 

846 process metadata to identify which exposures to search for. It 

847 also will not work reliably if multiple processes are attempting to 

848 ingest raws from the same exposure concurrently, in that different 

849 processes may still attempt to ingest the same raw and conflict, 

850 causing a failure that prevents other raws from the same exposure 

851 from being ingested. 

852 

853 Returns 

854 ------- 

855 datasets : `list` of `lsst.daf.butler.FileDataset` 

856 Per-file structures identifying the files ingested and their 

857 dataset representation in the data repository. 

858 """ 

859 if skip_existing_exposures: 

860 existing = { 

861 ref.dataId for ref in self.butler.registry.queryDatasets( 

862 self.datasetType, 

863 collections=[run], 

864 dataId=exposure.dataId, 

865 ) 

866 } 

867 else: 

868 existing = set() 

869 datasets = [] 

870 for file in exposure.files: 

871 refs = [ 

872 DatasetRef(self.datasetType, d.dataId) 

873 for d in file.datasets 

874 if d.dataId not in existing 

875 ] 

876 if refs: 

877 datasets.append( 

878 FileDataset(path=file.filename.abspath(), refs=refs, formatter=file.FormatterClass) 

879 ) 

880 

881 # Raw files are preferentially ingested using a UUID derived from 

882 # the collection name and dataId. 

883 if self.butler.registry.supportsIdGenerationMode(DatasetIdGenEnum.DATAID_TYPE_RUN): 

884 mode = DatasetIdGenEnum.DATAID_TYPE_RUN 

885 else: 

886 mode = DatasetIdGenEnum.UNIQUE 

887 self.butler.ingest(*datasets, transfer=self.config.transfer, run=run, idGenerationMode=mode) 

888 return datasets 

889 

890 def ingestFiles(self, files, *, pool: Optional[Pool] = None, processes: int = 1, 

891 run: Optional[str] = None, 

892 skip_existing_exposures: bool = False, 

893 update_exposure_records: bool = False): 

894 """Ingest files into a Butler data repository. 

895 

896 This creates any new exposure or visit Dimension entries needed to 

897 identify the ingested files, creates new Dataset entries in the 

898 Registry and finally ingests the files themselves into the Datastore. 

899 Any needed instrument, detector, and physical_filter Dimension entries 

900 must exist in the Registry before `run` is called. 

901 

902 Parameters 

903 ---------- 

904 files : iterable over `ButlerURI` 

905 URIs to the files to be ingested. 

906 pool : `multiprocessing.Pool`, optional 

907 If not `None`, a process pool with which to parallelize some 

908 operations. 

909 processes : `int`, optional 

910 The number of processes to use. Ignored if ``pool`` is not `None`. 

911 run : `str`, optional 

912 Name of a RUN-type collection to write to, overriding 

913 the default derived from the instrument name. 

914 skip_existing_exposures : `bool`, optional 

915 If `True` (`False` is default), skip raws that have already been 

916 ingested (i.e. raws for which we already have a dataset with the 

917 same data ID in the target collection, even if from another file). 

918 Note that this is much slower than just not passing 

919 already-ingested files as inputs, because we still need to read and 

920 process metadata to identify which exposures to search for. It 

921 also will not work reliably if multiple processes are attempting to 

922 ingest raws from the same exposure concurrently, in that different 

923 processes may still attempt to ingest the same raw and conflict, 

924 causing a failure that prevents other raws from the same exposure 

925 from being ingested. 

926 update_exposure_records : `bool`, optional 

927 If `True` (`False` is default), update existing exposure records 

928 that conflict with the new ones instead of rejecting them. THIS IS 

929 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

930 KNOWN TO BE BAD. This should usually be combined with 

931 ``skip_existing_exposures=True``. 

932 

933 Returns 

934 ------- 

935 refs : `list` of `lsst.daf.butler.DatasetRef` 

936 Dataset references for ingested raws. 

bad_files : `list` of `ButlerURI`

    Files from which metadata could not be extracted.

n_exposures : `int`

    Number of exposures successfully ingested.

n_exposures_failed : `int`

    Number of exposures that could not be registered (dimension record insertion failed).

n_ingests_failed : `int`

    Number of exposures for which raw dataset ingest failed.
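
Examples
--------

A sketch of unpacking the return value; ``task`` is a previously
constructed `RawIngestTask`, and the file path and RUN collection name
are hypothetical placeholders:

.. code-block:: python

    from lsst.daf.butler import ButlerURI

    refs, bad_files, n_ok, n_exp_failed, n_ingest_failed = task.ingestFiles(
        [ButlerURI("/data/raws/AT_O_20200101_000001.fits")],
        run="DECam/raw/all",
    )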

937 """ 

938 

939 exposureData, bad_files = self.prep(files, pool=pool, processes=processes) 

940 

941 # Up to this point, we haven't modified the data repository at all. 

942 # Now we finally do that, with one transaction per exposure. This is 

943 # not parallelized at present because the performance of this step is 

944 # limited by the database server. That may or may not change in the 

945 # future once we increase our usage of bulk inserts and reduce our 

946 # usage of savepoints; we've tried to get everything but the database 

947 # operations done in advance to reduce the time spent inside 

948 # transactions. 

949 self.butler.registry.registerDatasetType(self.datasetType) 

950 

951 refs = [] 

952 runs = set() 

953 n_exposures = 0 

954 n_exposures_failed = 0 

955 n_ingests_failed = 0 

956 for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"): 

957 

958 self.log.debug("Attempting to ingest %d file%s from exposure %s:%s", 

959 *_log_msg_counter(exposure.files), 

960 exposure.record.instrument, exposure.record.obs_id) 

961 

962 try: 

963 inserted_or_updated = self.butler.registry.syncDimensionData( 

964 "exposure", 

965 exposure.record, 

966 update=update_exposure_records, 

967 ) 

968 except Exception as e: 

969 self._on_ingest_failure(exposure, e) 

970 n_exposures_failed += 1 

971 self.log.warning("Exposure %s:%s could not be registered: %s", 

972 exposure.record.instrument, exposure.record.obs_id, e) 

973 if self.config.failFast: 

974 raise e 

975 continue 

976 

977 if isinstance(inserted_or_updated, dict): 

978 # Exposure is in the registry and we updated it, so 

979 # syncDimensionData returned a dict. 

980 self.log.info( 

981 "Exposure %s:%s was already present, but columns %s were updated.", 

982 exposure.record.instrument, 

983 exposure.record.obs_id, 

984 str(list(inserted_or_updated.keys())) 

985 ) 

986 

987 # Override default run if nothing specified explicitly. 

988 if run is None: 

989 instrument = exposure.files[0].instrument 

990 this_run = instrument.makeDefaultRawIngestRunName() 

991 else: 

992 this_run = run 

993 if this_run not in runs: 

994 self.butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

995 runs.add(this_run) 

996 try: 

997 datasets_for_exposure = self.ingestExposureDatasets( 

998 exposure, 

999 run=this_run, 

1000 skip_existing_exposures=skip_existing_exposures, 

1001 ) 

1002 except Exception as e: 

1003 self._on_ingest_failure(exposure, e) 

1004 n_ingests_failed += 1 

1005 self.log.warning("Failed to ingest the following for reason: %s", e) 

1006 for f in exposure.files: 

1007 self.log.warning("- %s", f.filename) 

1008 if self.config.failFast: 

1009 raise e 

1010 continue 

1011 else: 

1012 self._on_success(datasets_for_exposure) 

1013 for dataset in datasets_for_exposure: 

1014 refs.extend(dataset.refs) 

1015 

1016 # Success for this exposure. 

1017 n_exposures += 1 

1018 self.log.info("Exposure %s:%s ingested successfully", 

1019 exposure.record.instrument, exposure.record.obs_id) 

1020 

1021 return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed 

1022 

1023 @timeMethod 

1024 def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None, 

1025 file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", group_files: bool = True, 

1026 skip_existing_exposures: bool = False, update_exposure_records: bool = False): 

1027 """Ingest files into a Butler data repository. 

1028 

1029 This creates any new exposure or visit Dimension entries needed to 

1030 identify the ingested files, creates new Dataset entries in the 

1031 Registry and finally ingests the files themselves into the Datastore. 

1032 Any needed instrument, detector, and physical_filter Dimension entries 

1033 must exist in the Registry before `run` is called. 

1034 

1035 Parameters 

1036 ---------- 

1037 files : iterable over `ButlerURI`, `str` or path-like objects 

1038 Paths to the files to be ingested. Can refer to directories. 

1039 Will be made absolute if they are not already. 

1040 pool : `multiprocessing.Pool`, optional 

1041 If not `None`, a process pool with which to parallelize some 

1042 operations. 

1043 processes : `int`, optional 

1044 The number of processes to use. Ignored if ``pool`` is not `None`. 

1045 run : `str`, optional 

1046 Name of a RUN-type collection to write to, overriding 

1047 the default derived from the instrument name. 

1048 file_filter : `str` or `re.Pattern`, optional 

1049 Pattern to use to discover files to ingest within directories. 

1050 The default is to search for FITS files. The regex applies to 

1051 files within the directory. 

1052 group_files : `bool`, optional 

1053 Group files by directory if they have been discovered in 

1054 directories. Will not affect files explicitly provided. 

1055 skip_existing_exposures : `bool`, optional 

1056 If `True` (`False` is default), skip raws that have already been 

1057 ingested (i.e. raws for which we already have a dataset with the 

1058 same data ID in the target collection, even if from another file). 

1059 Note that this is much slower than just not passing 

1060 already-ingested files as inputs, because we still need to read and 

1061 process metadata to identify which exposures to search for. It 

1062 also will not work reliably if multiple processes are attempting to 

1063 ingest raws from the same exposure concurrently, in that different 

1064 processes may still attempt to ingest the same raw and conflict, 

1065 causing a failure that prevents other raws from the same exposure 

1066 from being ingested. 

1067 update_exposure_records : `bool`, optional 

1068 If `True` (`False` is default), update existing exposure records 

1069 that conflict with the new ones instead of rejecting them. THIS IS 

1070 AN ADVANCED OPTION THAT SHOULD ONLY BE USED TO FIX METADATA THAT IS 

1071 KNOWN TO BE BAD. This should usually be combined with 

1072 ``skip_existing_exposures=True``. 

1073 

1074 Returns 

1075 ------- 

1076 refs : `list` of `lsst.daf.butler.DatasetRef` 

1077 Dataset references for ingested raws. 

1078 

1079 Notes 

1080 ----- 

1081 This method inserts all datasets for an exposure within a transaction, 

1082 guaranteeing that partial exposures are never ingested. The exposure 

1083 dimension record is inserted with `Registry.syncDimensionData` first 

1084 (in its own transaction), which inserts only if a record with the same 

1085 primary key does not already exist. This allows different files within 

1086 the same exposure to be ingested in different runs. 
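
Examples
--------

A sketch of a typical invocation; ``task`` is a previously constructed
`RawIngestTask`, and the directory path and RUN collection name are
hypothetical placeholders:

.. code-block:: python

    refs = task.run(
        ["/data/raws/2020-01-01/"],
        run="DECam/raw/all",
        processes=4,
        file_filter=r"\.fits$",
    )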

1087 """ 

1088 

1089 refs = [] 

1090 bad_files = [] 

1091 n_exposures = 0 

1092 n_exposures_failed = 0 

1093 n_ingests_failed = 0 

1094 if group_files: 

1095 for group in ButlerURI.findFileResources(files, file_filter, group_files): 

1096 new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles( 

1097 group, 

1098 pool=pool, 

1099 processes=processes, 

1100 run=run, 

1101 skip_existing_exposures=skip_existing_exposures, 

1102 update_exposure_records=update_exposure_records, 

1103 ) 

1104 refs.extend(new_refs) 

1105 bad_files.extend(bad) 

1106 n_exposures += n_exp 

1107 n_exposures_failed += n_exp_fail 

1108 n_ingests_failed += n_ingest_fail 

1109 else: 

1110 refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles( 

1111 ButlerURI.findFileResources(files, file_filter, group_files), 

1112 pool=pool, 

1113 processes=processes, 

1114 run=run, 

1115 skip_existing_exposures=skip_existing_exposures, 

1116 update_exposure_records=update_exposure_records, 

1117 ) 

1118 

1119 had_failure = False 

1120 

1121 if bad_files: 

1122 had_failure = True 

1123 self.log.warning("Could not extract observation metadata from the following:") 

1124 for f in bad_files: 

1125 self.log.warning("- %s", f) 

1126 

1127 self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure" 

1128 " registration and %d failure%s from file ingest.", 

1129 *_log_msg_counter(n_exposures), 

1130 *_log_msg_counter(n_exposures_failed), 

1131 *_log_msg_counter(n_ingests_failed)) 

1132 if n_exposures_failed > 0 or n_ingests_failed > 0: 

1133 had_failure = True 

1134 self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs)) 

1135 

1136 if had_failure: 

1137 raise RuntimeError("Some failures encountered during ingestion") 

1138 

1139 return refs