# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import json
import re
from dataclasses import dataclass, InitVar
from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers, MetadataTranslator
from astro_metadata_translator.indexing import process_sidecar_data, process_index_data
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    ButlerURI,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
    Progress,
)
from lsst.pex.config import Config, ChoiceField, Field
from lsst.pipe.base import Task, timeMethod

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


def _do_nothing(*args, **kwargs) -> None:
    """Do nothing.

    This is a function that accepts anything and does nothing.
    For use as a default in callback arguments.
    """
    pass


def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]:
    """Count the iterable and return the count and plural modifier.

    Parameters
    ----------
    noun : Iterable or `int`
        Thing to count. If given an integer it is assumed to be the count
        to use to calculate modifier.

    Returns
    -------
    num : `int`
        Number of items found in ``noun``.
    modifier : `str`
        Character to add to the end of a string referring to these items
        to indicate whether it was a single item or not. Returns empty
        string if there is one item or "s" otherwise.

    Examples
    --------

    .. code-block:: python

        log.warning("Found %d file%s", *_log_msg_counter(nfiles))
    """
    if isinstance(noun, int):
        num = noun
    else:
        num = len(noun)
    return num, "" if num == 1 else "s"


@dataclass
class RawFileDatasetInfo:
    """Information about a single dataset within a raw file."""

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Information about a single raw file, used during ingest."""

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file.
    (`list` of `RawFileDatasetInfo`)
    """

    filename: ButlerURI
    """URI of the file this information was extracted from (`ButlerURI`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; as
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list."""


@dataclass
class RawExposureData:
    """Information about a complete raw exposure, used during ingest."""

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for transferring data between repos.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
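
    Examples
    --------
    A minimal sketch of how this field is typically used inside a
    `~lsst.pex.config.Config`; the class name here is illustrative:

    .. code-block:: python

        class ExampleIngestConfig(Config):
            transfer = makeTransferChoiceField(default="copy")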

    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "direct": "use URI to ingested file directly in datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    """Configuration class for RawIngestTask."""

    transfer = makeTransferChoiceField()
    failFast = Field(
        dtype=bool,
        default=False,
        doc="If True, stop ingest as soon as any problem is encountered with any file. "
            "Otherwise problem files will be skipped and logged and a report issued at completion.",
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    on_success : `Callable`, optional
        A callback invoked when all of the raws associated with an exposure
        are ingested. Will be passed a list of `FileDataset` objects, each
        containing one or more resolved `DatasetRef` objects. If this callback
        raises it will interrupt the entire ingest process, even if
        `RawIngestConfig.failFast` is `False`.
    on_metadata_failure : `Callable`, optional
        A callback invoked when a failure occurs trying to translate the
        metadata for a file. Will be passed the URI and the exception, in
        that order, as positional arguments. Guaranteed to be called in an
        ``except`` block, allowing the callback to re-raise or replace (with
        ``raise ... from``) to override the task's usual error handling (before
        `RawIngestConfig.failFast` logic occurs).
    on_ingest_failure : `Callable`, optional
        A callback invoked when dimension record or dataset insertion into the
        database fails for an exposure. Will be passed a `RawExposureData`
        instance and the exception, in that order, as positional arguments.
        Guaranteed to be called in an ``except`` block, allowing the callback
        to re-raise or replace (with ``raise ... from``) to override the task's
        usual error handling (before `RawIngestConfig.failFast` logic occurs).
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
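
    Examples
    --------
    A minimal usage sketch; the repository path and raw file locations are
    illustrative rather than part of this API:

    .. code-block:: python

        from lsst.daf.butler import Butler
        from lsst.obs.base import RawIngestConfig, RawIngestTask

        butler = Butler("/path/to/repo", writeable=True)
        config = RawIngestConfig()
        config.transfer = "symlink"
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["/path/to/raw/files"])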

    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler,
                 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
                 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing,
                 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
                 **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        self._on_success = on_success
        self._on_metadata_failure = on_metadata_failure
        self._on_ingest_failure = on_ingest_failure
        self.progress = Progress("obs.base.RawIngestTask")

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle.
        return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success,
                    on_metadata_failure=self._on_metadata_failure, on_ingest_failure=self._on_ingest_failure)

    def _determine_instrument_formatter(self, dataId, filename):
        """Determine the instrument and formatter class.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            The dataId associated with this dataset.
        filename : `ButlerURI`
            URI of file used for error reporting.

        Returns
        -------
        instrument : `Instrument` or `None`
            Instance of the `Instrument` associated with this dataset. `None`
            indicates that the instrument could not be determined.
        formatterClass : `type`
            Class to be used as the formatter for this dataset.
        """
        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        try:
            instrument = Instrument.fromName(dataId["instrument"], self.butler.registry)
        except LookupError as e:
            self._on_metadata_failure(filename, e)
            self.log.warning("Instrument %s for file %s not known to registry",
                             dataId["instrument"], filename)
            if self.config.failFast:
                raise RuntimeError(f"Instrument {dataId['instrument']} for"
                                   f" file {filename} not known to registry") from e
            FormatterClass = Formatter
            # Indicate that we could not work out the instrument.
            instrument = None
        else:
            FormatterClass = instrument.getRawFormatter(dataId)
        return instrument, FormatterClass

    def extractMetadata(self, filename: ButlerURI) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `ButlerURI`
            URI to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
            ``instrumentClass`` field will be `None` if there is a problem
            with metadata extraction.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.

        By default the method will catch all exceptions unless the
        ``failFast`` configuration item is `True`. If an error is encountered
        the `_on_metadata_failure()` method will be called. If an error is
        encountered but no exception is raised, the returned object will have
        a null instrument class and no datasets.

        This method supports sidecar JSON files which can be used to
        extract metadata without having to read the data file itself.
        The sidecar file is always used if found.
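
        Examples
        --------
        A sketch of the sidecar convention; the paths and the ``task`` name
        are illustrative:

        .. code-block:: python

            # For a raw file such as
            #     /data/raws/exp012345_det042.fits
            # a sidecar named
            #     /data/raws/exp012345_det042.json
            # is read in preference to the FITS headers, if it exists.
            data = task.extractMetadata(ButlerURI("/data/raws/exp012345_det042.fits"))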

        """
        sidecar_fail_msg = ""  # Requires prepended space when set.
        try:
            sidecar_file = filename.updatedExtension(".json")
            if sidecar_file.exists():
                content = json.loads(sidecar_file.read())
                headers = [process_sidecar_data(content)]
                sidecar_fail_msg = " (via sidecar)"
            else:
                # Read the metadata from the data file itself.

                # For remote files download the entire file to get the
                # header. This is very inefficient and it would be better
                # to have some way of knowing where in the file the headers
                # are and to only download those parts of the file.
                with filename.as_local() as local_file:
                    # Read the primary. This might be sufficient.
                    header = readMetadata(local_file.ospath, 0)

                    try:
                        # Try to work out a translator class early.
                        translator_class = MetadataTranslator.determine_translator(header, filename=filename)
                    except ValueError:
                        # Primary header was not sufficient (maybe this file
                        # has been compressed or is a MEF with minimal
                        # primary). Read second header and merge with primary.
                        header = merge_headers([header, readMetadata(local_file.ospath, 1)], mode="overwrite")

                        # Try again to work out a translator class, letting this
                        # fail.
                        translator_class = MetadataTranslator.determine_translator(header, filename=filename)

                # Request the headers to use for ingest
                headers = translator_class.determine_translatable_headers(filename.ospath, header)

            # Add each header to the dataset list
            datasets = [self._calculate_dataset_info(h, filename) for h in headers]

        except Exception as e:
            self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            formatterClass = Formatter
            instrument = None
            self._on_metadata_failure(filename, e)
            if self.config.failFast:
                raise RuntimeError("Problem extracting metadata for file "
                                   f"{filename}{sidecar_fail_msg}") from e
        else:
            self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
            if instrument is None:
                datasets = []

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=formatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : Mapping or `astro_metadata_translator.ObservationInfo`
            Header from the dataset or previously-translated content.
        filename : `ButlerURI`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        # To ensure we aren't slowed down for no reason, explicitly
        # list here the properties we need for the schema.
        # Use a dict with values a boolean where True indicates
        # that it is required that we calculate this property.
        ingest_subset = {
            "altaz_begin": False,
            "boresight_rotation_coord": False,
            "boresight_rotation_angle": False,
            "dark_time": False,
            "datetime_begin": True,
            "datetime_end": True,
            "detector_num": True,
            "exposure_group": False,
            "exposure_id": True,
            "exposure_time": True,
            "instrument": True,
            "tracking_radec": False,
            "object": False,
            "observation_counter": False,
            "observation_id": True,
            "observation_reason": False,
            "observation_type": True,
            "observing_day": False,
            "physical_filter": True,
            "science_program": False,
            "visit_id": False,
        }

        if isinstance(header, ObservationInfo):
            obsInfo = header
            missing = []
            # Need to check the required properties are present.
            for property, required in ingest_subset.items():
                if not required:
                    continue
                # getattr does not need to be protected because it is using
                # the defined list above containing properties that must exist.
                value = getattr(obsInfo, property)
                if value is None:
                    missing.append(property)
            if missing:
                raise ValueError(f"Requested required properties are missing from file {filename}:"
                                 f" {missing} (via JSON)")

        else:
            obsInfo = ObservationInfo(header, pedantic=False, filename=str(filename),
                                      required={k for k in ingest_subset if ingest_subset[k]},
                                      subset=set(ingest_subset))

        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def locateAndReadIndexFiles(self, files):
        """Given a list of files, look for index files and read them.

        Index files can either be explicitly in the list of files to
        ingest, or else located in the same directory as a file to ingest.
        Index entries are always used if present.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.

        Returns
        -------
        index : `dict` [`str`, Any]
            Merged contents of all relevant index files found. These can
            be explicitly specified index files or ones found in the
            directory alongside a data file to be ingested.
        updated_files : iterable of `str`
            Updated list of the input files with entries removed that were
            found listed in an index file. Order is not guaranteed to
            match the order of the files given to this routine.
        good_index_files : `set` [`str`]
            Index files that were successfully read and whose entries
            were used.
        bad_index_files : `set` [`str`]
            Files that looked like index files but failed to read properly.
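
        Examples
        --------
        A sketch of the on-disk convention this method looks for; the
        directory layout and the ``task``/``uris`` names are illustrative:

        .. code-block:: python

            # Given data files in /data/night1/, a sibling index file
            #     /data/night1/_index.json
            # is read automatically and its entries are used instead of
            # reading headers from the matching data files.
            index, remaining, good, bad = task.locateAndReadIndexFiles(uris)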

        """
        # Convert the paths to absolute for easy comparison with index content.
        # Do not convert to real paths since we have to assume that index
        # files are in this location and not the location which it links to.
        files = tuple(f.abspath() for f in files)

        # Index files must be named this.
        index_root_file = "_index.json"

        # Group the files by directory.
        files_by_directory = defaultdict(set)

        for path in files:
            directory, file_in_dir = path.split()
            files_by_directory[directory].add(file_in_dir)

        # All the metadata read from index files with keys of full path.
        index_entries = {}

        # Index files we failed to read.
        bad_index_files = set()

        # Any good index files that were found and used.
        good_index_files = set()

        # Look for index files in those directories.
        for directory, files_in_directory in files_by_directory.items():
            possible_index_file = directory.join(index_root_file)
            if possible_index_file.exists():
                # If we are explicitly requesting an index file the
                # messages should be different.
                index_msg = "inferred"
                is_implied = True
                if index_root_file in files_in_directory:
                    index_msg = "explicit"
                    is_implied = False

                # Try to read the index file and catch and report any
                # problems.
                try:
                    content = json.loads(possible_index_file.read())
                    index = process_index_data(content, force_dict=True)
                except Exception as e:
                    # Only trigger the callback if the index file
                    # was asked for explicitly. Triggering on implied file
                    # might be surprising.
                    if not is_implied:
                        self._on_metadata_failure(possible_index_file, e)
                    if self.config.failFast:
                        raise RuntimeError(f"Problem reading index file from {index_msg} "
                                           f"location {possible_index_file}") from e
                    bad_index_files.add(possible_index_file)
                    continue

                self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
                good_index_files.add(possible_index_file)

                # Go through the index adding entries for files.
                # If we have non-index files in this directory marked for
                # ingest we should only get index information for those.
                # If the index file was explicit we use all entries.
                if is_implied:
                    files_to_ingest = files_in_directory
                else:
                    files_to_ingest = set(index)

                # Copy relevant metadata into a single dict for all index
                # entries.
                for file_in_dir in files_to_ingest:
                    # Skip an explicitly specified index file.
                    # This should never happen because an explicit index
                    # file will force ingest of all files in the index
                    # and not use the explicit file list. If somehow
                    # this is not true we continue. Raising an exception
                    # seems like the wrong thing to do since this is harmless.
                    if file_in_dir == index_root_file:
                        self.log.info("Logic error found scanning directory %s. Please file ticket.",
                                      directory)
                        continue
                    if file_in_dir in index:
                        file = directory.join(file_in_dir)
                        if file in index_entries:
                            # ObservationInfo overrides raw metadata
                            if isinstance(index[file_in_dir], ObservationInfo) \
                                    and not isinstance(index_entries[file], ObservationInfo):
                                self.log.warning("File %s already specified in an index file but overriding"
                                                 " with ObservationInfo content from %s",
                                                 file, possible_index_file)
                            else:
                                self.log.warning("File %s already specified in an index file, "
                                                 "ignoring content from %s", file, possible_index_file)
                                # Do nothing in this case
                                continue

                        index_entries[file] = index[file_in_dir]

        # Remove files from list that have index entries and also
        # any files that we determined to be explicit index files
        # or any index files that we failed to read.
        filtered = set(files) - set(index_entries) - good_index_files - bad_index_files

        # The filtered list loses the initial order. Retaining the order
        # is good for testing but does have a cost if there are many
        # files when copying the good values out. A dict would have faster
        # lookups (using the files as keys) but use more memory.
        ordered = [f for f in filtered if f in files]

        return index_entries, ordered, good_index_files, bad_index_files

    def processIndexEntries(self, index_entries):
        """Convert index entries to RawFileData.

        Parameters
        ----------
        index_entries : `dict` [`str`, Any]
            Dict indexed by name of file to ingest and with values either
            raw metadata or translated
            `~astro_metadata_translator.ObservationInfo`.

        Returns
        -------
        data : `list` of `RawFileData`
            Structures containing the metadata extracted from each file,
            as well as the original filenames. All fields will be populated,
            but the `RawFileData.dataId` attributes will be minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
        """
        fileData = []
        for filename, metadata in index_entries.items():
            try:
                datasets = [self._calculate_dataset_info(metadata, filename)]
            except Exception as e:
                self.log.debug("Problem extracting metadata for file %s found in index file: %s",
                               filename, e)
                datasets = []
                formatterClass = Formatter
                instrument = None
                self._on_metadata_failure(filename, e)
                if self.config.failFast:
                    raise RuntimeError(f"Problem extracting metadata for file {filename} "
                                       "found in index file") from e
            else:
                instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId,
                                                                                  filename)
                if instrument is None:
                    datasets = []
            fileData.append(RawFileData(datasets=datasets, filename=filename,
                                        FormatterClass=formatterClass, instrumentClass=instrument))
        return fileData

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `~lsst.daf.butler.DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure.

        This adds the metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            updated to data IDs for which
            `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Tuple[Iterator[RawExposureData], List[str]]:
        """Perform all non-database-updating ingest preprocessing steps.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : `Iterator` [ `RawExposureData` ]
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
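
        Examples
        --------
        A sketch of typical use from within this task; the four-process pool
        is illustrative:

        .. code-block:: python

            exposures, bad_files = self.prep(files, processes=4)
            for exposure in exposures:
                self.log.debug("Prepared %d file(s) for %s",
                               len(exposure.files), exposure.dataId)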

        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]:
            """Filter out bad files and return good with list of bad."""
            good_files = []
            bad_files = []
            for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata", total=len(files)):
                if not fileDatum.datasets:
                    bad_files.append(fileDatum.filename)
                else:
                    good_files.append(fileDatum)
            return good_files, bad_files

        # Look for index files and read them.
        # There should be far fewer index files than data files.
        index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
        if bad_index_files:
            self.log.info("Failed to read the following explicitly requested index files:")
            for bad in sorted(bad_index_files):
                self.log.info("- %s", bad)

        # Now convert all the index file entries to standard form for ingest.
        bad_index_file_data = []
        indexFileData = self.processIndexEntries(index_entries)
        if indexFileData:
            indexFileData, bad_index_file_data = _partition_good_bad(indexFileData)
            self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s"
                          " with %d failure%s",
                          *_log_msg_counter(indexFileData),
                          *_log_msg_counter(good_index_files),
                          *_log_msg_counter(bad_index_file_data))

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        fileData, bad_files = _partition_good_bad(fileData)
        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      *_log_msg_counter(fileData),
                      *_log_msg_counter(bad_files))

        # Combine with data from index files.
        fileData.extend(indexFileData)
        bad_files.extend(bad_index_file_data)
        bad_files.extend(bad_index_files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[FileDataset]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        datasets : `list` of `lsst.daf.butler.FileDataset`
            Per-file structures identifying the files ingested and their
            dataset representation in the data repository.
        """
        datasets = [FileDataset(path=file.filename.abspath(),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return datasets

    def ingestFiles(self, files, *, pool: Optional[Pool] = None, processes: int = 1,
                    run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        bad_files : `list` of `str`
            Paths of files that could not have metadata extracted.
        n_exposures : `int`
            Number of exposures successfully ingested.
        n_exposures_failed : `int`
            Number of exposures whose dimension records could not be
            registered.
        n_ingests_failed : `int`
            Number of exposures whose files could not be ingested.
        """

        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)

        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)

        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):

            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           *_log_msg_counter(exposure.files),
                           exposure.record.instrument, exposure.record.obs_id)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.obs_id, e)
                if self.config.failFast:
                    raise e
                continue

            # Override default run if nothing specified explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    datasets_for_exposure = self.ingestExposureDatasets(exposure, run=this_run)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                if self.config.failFast:
                    raise e
                continue
            else:
                self._on_success(datasets_for_exposure)
                for dataset in datasets_for_exposure:
                    refs.extend(dataset.refs)

            # Success for this exposure.
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)

        return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed

    @timeMethod
    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None,
            file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", group_files: bool = True):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`, `str` or path-like objects
            Paths to the files to be ingested. Can refer to directories.
            Will be made absolute if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.
        file_filter : `str` or `re.Pattern`, optional
            Pattern to use to discover files to ingest within directories.
            The default is to search for FITS files. The regex applies to
            files within the directory.
        group_files : `bool`, optional
            Group files by directory if they have been discovered in
            directories. Will not affect files explicitly provided.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
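
        Examples
        --------
        A minimal sketch of calling this method on a directory of raw FITS
        files; the path is illustrative:

        .. code-block:: python

            # ``butler`` is assumed to be a writeable Butler for the
            # target repository.
            task = RawIngestTask(config=RawIngestConfig(), butler=butler)
            refs = task.run(["/data/raws/2021-08-01/"], processes=4)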

        """

        refs = []
        bad_files = []
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        if group_files:
            for group in ButlerURI.findFileResources(files, file_filter, group_files):
                new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(group, pool=pool,
                                                                                   processes=processes,
                                                                                   run=run)
                refs.extend(new_refs)
                bad_files.extend(bad)
                n_exposures += n_exp
                n_exposures_failed += n_exp_fail
                n_ingests_failed += n_ingest_fail
        else:
            refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
                ButlerURI.findFileResources(files, file_filter, group_files),
                pool=pool,
                processes=processes,
                run=run,
            )

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      *_log_msg_counter(n_exposures),
                      *_log_msg_counter(n_exposures_failed),
                      *_log_msg_counter(n_ingests_failed))
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs