# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Tuple, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField, Field
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file.
    (`list` of `RawFileDatasetInfo`)
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list."""


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
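
    Examples
    --------
    A minimal sketch of how this helper might be used when defining a
    downstream config class (``MyIngestConfig`` is a hypothetical name, not
    part of this module)::

        class MyIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")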

    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "direct": "use URI to ingested file directly in datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    failFast = Field(
        dtype=bool,
        default=False,
        doc="If True, stop ingest as soon as any problem is encountered with any file. "
            "Otherwise problem files will be skipped and logged and a report issued at completion.",
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
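
    Examples
    --------
    A minimal sketch of constructing and running the task against an existing
    Gen3 repository; the repository path, the ``writeable`` argument, and the
    file names below are illustrative placeholders::

        butler = Butler("/path/to/repo", writeable=True)
        task = RawIngestTask(config=RawIngestConfig(), butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])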

    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), butler=self.butler)

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileDatasetInfo.dataId` attributes will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
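
        Examples
        --------
        Illustrative only; ``task`` is an already-constructed `RawIngestTask`
        and the filename is a placeholder::

            fileData = task.extractMetadata("raw_0001.fits")
            if not fileData.datasets:
                print("Metadata extraction failed for this file")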

        """

        # We do not want to stop ingest if we are given a bad file.
        # Instead return a RawFileData with no datasets and allow
        # the caller to report the failure.

        try:
            # Manually merge the primary and "first data" headers here because
            # we do not know in general if an input file has set INHERIT=T.
            phdu = readMetadata(filename, 0)
            header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s: %s", filename, e)
            # Indicate to the caller that we failed to read
            datasets = []
            FormatterClass = Formatter
            instrument = None
            if self.config.failFast:
                raise RuntimeError(f"Problem extracting metadata from file {filename}") from e
        else:
            self.log.debug("Extracted metadata from file %s", filename)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            try:
                instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
            except LookupError as e:
                self.log.warning("Instrument %s for file %s not known to registry",
                                 datasets[0].dataId["instrument"], filename)
                if self.config.failFast:
                    raise RuntimeError(f"Instrument {datasets[0].dataId['instrument']} for"
                                       f" file {filename} not known to registry") from e
                datasets = []
                FormatterClass = Formatter
                instrument = None
            else:
                FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        # To ensure we aren't slowed down for no reason, explicitly
        # list here the properties we need for the schema.
        # Use a dict with boolean values, where True indicates
        # that it is required that we calculate this property.
        ingest_subset = {
            "altaz_begin": False,
            "boresight_rotation_coord": False,
            "boresight_rotation_angle": False,
            "dark_time": False,
            "datetime_begin": True,
            "datetime_end": True,
            "detector_num": True,
            "exposure_group": False,
            "exposure_id": True,
            "exposure_time": True,
            "instrument": True,
            "tracking_radec": False,
            "object": False,
            "observation_counter": False,
            "observation_id": True,
            "observation_reason": False,
            "observation_type": True,
            "observing_day": False,
            "physical_filter": True,
            "science_program": False,
            "visit_id": False,
        }

        obsInfo = ObservationInfo(header, pedantic=False, filename=filename,
                                  required={k for k in ingest_subset if ingest_subset[k]},
                                  subset=set(ingest_subset))

        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        data : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and the nested `RawFileDatasetInfo.dataId`
            attributes updated to data IDs for which
            `DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Tuple[Iterator[RawExposureData], List[str]]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : iterator of `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files from which metadata could not be extracted.
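
        Examples
        --------
        A sketch of consuming the two returned values; ``task`` and ``files``
        are placeholders::

            exposures, bad_files = task.prep(files, processes=4)
            for exposure in exposures:
                print(exposure.dataId, len(exposure.files))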

        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        good_files = []
        bad_files = []
        for fileDatum in fileData:
            if not fileDatum.datasets:
                bad_files.append(fileDatum.filename)
            else:
                good_files.append(fileDatum)
        fileData = good_files

        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      len(fileData), "" if len(fileData) == 1 else "s",
                      len(bad_files), "" if len(bad_files) == 1 else "s")

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
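
        Examples
        --------
        Illustrative only; ``task`` and ``exposure`` are placeholders and the
        run name is hypothetical::

            refs = task.ingestExposureDatasets(exposure, run="HSC/raw/all")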

        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in exposureData:

            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           len(exposure.files), "" if len(exposure.files) == 1 else "s",
                           exposure.record.instrument, exposure.record.obs_id)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.obs_id, e)
                if self.config.failFast:
                    raise e
                continue

            # Use the instrument's default raw ingest run if no run was
            # specified explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
            except Exception as e:
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                if self.config.failFast:
                    raise e
                continue

            # Success for this exposure
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      n_exposures, "" if n_exposures == 1 else "s",
                      n_exposures_failed, "" if n_exposures_failed == 1 else "s",
                      n_ingests_failed, "" if n_ingests_failed == 1 else "s")
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s",
                      len(refs), "" if len(refs) == 1 else "s")

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs