
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file.  Can be `None`
    if ``datasets`` is an empty list."""


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)

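# A hedged sketch of how the three structures above nest during ingest; the
# variable names are illustrative only and nothing here is executed:
#
#     exposure_data: RawExposureData
#     file_data = exposure_data.files[0]          # one RawFileData per raw file
#     dataset_info = file_data.datasets[0]        # one RawFileDatasetInfo per dataset
#     dataset_info.dataId, dataset_info.obsInfo   # per-dataset metadata
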

def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "direct": "use URI to ingested file directly in datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default,
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
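
# A minimal configuration sketch (illustrative only): the ``transfer`` field
# created above accepts exactly the modes enumerated in
# ``makeTransferChoiceField``, for example:
#
#     config = RawIngestConfig()
#     config.transfer = "copy"      # instead of the default "auto"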


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler.  Each
    invocation of `RawIngestTask.run` ingests a list of files.
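
    Examples
    --------
    A minimal sketch of typical use; the repository path and file names
    below are hypothetical::

        butler = Butler("/path/to/repo", writeable=True)
        task = RawIngestTask(config=RawIngestConfig(), butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])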

    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle.
        return dict(**super()._reduce_kwargs(), butler=self.butler)

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename.  All fields will be populated,
            but the data IDs of the contained `RawFileDatasetInfo` instances
            will be minimal (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file.  Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """

        # We do not want to stop ingest if we are given a bad file.
        # Instead return a RawFileData with no datasets and allow
        # the caller to report the failure.

        try:
            # Manually merge the primary and "first data" headers here because
            # we do not know in general if an input file has set INHERIT=T.
            phdu = readMetadata(filename, 0)
            header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s: %s", filename, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            FormatterClass = Formatter
            instrument = None
        else:
            self.log.debug("Extracted metadata from file %s", filename)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            try:
                instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
            except LookupError:
                self.log.warning("Instrument %s for file %s not known to registry",
                                 datasets[0].dataId["instrument"], filename)
                datasets = []
                FormatterClass = Formatter
                instrument = None
            else:
                FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        # To ensure we aren't slowed down for no reason, explicitly
        # list here the properties we need for the schema.
        # Use a dict with boolean values, where `True` indicates that the
        # property is required to be calculated.
        ingest_subset = {
            "altaz_begin": False,
            "boresight_rotation_coord": False,
            "boresight_rotation_angle": False,
            "dark_time": False,
            "datetime_begin": True,
            "datetime_end": True,
            "detector_num": True,
            "exposure_group": False,
            "exposure_id": True,
            "exposure_time": True,
            "instrument": True,
            "tracking_radec": False,
            "object": False,
            "observation_counter": False,
            "observation_id": True,
            "observation_reason": False,
            "observation_type": True,
            "observing_day": False,
            "physical_filter": True,
            "science_program": False,
            "visit_id": False,
        }

        obsInfo = ObservationInfo(header, pedantic=False, filename=filename,
                                  required={k for k in ingest_subset if ingest_subset[k]},
                                  subset=set(ingest_subset))

        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure.  All fields will be populated.  The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested.  Must have `RawExposureData.record` populated.  Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes updated to data IDs for which
            `DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database.  We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs.  This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : iterator of `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files from which metadata could not be extracted.
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        good_files = []
        bad_files = []
        for fileDatum in fileData:
            if not fileDatum.datasets:
                bad_files.append(fileDatum.filename)
            else:
                good_files.append(fileDatum)
        fileData = good_files

        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      len(fileData), "" if len(fileData) == 1 else "s",
                      len(bad_files), "" if len(bad_files) == 1 else "s")

        # Use that metadata to group files (and extracted metadata) by
        # exposure.  Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance.  We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files
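    # A minimal sketch of calling ``prep`` directly (file names are
    # hypothetical); the ``run`` method below wraps this same sequence and
    # then performs the actual database inserts:
    #
    #     exposures, bad_files = task.prep(["raw_0001.fits"], processes=4)
    #     for exposure in exposures:
    #         print(exposure.dataId, len(exposure.files))
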

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested.  Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested.  The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist.  This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure.  This is
        # not parallelized at present because the performance of this step is
        # limited by the database server.  That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in exposureData:

            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           len(exposure.files), "" if len(exposure.files) == 1 else "s",
                           exposure.record.instrument, exposure.record.obs_id)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.obs_id, e)
                continue

            # Use the instrument's default run if none was specified
            # explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
            except Exception as e:
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                continue

            # Success for this exposure.
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      n_exposures, "" if n_exposures == 1 else "s",
                      n_exposures_failed, "" if n_exposures_failed == 1 else "s",
                      n_ingests_failed, "" if n_ingests_failed == 1 else "s")
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s",
                      len(refs), "" if len(refs) == 1 else "s")

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs