# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file.
    (`list` of `RawFileDatasetInfo`)
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
85 """Formatter class that should be used to ingest this file (`type`; as 

86 subclass of `FitsRawFormatterBase`). 

87 """ 

88 

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list."""


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field; ``"auto"`` if not specified.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
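
    Examples
    --------
    A minimal sketch of typical use; the enclosing config class here is
    illustrative and not part of this module::

        class ExampleIngestConfig(Config):  # illustrative config class
            transfer = makeTransferChoiceField(default="link")

        config = ExampleIngestConfig()
        config.transfer = "copy"  # any of the allowed choices listed above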

139 """ 

140 return ChoiceField( 

141 doc=doc, 

142 dtype=str, 

143 allowed={"move": "move", 

144 "copy": "copy", 

145 "auto": "choice will depend on datastore", 

146 "link": "hard link falling back to symbolic link", 

147 "hardlink": "hard link", 

148 "symlink": "symbolic (soft) link", 

149 "relsymlink": "relative symbolic link", 

150 }, 

151 optional=True, 

152 default=default 

153 ) 

154 

155 

156class RawIngestConfig(Config): 

157 transfer = makeTransferChoiceField() 

158 

159 

160class RawIngestTask(Task): 

161 """Driver Task for ingesting raw data into Gen3 Butler repositories. 

162 

163 Parameters 

164 ---------- 

165 config : `RawIngestConfig` 

166 Configuration for the task. 

167 butler : `~lsst.daf.butler.Butler` 

168 Writeable butler instance, with ``butler.run`` set to the appropriate 

169 `~lsst.daf.butler.CollectionType.RUN` collection for these raw 

170 datasets. 

171 **kwargs 

172 Additional keyword arguments are forwarded to the `lsst.pipe.base.Task` 

173 constructor. 

174 

175 Notes 

176 ----- 

177 Each instance of `RawIngestTask` writes to the same Butler. Each 

178 invocation of `RawIngestTask.run` ingests a list of files. 
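
    A short usage sketch; the repository path and file names below are
    illustrative placeholders::

        butler = Butler("/path/to/repo", writeable=True)  # placeholder repo path
        task = RawIngestTask(config=RawIngestConfig(), butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])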

179 """ 

180 

181 ConfigClass = RawIngestConfig 

182 

183 _DefaultName = "ingest" 

184 

185 def getDatasetType(self): 

186 """Return the DatasetType of the datasets ingested by this Task. 

187 """ 

188 return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure", 

189 universe=self.butler.registry.dimensions) 

190 

191 def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any): 

192 config.validate() # Not a CmdlineTask nor PipelineTask, so have to validate the config here. 

193 super().__init__(config, **kwargs) 

194 self.butler = butler 

195 self.universe = self.butler.registry.dimensions 

196 self.datasetType = self.getDatasetType() 

197 

        # Import all the instrument classes to ensure that all the
        # relevant metadata translators are loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), butler=self.butler)

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the nested `RawFileDatasetInfo.dataId` attributes will be
            minimal (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """

        # We do not want to stop ingest if we are given a bad file.
        # Instead return a RawFileData with no datasets and allow
        # the caller to report the failure.

        try:
            # Manually merge the primary and "first data" headers here because
            # we do not know in general if an input file has set INHERIT=T.
            phdu = readMetadata(filename, 0)
            header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s: %s", filename, e)
            # Indicate to the caller that we failed to read
            datasets = []
            FormatterClass = Formatter
            instrument = None
        else:
            self.log.debug("Extracted metadata from file %s", filename)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            try:
                instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
            except LookupError:
                self.log.warning("Instrument %s for file %s not known to registry",
                                 datasets[0].dataId["instrument"], filename)
                datasets = []
                FormatterClass = Formatter
                instrument = None
            else:
                FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The data ID and observation information associated with this
            dataset.
        """
        # To ensure we aren't slowed down for no reason, explicitly
        # list here the properties we need for the schema.
        # Use a dict whose values are booleans, with `True` indicating
        # that the property is required.
        ingest_subset = {
            "altaz_begin": False,
            "boresight_rotation_coord": False,
            "boresight_rotation_angle": False,
            "dark_time": False,
            "datetime_begin": True,
            "datetime_end": True,
            "detector_num": True,
            "exposure_group": False,
            "exposure_id": True,
            "exposure_time": True,
            "instrument": True,
            "tracking_radec": False,
            "object": False,
            "observation_counter": False,
            "observation_id": True,
            "observation_reason": False,
            "observation_type": True,
            "observing_day": False,
            "physical_filter": True,
            "science_program": False,
            "visit_id": False,
        }

        obsInfo = ObservationInfo(header, pedantic=False, filename=filename,
                                  required={k for k in ingest_subset if ingest_subset[k]},
                                  subset=set(ingest_subset))

        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes updated to data IDs for which
            `DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : iterator of `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
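
        Examples
        --------
        The two values are returned together, so a typical caller unpacks
        them; ``task`` and ``file_list`` here are illustrative names::

            exposures, bad_files = task.prep(file_list, processes=2)
            for exposure in exposures:
                print(exposure.dataId, len(exposure.files))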

409 """ 

410 if pool is None and processes > 1: 

411 pool = Pool(processes) 

412 mapFunc = map if pool is None else pool.imap_unordered 

413 

        # Extract metadata from each file. This could run in a subprocess
        # so collect all output before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting
        good_files = []
        bad_files = []
        for fileDatum in fileData:
            if not fileDatum.datasets:
                bad_files.append(fileDatum.filename)
            else:
                good_files.append(fileDatum)
        fileData = good_files

        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      len(fileData), "" if len(fileData) == 1 else "s",
                      len(bad_files), "" if len(bad_files) == 1 else "s")

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
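
        A sketch of a typical call; the file paths, process count, and
        output collection name are illustrative, and ``task`` is a
        `RawIngestTask` constructed as shown in the class documentation::

            refs = task.run(["raw_0001.fits", "raw_0002.fits"],
                            processes=4,
                            run="DECam/raw/example")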

518 """ 

519 exposureData, bad_files = self.prep(files, pool=pool, processes=processes) 

520 # Up to this point, we haven't modified the data repository at all. 

521 # Now we finally do that, with one transaction per exposure. This is 

522 # not parallelized at present because the performance of this step is 

523 # limited by the database server. That may or may not change in the 

524 # future once we increase our usage of bulk inserts and reduce our 

525 # usage of savepoints; we've tried to get everything but the database 

526 # operations done in advance to reduce the time spent inside 

527 # transactions. 

528 self.butler.registry.registerDatasetType(self.datasetType) 

529 refs = [] 

530 runs = set() 

531 n_exposures = 0 

532 n_exposures_failed = 0 

533 n_ingests_failed = 0 

534 for exposure in exposureData: 

535 

536 self.log.debug("Attempting to ingest %d file%s from exposure %s:%s", 

537 len(exposure.files), "" if len(exposure.files) == 1 else "s", 

538 exposure.record.instrument, exposure.record.obs_id) 

539 

540 try: 

541 self.butler.registry.syncDimensionData("exposure", exposure.record) 

542 except Exception as e: 

543 n_exposures_failed += 1 

544 self.log.warning("Exposure %s:%s could not be registered: %s", 

545 exposure.record.instrument, exposure.record.obs_id, e) 

546 continue 

547 

            # Use the instrument's default raw-ingest run if none was specified explicitly
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
            except Exception as e:
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                continue

            # Success for this exposure
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      n_exposures, "" if n_exposures == 1 else "s",
                      n_exposures_failed, "" if n_exposures_failed == 1 else "s",
                      n_ingests_failed, "" if n_ingests_failed == 1 else "s")
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s",
                      len(refs), "" if len(refs) == 1 else "s")

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs