# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list."""


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
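
    Examples
    --------
    A config class can use this helper to declare a transfer option, as
    `RawIngestConfig` below does; for example (``MyIngestConfig`` is
    hypothetical)::

        class MyIngestConfig(Config):
            transfer = makeTransferChoiceField(default="symlink")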

    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
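
    Examples
    --------
    A minimal sketch of typical use (the repository path and file names are
    hypothetical)::

        butler = Butler("/path/to/repo", writeable=True)
        config = RawIngestConfig()
        config.transfer = "symlink"
        task = RawIngestTask(config=config, butler=butler)
        task.run(["raw_0001.fits", "raw_0002.fits"])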

    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    @classmethod
    # WARNING: this method hardcodes the parameters to pipe.base.Task.__init__.
    # Nobody seems to know a way to delegate them to Task code.
    def _makeTask(cls, config: RawIngestConfig, butler: Butler, name: str, parentTask: Task):
        """Construct a RawIngestTask using only positional arguments.

        Parameters
        ----------
        All parameters are as for `RawIngestTask`.
        """
        return cls(config=config, butler=butler, name=name, parentTask=parentTask)

    # Overrides Task.__reduce__
    def __reduce__(self):
        return (self._makeTask, (self.config, self.butler, self._name, self._parentTask))

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the data IDs in `RawFileData.datasets` will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
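
        A file that cannot be read does not stop ingest: the returned
        `RawFileData` simply has an empty ``datasets`` list, so a caller can
        detect the failure with a check like the following sketch (the
        filename is hypothetical)::

            data = task.extractMetadata("raw_0001.fits")
            if not data.datasets:
                print("Could not extract metadata from", data.filename)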

        """

        # We do not want to stop ingest if we are given a bad file.
        # Instead return a RawFileData with no datasets and allow
        # the caller to report the failure.

        try:
            # Manually merge the primary and "first data" headers here because
            # we do not know in general if an input file has set INHERIT=T.
            phdu = readMetadata(filename, 0)
            header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
            fix_header(header)
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s: %s", filename, e)
            # Indicate to the caller that we failed to read
            datasets = []
            FormatterClass = Formatter
            instrument = None
        else:
            self.log.debug("Extracted metadata from file %s", filename)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            try:
                instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
            except LookupError:
                self.log.warning("Instrument %s for file %s not known to registry",
                                 datasets[0].dataId["instrument"], filename)
                datasets = []
                FormatterClass = Formatter
                instrument = None
            else:
                FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId, and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes updated to data IDs for which
            `DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : iterator of `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
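
        Examples
        --------
        A sketch of how the two return values might be consumed (the file
        names are hypothetical)::

            exposures, bad_files = task.prep(["raw_0001.fits", "raw_0002.fits"])
            for exposure in exposures:
                print(exposure.dataId, len(exposure.files))
            for filename in bad_files:
                print("Failed to read", filename)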

        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting
        good_files = []
        bad_files = []
        for fileDatum in fileData:
            if not fileDatum.datasets:
                bad_files.append(fileDatum.filename)
            else:
                good_files.append(fileDatum)
        fileData = good_files

        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      len(fileData), "" if len(fileData) == 1 else "s",
                      len(bad_files), "" if len(bad_files) == 1 else "s")

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
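
        Examples
        --------
        A sketch of overriding the output run collection and parallelizing
        metadata extraction (the run name and file names are hypothetical)::

            task.run(["raw_0001.fits", "raw_0002.fits"],
                     processes=4, run="DummyCam/raw/all")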

        """
        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in exposureData:

            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           len(exposure.files), "" if len(exposure.files) == 1 else "s",
                           exposure.record.instrument, exposure.record.name)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.name, e)
                continue

            # Use the instrument's default run if none was specified explicitly
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
            except Exception as e:
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                continue

            # Success for this exposure
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.name)

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      n_exposures, "" if n_exposures == 1 else "s",
                      n_exposures_failed, "" if n_exposures_failed == 1 else "s",
                      n_ingests_failed, "" if n_ingests_failed == 1 else "s")
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s",
                      len(refs), "" if len(refs) == 1 else "s")

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs