
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from .fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; a subclass of `FitsRawFormatterBase`).
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally
    "visit" and "visit_detector_region"), while values are lists of
    `DimensionRecord`.

    May be `None` during some ingest steps.
    """


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region"
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )
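

# A minimal configuration sketch (not part of this module). The instrument
# string below is a hypothetical placeholder for any importable `Instrument`
# subclass, and ``transfer`` must be one of the choices defined by
# ``makeTransferChoiceField`` above:
#
#     config = RawIngestConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"  # hypothetical
#     config.transfer = "symlink"
#     config.padRegionAmount = 50  # pad detector bounding boxes by 50 pixels
#     config.validate()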


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that `Task`
    provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the
        `lsst.pipe.base.Task` constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()
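
    # Construction sketch (illustrative only; the repository path and run
    # collection below are hypothetical, and ``config`` is assumed to be a
    # validated `RawIngestConfig` like the one sketched above):
    #
    #     butler = Butler("/path/to/repo", run="raws")
    #     task = RawIngestTask(config=config, butler=butler)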

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileDatasetInfo.dataId` attributes will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)
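
    # Note on the header merge in ``extractMetadata`` above: the intent
    # (assuming ``mode="overwrite"`` keeps the value from the later header
    # when keys collide) is that the per-HDU header takes precedence over
    # the primary header, approximating what INHERIT=T would give.
    # Schematically:
    #
    #     merge_headers([primary, first_data_hdu], mode="overwrite")
    #     # -> primary keywords, overridden by first_data_hdu where both define a key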

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type` as subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region
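
    # Schematic of the region computed above: the detector bounding box is
    # optionally grown by ``padRegionAmount`` pixels on every side, its four
    # corners are mapped through the WCS to unit-sphere vectors, and those
    # vectors define the convex polygon:
    #
    #     pixBox.grow(pad)  # (x0 - pad, y0 - pad) .. (x1 + pad, y1 + pad)
    #     ConvexPolygon([wcs.pixelToSky(c).getVector() for c in pixBox.getCorners()])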

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        firstDataset = firstFile.datasets[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
        }
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(f"Inconsistent visit/exposure relationship for "
                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
                                         f"{file.filename} and {firstFile.filename}: "
                                         f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
                                      dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure
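
    # Shape of the ``records`` dict built above (schematic): the "visit" and
    # "visit_detector_region" entries are only present when the exposure has
    # a visit_id, and there is one visit_detector_region record per dataset
    # that has a region.
    #
    #     {
    #         "exposure": [<exposure record>],
    #         "visit": [<visit record>],
    #         "visit_detector_region": [<record per detector>, ...],
    #     }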

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes containing `~lsst.daf.butler.ExpandedDataCoordinate`
            instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one at
        # a time) in-place and then return the modified instance. We call them
        # as pass-throughs instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work. We use the same variable names to reflect the fact that we
        # consider the arguments to have been consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)
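
    # ``prep`` can also be driven on its own to see what would be ingested;
    # it only issues database SELECTs and never writes (illustrative sketch,
    # assuming ``task`` and ``files`` as in `run` below):
    #
    #     for exposure in task.prep(files, processes=4):
    #         print(exposure.dataId, len(exposure.files))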

    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs
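

# End-to-end usage sketch (illustrative only): the repository path, run
# collection, instrument name, and file list are hypothetical, and the
# instrument, detector, and physical_filter dimension records must already
# exist in the Registry (see `RawIngestTask.run`).
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo", run="raws")
#     config = RawIngestConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"  # hypothetical
#     config.transfer = "symlink"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["exp-0001.fits", "exp-0002.fits"], processes=4)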