# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.obs.base.instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; a subclass of `FitsRawFormatterBase`).
    """


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally
    "visit" and "visit_detector_region"), while values are lists of
    `DimensionRecord`.

    May be `None` during some ingest steps.
    """


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with the specified number of pixels before calculating the region"
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that the Task
    framework provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the
        `lsst.pipe.base.Task` constructor.
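    Examples
    --------
    A minimal usage sketch; the repository path, run name, and instrument
    class name below are illustrative placeholders, not values defined by
    this module::

        from lsst.daf.butler import Butler

        butler = Butler("/path/to/repo", run="raw")
        config = RawIngestTask.ConfigClass()
        config.instrument = "my_obs_package.MyInstrument"  # hypothetical name
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["raw1.fits", "raw2.fits"])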

    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
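        # doImport resolves the fully-qualified class name given in the
        # config; calling the resulting class object instantiates the
        # Instrument.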

        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the data IDs in `RawFileData.datasets` will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

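        # The region below is None when the exposure has no visit or no
        # tracking coordinates (e.g. calibration frames); in that case no
        # visit_detector_region record will be generated for it later.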
        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type`, a subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
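            # Map the (possibly padded) bounding-box corners through the WCS
            # to unit vectors on the sky; their convex hull is the detector
            # region.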
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file
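            # Subsetting the data ID to the exposure dimensions
            # (instrument+exposure) strips the detector, so every detector
            # from the same exposure maps to the same grouping key.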
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        firstDataset = firstFile.datasets[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
        }
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(f"Inconsistent visit/exposure relationship for "
                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
                                         f"{file.filename} and {firstFile.filename}: "
                                         f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
                                      dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )

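            # The visit's region is the convex hull of every per-detector
            # region's vertices; if none were available, the visit record
            # gets no region at all.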
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and the nested `RawFileDatasetInfo.dataId`
            attributes containing `~lsst.daf.butler.ExpandedDataCoordinate`
            instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered
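        # ``imap_unordered`` is safe here because the downstream grouping by
        # exposure does not depend on the order in which files are processed.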

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one at
        # a time) in-place and then return the modified instance. We call them
        # as pass-throughs instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work. We use the same variable names to reflect the fact that we
        # consider the arguments to have been consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler

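        # One FileDataset is created per raw file, carrying a DatasetRef for
        # every dataset in that file; the butler then ingests them all in a
        # single call using the configured transfer mode.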
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.

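        # Registering the dataset type is harmless if an identical "raw"
        # dataset type was already registered by a previous run.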
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs