# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from .fitsRawFormatterBase import FitsRawFormatterBase

@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """

@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`, a subclass of `FitsRawFormatterBase`).
    """

@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure", and optionally
    "visit" and "visit_detector_region"), while values are lists of
    `DimensionRecord` instances.

    May be `None` during some ingest steps.
    """

def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field; `None` means no transfer.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
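    Examples
    --------
    A minimal sketch of embedding the field in a `lsst.pex.config.Config`
    subclass; the ``MyIngestConfig`` name is purely illustrative:

    >>> from lsst.pex.config import Config
    >>> class MyIngestConfig(Config):
    ...     transfer = makeTransferChoiceField(default="symlink")
    >>> config = MyIngestConfig()
    >>> config.transfer
    'symlink'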

    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link"},
        optional=True,
        default=default
    )

class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad an image with specified number of pixels before calculating region"
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )

class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that `Task`
    provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.
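    Examples
    --------
    A minimal usage sketch (not a runnable doctest), assuming an existing
    Gen3 repository at the hypothetical path ``/path/to/repo`` with a run
    collection named ``raw/all``, and with the instrument, detector, and
    physical_filter dimension records already present in its Registry::

        from lsst.daf.butler import Butler
        from lsst.obs.base import RawIngestConfig, RawIngestTask

        butler = Butler("/path/to/repo", run="raw/all")
        config = RawIngestConfig()
        # Fully-qualified Instrument class name; the value shown is
        # illustrative only.
        config.instrument = "lsst.obs.subaru.HyperSuprimeCam"
        config.transfer = "symlink"
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])  # hypothetical files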

    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        config.validate()  # Not a CmdLineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the nested `RawFileDatasetInfo.dataId` attributes will be
            minimal (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
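        Examples
        --------
        A minimal sketch of inspecting the returned structure; the file name
        is hypothetical and ``task`` is an already-constructed
        `RawIngestTask`::

            fileData = task.extractMetadata("raw_0001.fits")
            for datasetInfo in fileData.datasets:
                print(datasetInfo.dataId, datasetInfo.region)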

        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type`, a subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon` or `None`
            Region of sky covered by this observation, or `None` if the
            observation has no visit or no tracking coordinates.
        """
        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        firstDataset = firstFile.datasets[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
        }
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(f"Inconsistent visit/exposure relationship for "
                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
                                         f"{file.filename} and {firstFile.filename}: "
                                         f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
                                      dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )
            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes containing `~lsst.daf.butler.ExpandedDataCoordinate`
            instances.
        """

        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
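        Examples
        --------
        A minimal sketch of inspecting the per-exposure structures without
        modifying the database; the file names are hypothetical and ``task``
        is an already-constructed `RawIngestTask`::

            for exposure in task.prep(["raw_0001.fits", "raw_0002.fits"], processes=2):
                print(exposure.dataId, len(exposure.files), sorted(exposure.records))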

        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one at
        # a time) in-place and then return the modified instance. We call them
        # as pass-throughs instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work. We use the same variable names to reflect the fact that we
        # consider the arguments to have been consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
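        Examples
        --------
        A minimal sketch of building such an aggregate from several
        `RawExposureData` instances; ``exposures`` is a hypothetical list
        produced by `prep`, and ``task`` is an already-constructed
        `RawIngestTask`::

            from collections import defaultdict

            aggregated = defaultdict(list)
            for exposure in exposures:
                for element, recordList in exposure.records.items():
                    aggregated[element].extend(recordList)
            task.insertDimensionData(aggregated)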

        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
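        Examples
        --------
        A minimal sketch; the file paths and process count are illustrative,
        and ``task`` is assumed to be a `RawIngestTask` constructed with a
        writeable Butler::

            refs = task.run(["raw_0001.fits", "raw_0002.fits"], processes=4)

        The returned list contains one `~lsst.daf.butler.DatasetRef` per
        ingested dataset.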

        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs