# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
import itertools
from dataclasses import dataclass
from typing import List, Dict, Iterator, Iterable, Type, Optional, Any, Mapping
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.utils import doImport
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    FileDataset,
)
from lsst.geom import Box2D
from lsst.pex.config import Config, Field, ChoiceField
from lsst.pipe.base import Task
from lsst.sphgeom import ConvexPolygon

from .instrument import makeExposureRecordFromObsInfo, makeVisitRecordFromObsInfo
from .fitsRawFormatterBase import FitsRawFormatterBase

51 

@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """

    region: ConvexPolygon
    """Region on the sky covered by this file, possibly with padding
    (`lsst.sphgeom.ConvexPolygon`).
    """

75 

@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file and compute
    a spatial region for it (`type`; a subclass of `FitsRawFormatterBase`).
    """

98 

@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    records: Optional[Dict[str, List[DimensionRecord]]] = None
    """Dictionary containing `DimensionRecord` instances that must be inserted
    into the `~lsst.daf.butler.Registry` prior to file-level ingest (`dict`).

    Keys are the names of dimension elements ("exposure" and optionally "visit"
    and "visit_detector_region"), while values are lists of `DimensionRecord`.

    May be `None` during some ingest steps.
    """

126 

def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )
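

# Illustrative sketch (not part of the original module): any pex_config
# ``Config`` can reuse this factory, optionally overriding the default
# transfer mode. ``_ExampleTransferConfig`` is a hypothetical name used only
# to demonstrate the call; the allowed values map directly onto the transfer
# modes accepted by `lsst.daf.butler.Datastore.ingest`.
class _ExampleTransferConfig(Config):
    """Hypothetical config demonstrating `makeTransferChoiceField` with a
    non-default transfer mode.
    """

    transfer = makeTransferChoiceField(default="symlink")
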

class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
    padRegionAmount = Field(
        dtype=int,
        default=0,
        doc="Pad the image bounding box by this many pixels before calculating the sky region."
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass to "
             "associate with all raws."),
        dtype=str,
        optional=False,
        default=None,
    )

175 

class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    This Task is intended to be runnable from the command-line, but it doesn't
    meet the other requirements of CmdLineTask or PipelineTask, and wouldn't
    gain much from being one. It also wouldn't really be appropriate as a
    subtask of a CmdLineTask or PipelineTask; it's a Task essentially just to
    leverage the logging and configurability functionality that Task provides.

    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Butler instance. Ingested Datasets will be created as part of
        ``butler.run`` and associated with its Collection.
    kwds
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the Datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwds: Any):
        config.validate()  # Not a CmdLineTask nor PipelineTask, so we must validate the config here.
        super().__init__(config, **kwds)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.instrument = doImport(self.config.instrument)()
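        # The line above resolves the fully-qualified Instrument class name
        # from the config with ``doImport`` and instantiates it immediately.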

        # For now, we get a nominal Camera from the Instrument.
        # In the future, we may want to load one from a Butler calibration
        # collection that's appropriate for the observation timestamp of
        # the exposure.
        self.camera = self.instrument.getCamera()
        self.datasetType = self.getDatasetType()

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileDatasetInfo.dataId` attributes will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        FormatterClass = self.instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The region, dataId, and observation information associated with
            this dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        if obsInfo.instrument != self.instrument.getName():
            raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                             f"got {obsInfo.instrument}) for file {filename}.")

        FormatterClass = self.instrument.getRawFormatter(dataId)
        region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
        return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)

    def _calculate_region_from_dataset_metadata(self, obsInfo, header, FormatterClass):
        """Calculate the sky region covered by the supplied observation
        information.

        Parameters
        ----------
        obsInfo : `~astro_metadata_translator.ObservationInfo`
            Summary information of this dataset.
        header : `Mapping`
            Header from the dataset.
        FormatterClass : `type`, a subclass of `FitsRawFormatterBase`
            Formatter class that should be used to compute the spatial region.

        Returns
        -------
        region : `lsst.sphgeom.ConvexPolygon`
            Region of sky covered by this observation.
        """
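        # A region can only be computed for observations that are associated
        # with a visit and have an on-sky tracking position; anything else
        # (most calibration frames, for example) falls through to
        # ``region = None``.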

        if obsInfo.visit_id is not None and obsInfo.tracking_radec is not None:
            formatter = FormatterClass.fromMetadata(metadata=header, obsInfo=obsInfo)
            visitInfo = formatter.makeVisitInfo()
            detector = self.camera[obsInfo.detector_num]
            wcs = formatter.makeWcs(visitInfo, detector)
            pixBox = Box2D(detector.getBBox())
            if self.config.padRegionAmount > 0:
                pixBox.grow(self.config.padRegionAmount)
            pixCorners = pixBox.getCorners()
            sphCorners = [wcs.pixelToSky(point).getVector() for point in pixCorners]
            region = ConvexPolygon(sphCorners)
        else:
            region = None
        return region

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. The `RawExposureData.records` attributes of elements
            will be `None`, but all other fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles)
                for dataId, exposureFiles in byExposure.items()]

    def collectDimensionRecords(self, exposure: RawExposureData) -> RawExposureData:
        """Collect the `DimensionRecord` instances that must be inserted into
        the `~lsst.daf.butler.Registry` before an exposure's raw files may be
        ingested.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Should be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.records` populated.
        """
        firstFile = exposure.files[0]
        firstDataset = firstFile.datasets[0]
        VisitDetectorRegionRecordClass = self.universe["visit_detector_region"].RecordClass
        exposure.records = {
            "exposure": [makeExposureRecordFromObsInfo(firstDataset.obsInfo, self.universe)],
        }
        if firstDataset.obsInfo.visit_id is not None:
            exposure.records["visit_detector_region"] = []
            visitVertices = []
            for file in exposure.files:
                for dataset in file.datasets:
                    if dataset.obsInfo.visit_id != firstDataset.obsInfo.visit_id:
                        raise ValueError(f"Inconsistent visit/exposure relationship for "
                                         f"exposure {firstDataset.obsInfo.exposure_id} between "
                                         f"{file.filename} and {firstFile.filename}: "
                                         f"{dataset.obsInfo.visit_id} != {firstDataset.obsInfo.visit_id}.")
                    if dataset.region is None:
                        self.log.warn("No region found for visit=%s, detector=%s.", dataset.obsInfo.visit_id,
                                      dataset.obsInfo.detector_num)
                        continue
                    visitVertices.extend(dataset.region.getVertices())
                    exposure.records["visit_detector_region"].append(
                        VisitDetectorRegionRecordClass.fromDict({
                            "instrument": dataset.obsInfo.instrument,
                            "visit": dataset.obsInfo.visit_id,
                            "detector": dataset.obsInfo.detector_num,
                            "region": dataset.region,
                        })
                    )
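            # Combine the vertices of all per-detector regions accumulated
            # above into a single visit-level region (if any were found).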

            if visitVertices:
                visitRegion = ConvexPolygon(visitVertices)
            else:
                self.log.warn("No region found for visit=%s.", firstDataset.obsInfo.visit_id)
                visitRegion = None
            exposure.records["visit"] = [
                makeVisitRecordFromObsInfo(firstDataset.obsInfo, self.universe, region=visitRegion)
            ]
        return exposure

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated. Should
            be considered consumed upon return.

        Returns
        -------
        data : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes containing `~lsst.daf.butler.ExpandedDataCoordinate`
            instances.
        """
        hasVisit = "visit" in data.records
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                "exposure": data.records["exposure"][0],
                "visit": data.records["visit"][0] if hasVisit else None,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion as well as the visit_detector_region record, if there is
        # one.
        vdrRecords = data.records["visit_detector_region"] if hasVisit else itertools.repeat(None)
        for file, vdrRecord in zip(data.files, vdrRecords):
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records, visit_detector_region=vdrRecord)
                )
        return data

    def prep(self, files, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next few operations operate on RawExposureData instances (one at
        # a time) in-place and then return the modified instance. We call them
        # as pass-throughs instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work. We use the same variable names to reflect the fact that we
        # consider the arguments to have been consumed/invalidated.

        # Extract DimensionRecords from the metadata that will need to be
        # inserted into the Registry before the raw datasets themselves are
        # ingested.
        exposureData: Iterator[RawExposureData] = mapFunc(self.collectDimensionRecords, exposureData)

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

    def insertDimensionData(self, records: Mapping[str, List[DimensionRecord]]):
        """Insert dimension records for one or more exposures.

        Parameters
        ----------
        records : `dict` mapping `str` to `list`
            Dimension records to be inserted, organized as a mapping from
            dimension name to a list of records for that dimension. This
            may be a single `RawExposureData.records` dict, or an aggregate
            for multiple exposures created by concatenating the value lists
            of those dictionaries.
        """
        # TODO: This currently assumes that either duplicate inserts of
        # visit records are ignored, or there is exactly one visit per
        # exposure. I expect us to switch up the visit-exposure
        # relationship and hence rewrite some of this code before that
        # becomes a practical problem.
        # Iterate over dimensions explicitly to order for foreign key
        # relationships.
        for dimension in ("visit", "exposure", "visit_detector_region"):
            recordsForDimension = records.get(dimension)
            if recordsForDimension:
                # TODO: once Registry has options to ignore or replace
                # existing dimension records with the same primary keys
                # instead of aborting on conflicts, add configuration
                # options and logic to use them.
                self.butler.registry.insertDimensionData(dimension, *recordsForDimension)

    def ingestExposureDatasets(self, exposure: RawExposureData, butler: Optional[Butler] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.records` populated and all
            data ID attributes expanded.
        butler : `lsst.daf.butler.Butler`, optional
            Butler to use for ingest. If not provided, ``self.butler`` will
            be used.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        if butler is None:
            butler = self.butler
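        # Each raw file becomes one FileDataset carrying the refs for every
        # dataset in that file plus the formatter needed to read it; the
        # datastore transfer mode comes from the task configuration.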

        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        butler.ingest(*datasets, transfer=self.config.transfer)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, pool: Optional[Pool] = None, processes: int = 1):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all records (dimensions and datasets) for an
        exposure within a transaction, guaranteeing that partial exposures
        are never ingested.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        for exposure in exposureData:
            with self.butler.transaction():
                self.insertDimensionData(exposure.records)
                refs.extend(self.ingestExposureDatasets(exposure))
        return refs
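

# Minimal usage sketch (not part of the original module): everything below is
# illustrative only. The repository path, run name, instrument class name, and
# file list are hypothetical placeholders, and the sketch assumes a Gen3
# repository whose instrument, detector, and physical_filter dimensions have
# already been registered.
def _exampleIngest():
    # Point a Butler at an existing Gen3 repository and choose the run that
    # will own the ingested raws.
    butler = Butler("/path/to/repo", run="raw/example")  # hypothetical path and run
    config = RawIngestConfig()
    config.transfer = "symlink"  # any mode allowed by makeTransferChoiceField
    config.instrument = "my_package.MyInstrument"  # hypothetical Instrument subclass
    task = RawIngestTask(config=config, butler=butler)
    # run() returns the DatasetRefs of the ingested raws.
    return task.run(["/data/raw_0001.fits", "/data/raw_0002.fits"])  # hypothetical files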