# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this dataset (`lsst.daf.butler.DataCoordinate`).
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Type[Instrument]
    """The `Instrument` class associated with this file."""


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
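
    Examples
    --------
    A minimal sketch of using this field in a config class (the class and
    attribute names here are illustrative, not part of this module)::

        from lsst.pex.config import Config

        class ExampleIngestConfig(Config):
            # "auto" defers the transfer-mode choice to the datastore.
            transfer = makeTransferChoiceField(default="auto")

        config = ExampleIngestConfig()
        config.transfer = "symlink"  # any key from the ``allowed`` mapping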

    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler.  Each
    invocation of `RawIngestTask.run` ingests a list of files.
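
    Examples
    --------
    A rough sketch of typical use (the repository path and file names are
    illustrative; the instrument and its curated records must already be
    registered in the repository)::

        from lsst.daf.butler import Butler
        from lsst.obs.base import RawIngestConfig, RawIngestTask

        butler = Butler("/path/to/repo", writeable=True)  # illustrative path
        config = RawIngestConfig()
        config.transfer = "symlink"  # see makeTransferChoiceField for options
        task = RawIngestTask(config=config, butler=butler)
        refs = task.run(["raw_0001.fits", "raw_0002.fits"])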

    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    @classmethod
    # WARNING: this method hardcodes the parameters to pipe.base.Task.__init__.
    # Nobody seems to know a way to delegate them to Task code.
    def _makeTask(cls, config: RawIngestConfig, butler: Butler, name: str, parentTask: Task):
        """Construct a RawIngestTask using only positional arguments.

        Parameters
        ----------
        All parameters are as for `RawIngestTask`.
        """
        return cls(config=config, butler=butler, name=name, parentTask=parentTask)

    # Overrides Task.__reduce__
    def __reduce__(self):
        return (self._makeTask, (self.config, self.butler, self._name, self._parentTask))

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename.  All fields will be populated,
            but the data IDs in `RawFileData.datasets` will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file.  Instruments using a single file to store multiple datasets
        must implement their own version of this method.
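
        Examples
        --------
        A brief sketch (``task`` is a constructed `RawIngestTask`; the file
        name is illustrative)::

            data = task.extractMetadata("raw_0001.fits")
            # One RawFileDatasetInfo per dataset in the file; exactly one here.
            dataId = data.datasets[0].dataId
            print(dataId["instrument"], dataId["exposure"], dataId["detector"])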

        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
        FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The data ID and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure.  All fields will be populated.  The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
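
        Examples
        --------
        A brief sketch (``task`` is a constructed `RawIngestTask`; the file
        names are illustrative)::

            fileData = [task.extractMetadata(f)
                        for f in ("raw_0001.fits", "raw_0002.fits")]
            exposures = task.groupByExposure(fileData)
            # Files from the same exposure share one RawExposureData.
            for exp in exposures:
                print(exp.dataId, len(exp.files))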

        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested.  Must have `RawExposureData.record` populated.  Should
            be considered consumed upon return.

        Returns
        -------
        data : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes updated to data IDs for which
            `DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database.  We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs.  This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
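
        Examples
        --------
        A brief sketch (``task`` is a constructed `RawIngestTask`; the file
        names are illustrative)::

            # Extract metadata, group by exposure, and expand data IDs
            # without writing anything to the repository.
            for exposure in task.prep(["raw_0001.fits", "raw_0002.fits"],
                                      processes=4):
                print(exposure.record, len(exposure.files))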

        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata and build per-detector regions.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure.  Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance.  We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested.  Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested.  Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use.  Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested.  The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist.  This allows different files within
        the same exposure to be ingested in different runs.
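
        Examples
        --------
        A brief sketch (paths and the run name are illustrative; ``butler``
        is a writeable Butler for an existing repository)::

            task = RawIngestTask(config=RawIngestConfig(), butler=butler)
            # Use four worker processes and an explicit RUN collection.
            refs = task.run(["raw_0001.fits", "raw_0002.fits"],
                            processes=4, run="ingest/example-run")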

        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure.  This is
        # not parallelized at present because the performance of this step is
        # limited by the database server.  That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        for exposure in exposureData:
            self.butler.registry.syncDimensionData("exposure", exposure.record)
            # Use the instrument's default raw ingest run if none was
            # specified explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            with self.butler.transaction():
                refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
        return refs