# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import os.path
from dataclasses import dataclass, InitVar
from typing import List, Iterator, Iterable, Type, Optional, Any
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, fix_header, merge_headers
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
)
from lsst.pex.config import Config, ChoiceField
from lsst.pipe.base import Task

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


@dataclass
class RawFileDatasetInfo:
    """Structure that holds information about a single dataset within a
    raw file.
    """

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Structure that holds information about a single raw file, used during
    ingest.
    """

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file
    (`list` of `RawFileDatasetInfo`).
    """

    filename: str
    """Name of the file this information was extracted from (`str`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; a
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Type[Instrument]
    """The `Instrument` class associated with this file."""


@dataclass
class RawExposureData:
    """Structure that holds information about a complete raw exposure, used
    during ingest.
    """

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).

    This may be a minimal `~lsst.daf.butler.DataCoordinate` base instance, or
    a complete `~lsst.daf.butler.ExpandedDataCoordinate`.
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)
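    # An illustrative sketch (hypothetical values) of how these structures
    # nest: one RawExposureData per exposure, holding one RawFileData per raw
    # file, each of which holds one or more RawFileDatasetInfo entries:
    #
    #     RawExposureData(
    #         dataId={"instrument": ..., "exposure": ...},
    #         files=[RawFileData(filename="raw_0001.fits",
    #                            datasets=[RawFileDatasetInfo(dataId=..., obsInfo=...)],
    #                            FormatterClass=..., instrumentClass=...),
    #                ...],
    #     )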

def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for how to transfer files between
    data repositories.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the created field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )


class RawIngestConfig(Config):
    transfer = makeTransferChoiceField()
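    # A minimal configuration sketch (not executed here): ``transfer`` accepts
    # any key from the ``allowed`` mapping in ``makeTransferChoiceField``
    # above, e.g.
    #
    #     config = RawIngestConfig()
    #     config.transfer = "symlink"
    #     config.validate()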

class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler, **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def extractMetadata(self, filename: str) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `str`
            Path to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the data IDs in `RawFileData.datasets` will be minimal
            (unexpanded) `DataCoordinate` instances.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.
        """
        # Manually merge the primary and "first data" headers here because we
        # do not know in general if an input file has set INHERIT=T.
        phdu = readMetadata(filename, 0)
        header = merge_headers([phdu, readMetadata(filename)], mode="overwrite")
        fix_header(header)
        datasets = [self._calculate_dataset_info(header, filename)]

        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        instrument = Instrument.fromName(datasets[0].dataId["instrument"], self.butler.registry)
        FormatterClass = instrument.getRawFormatter(datasets[0].dataId)

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=FormatterClass,
                           instrumentClass=instrument)
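    # For a hypothetical file, ``extractMetadata("raw_0001.fits")`` would
    # return a RawFileData whose single RawFileDatasetInfo carries an
    # ObservationInfo plus an unexpanded data ID of the form
    # {"instrument": ..., "exposure": ..., "detector": ...}.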

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : `Mapping`
            Header from the dataset.
        filename : `str`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        obsInfo = ObservationInfo(header)
        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]
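    # For example, two hypothetical files with data IDs
    # {"instrument": "X", "exposure": 1, "detector": 10} and
    # {"instrument": "X", "exposure": 1, "detector": 11} share the
    # exposure-level key {"instrument": "X", "exposure": 1} and therefore end
    # up in a single RawExposureData.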

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure to include
        additional metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileDatasetInfo.dataId`
            attributes containing `~lsst.daf.butler.ExpandedDataCoordinate`
            instances.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data
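    # After expansion the nested data IDs carry their dimension records
    # (``data.dataId.records`` is read above), which is what lets the
    # datastore fill in metadata-dependent path templates during ingest.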

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1) -> Iterator[RawExposureData]:
        """Perform all ingest preprocessing steps that do not involve actually
        modifying the database.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Yields
        ------
        exposure : `RawExposureData`
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        # Extract metadata from each file.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData)
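    # A parallel preprocessing sketch (hypothetical file list; ``prep`` yields
    # lazily, so the pool must stay alive while iterating):
    #
    #     with Pool(4) as pool:
    #         for exposure in task.prep(files, pool=pool):
    #             ...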

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[DatasetRef]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """
        datasets = [FileDataset(path=os.path.abspath(file.filename),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return [ref for dataset in datasets for ref in dataset.refs]

    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """
        exposureData = self.prep(files, pool=pool, processes=processes)
        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)
        refs = []
        runs = set()
        for exposure in exposureData:
            self.butler.registry.syncDimensionData("exposure", exposure.record)
            # Fall back to the instrument's default run if none was specified
            # explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            with self.butler.transaction():
                refs.extend(self.ingestExposureDatasets(exposure, run=this_run))
        return refs
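# A minimal end-to-end usage sketch (repository path and file names are
# hypothetical; assumes the instrument and its detector/physical_filter
# dimension records have already been registered in the repository):
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base import RawIngestTask, RawIngestConfig
#
#     butler = Butler("/path/to/repo", writeable=True)
#     config = RawIngestConfig()
#     config.transfer = "symlink"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["raw_0001.fits", "raw_0002.fits"])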