#
# This file is part of ap_verify.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""Data ingestion for ap_verify.

This module handles ingestion of a dataset into an appropriate repository, so
that pipeline code need not be aware of the dataset framework.
"""

__all__ = ["DatasetIngestConfig", "ingestDataset"]

import fnmatch
import os
import shutil
import sqlite3
import tarfile
from glob import glob

import lsst.utils
import lsst.log
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase

from lsst.pipe.tasks.ingest import IngestTask
from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask


class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.",
    )

    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor.",
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )


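# A hypothetical dataset override file (``config/datasetIngest.py`` inside a
# dataset package; the file name follows DatasetIngestTask._DefaultName) might
# customize the config above like this. The file names and refcat name below
# are made up for illustration only:
#
#     config.dataBadFiles.append("*_focus*.fits.fz")
#     config.calibValidity = 999
#     config.refcats = {"gaia": "gaia_refcat.tar.gz"}
#     config.curatedCalibPaths.append("/path/to/obs_camera_data/camera/defects")
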

class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # Ingest tasks are assumed to always hand absolute paths to the Butler.
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain
            paths. May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

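    # For illustration: with hypothetical inputs, the ``args`` list built above
    # corresponds to handing IngestTask's argument parser something like
    #
    #     <repo> --calib <calibRepo> --mode link raw/v1.fits raw/v2.fits \
    #         --badFile "*_bad.fits"
    #
    # where the raw file names are stand-ins, not part of any real dataset.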

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

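    # Similarly, with the default ``calibValidity`` the call above hands
    # IngestCalibsTask's argument parser something along the lines of
    # (paths and file names hypothetical):
    #
    #     <repo> --calib <calibRepo> --output <calibRepo>/dummy --mode link \
    #         --validity 9999 calib/flat-g.fits calib/bias.fits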

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs listed in ``curatedCalibPaths``. Butler
        operations on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.
        """
        for curated in self.config.curatedCalibPaths:
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is typically
            a path in ``obs_*_data``.
        """
        curatedArgs = [repo, curatedPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedArgs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the refcat Community thread
        (https://community.lsst.org/t/1523) for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

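    # For illustration: a hypothetical ``config.refcats`` entry of
    # ``{"gaia": "gaia_refcat.tar.gz"}`` causes ``<refcats>/gaia_refcat.tar.gz``
    # to be unpacked into ``<repo>/ref_cats/gaia/``, which is where a
    # ``refObjLoader`` expects to find the sharded catalog files.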

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing the config directory.
        """
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular directory.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)


def ingestDataset(dataset, workspace):
    """Ingest the contents of a dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.Workspace`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = DatasetIngestTask(config=_getConfig(dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")


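# A minimal usage sketch, assuming an installed dataset package and a fresh
# workspace directory; the constructor arguments shown for Dataset and
# Workspace are illustrative, not authoritative:
#
#     from lsst.ap.verify.dataset import Dataset
#     from lsst.ap.verify.workspace import Workspace
#
#     dataset = Dataset("my_dataset")        # hypothetical dataset name
#     workspace = Workspace("my_workspace")  # hypothetical output directory
#     ingestDataset(dataset, workspace)
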

def _getConfig(dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : `DatasetIngestConfig`
        The config for running `DatasetIngestTask` on ``dataset``.
    """
    overrideFile = DatasetIngestTask._DefaultName + ".py"
    packageDir = lsst.utils.getPackageDir(dataset.obsPackage)

    config = DatasetIngestTask.ConfigClass()
    for path in [
        os.path.join(packageDir, 'config'),
        os.path.join(packageDir, 'config', dataset.camera),
        dataset.configLocation,
    ]:
        overridePath = os.path.join(path, overrideFile)
        if os.path.exists(overridePath):
            config.load(overridePath)
    return config


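# For a hypothetical dataset with ``obsPackage="obs_example"`` and
# ``camera="example"``, the loop above applies ``datasetIngest.py`` overrides
# in this order, with later files taking precedence:
#
#     <obs_example>/config/datasetIngest.py
#     <obs_example>/config/example/datasetIngest.py
#     <dataset.configLocation>/datasetIngest.py
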

def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.ingest.IngestTask`
        The task to run.
    args : `list` of `str`
        The command-line arguments for ``task``, already split into individual
        tokens. Must be compatible with ``task.ArgumentParser``.
    """
    argumentParser = task.ArgumentParser(name=task.getName())
    try:
        parsedCmd = argumentParser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied
        raise ValueError("Invalid ingestion arguments: %s" % args) from e
    task.run(parsedCmd)


def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of filename patterns (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filename patterns (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``.
    """
    _exclude = exclude if exclude is not None else []

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    for pattern in _exclude:
        excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
        allFiles.difference_update(excludedFiles)
    return allFiles
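

# For illustration, with a hypothetical directory layout:
#
#     _findMatchingFiles("/data/raws", ["*.fits", "*.fz"], exclude=["*_bad.fits"])
#
# returns every .fits and .fz file under /data/raws and its subdirectories
# except those whose basename matches "*_bad.fits".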