#
# This file is part of ap_verify.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""Data ingestion for ap_verify.

This module handles ingestion of a dataset into an appropriate repository, so
that pipeline code need not be aware of the dataset framework.
"""

__all__ = ["DatasetIngestConfig", "ingestDataset"]

import fnmatch
import os
import shutil
import tarfile
from glob import glob
import sqlite3

import lsst.utils
import lsst.log
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase

from lsst.pipe.tasks.ingest import IngestTask
from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask


class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
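
    Notes
    -----
    Datasets normally provide these settings through a ``datasetIngest.py``
    config override file. A minimal sketch of such a file, with purely
    illustrative file names and refcat name::

        config.dataFiles = ["*.fits.fz"]
        config.calibValidity = 9999
        config.refcats = {"gaia": "gaia_refcat.tar.gz"}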

    """

    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    textDefectPath = pexConfig.Field(
        dtype=str,
        default='',
        doc="Path to top level of the defect tree. This is a directory with a directory per sensor."
    )
    defectIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest defects.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )


class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("defectIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestDefects(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
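        # ``args`` now corresponds roughly to an ingestImages.py command line,
        # e.g. "<repo> --calib <calibRepo> --mode link <raw files...>
        # [--badFile <patterns...>]" (script name assumed from the usual obs
        # package conventions for IngestTask).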

        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
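        # ``args`` now corresponds roughly to an ingestCalibs.py command line,
        # e.g. "<repo> --calib <calibRepo> --output <calibRepo>/dummy --mode link
        # --validity 9999 <calib files...>" (script name assumed from the usual
        # obs package conventions for IngestCalibsTask).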

        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestDefects(self, dataset, workspace):
        """Ingest the defect files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all defects from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if defect ingestion requested but no defects found.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "defects")):
            self.log.info("Defects were previously ingested, skipping...")
        else:
            self.log.info("Ingesting defects...")
            self._doIngestDefects(workspace.dataRepo, workspace.calibRepo, self.config.textDefectPath)
            self.log.info("Defects are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestDefects(self, repo, calibRepo, defectPath):
        """Ingest defect images.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        defectPath : `str`
            Path to the defects in standard text form. This is probably a path in ``obs_decam_data``.

        Raises
        ------
        RuntimeError
            Raised if ``defectPath`` exists but is empty.
        """
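        # ``defectPath`` is expected to be the top of a defect tree: one
        # subdirectory per sensor (see the ``textDefectPath`` config field), each
        # holding dated defect files in curated-calib text form (typically
        # ``.ecsv``; exact format is an assumption here, not checked by this code).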

        defectargs = [repo, defectPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.defectIngester, defectargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all defect files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the `refcat Community thread
        <https://community.lsst.org/t/1523>`_ for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
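        # Each configured refcat is unpacked into <repo>/ref_cats/<refcatName>/;
        # for example, a (hypothetical) entry {"gaia": "gaia_refcat.tar.gz"} in
        # ``config.refcats`` would be extracted into ref_cats/gaia/.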

        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing the config directory.
        """
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular directory.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)


def ingestDataset(dataset, workspace):
    """Ingest the contents of a dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.Workspace`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
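
    Examples
    --------
    A minimal sketch of typical use. The `Dataset` and `Workspace` constructor
    arguments shown here are illustrative assumptions, not part of this
    module's API::

        from lsst.ap.verify.dataset import Dataset
        from lsst.ap.verify.workspace import Workspace

        dataset = Dataset("ap_verify_hits2015")    # name of an installed dataset package (assumed)
        workspace = Workspace("workspaces/hits/")  # directory where repositories will be created
        ingestDataset(dataset, workspace)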

    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = DatasetIngestTask(config=_getConfig(dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")


def _getConfig(dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : `DatasetIngestConfig`
        The config for running `DatasetIngestTask` on ``dataset``.
    """
    overrideFile = DatasetIngestTask._DefaultName + ".py"
    packageDir = lsst.utils.getPackageDir(dataset.obsPackage)

    config = DatasetIngestTask.ConfigClass()
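    # Apply datasetIngest.py overrides in order of increasing precedence:
    # obs package defaults, then camera-specific config, then the dataset's own
    # config directory (later loads override earlier ones).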

    for path in [
        os.path.join(packageDir, 'config'),
        os.path.join(packageDir, 'config', dataset.camera),
        dataset.configLocation,
    ]:
        overridePath = os.path.join(path, overrideFile)
        if os.path.exists(overridePath):
            config.load(overridePath)
    return config


def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.ingest.IngestTask`
        The task to run.
    args : `list` of `str`
        The command-line arguments for ``task``, already split into individual
        tokens. Must be compatible with ``task.ArgumentParser``.
    """
    argumentParser = task.ArgumentParser(name=task.getName())
    try:
        parsedCmd = argumentParser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied
        raise ValueError("Invalid ingestion arguments: %s" % args) from e
    task.run(parsedCmd)


def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of files (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``.
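
    Examples
    --------
    A hypothetical call, shown only to illustrate the matching rules::

        # All FITS files anywhere under "raw/", except files whose names
        # contain "focus".
        _findMatchingFiles("raw", ["*.fits", "*.fz"], exclude=["*focus*"])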

    """
    _exclude = exclude if exclude is not None else []

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    for pattern in _exclude:
        excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
        allFiles.difference_update(excludedFiles)
    return allFiles