Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# 

2# This file is part of ap_verify. 

3# 

4# Developed for the LSST Data Management System. 

5# This product includes software developed by the LSST Project 

6# (http://www.lsst.org). 

7# See the COPYRIGHT file at the top-level directory of this distribution 

8# for details of code ownership. 

9# 

10# This program is free software: you can redistribute it and/or modify 

11# it under the terms of the GNU General Public License as published by 

12# the Free Software Foundation, either version 3 of the License, or 

13# (at your option) any later version. 

14# 

15# This program is distributed in the hope that it will be useful, 

16# but WITHOUT ANY WARRANTY; without even the implied warranty of 

17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

18# GNU General Public License for more details. 

19# 

20# You should have received a copy of the GNU General Public License 

21# along with this program. If not, see <http://www.gnu.org/licenses/>. 

22# 

23 

24"""Data ingestion for ap_verify. 

25 

26This module handles ingestion of a dataset into an appropriate repository, so 

27that pipeline code need not be aware of the dataset framework. 

28""" 

29 

# Names exported by a star-import of this module. DatasetIngestTask is not
# listed here; within this module it is constructed and run by ingestDataset.
__all__ = ["DatasetIngestConfig", "ingestDataset"]

31 

32import fnmatch 

33import os 

34import shutil 

35import tarfile 

36from contextlib import contextmanager 

37from glob import glob 

38import sqlite3 

39 

40import lsst.utils 

41import lsst.log 

42import lsst.pex.config as pexConfig 

43import lsst.pipe.base as pipeBase 

44 

45from lsst.pipe.tasks.ingest import IngestTask 

46from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask 

47from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask 

48 

49 

class DatasetIngestConfig(pexConfig.Config):
    """Configuration for `DatasetIngestTask`.

    `DatasetIngestTask` is not meant to be launched from the command line:
    everything it needs is fixed by the dataset being ingested. For that
    reason this config also holds the values that would otherwise be given
    as command-line arguments to `~lsst.pipe.tasks.ingest.IngestTask`.

    The documentation of the relevant ``obs`` package describes which tasks
    the subtasks declared here should be retargeted to.
    """

    # ---- Raw science data ----
    dataIngester = pexConfig.ConfigurableField(
        doc="Task used to perform raw data ingestion.",
        target=IngestTask,
    )
    dataFiles = pexConfig.ListField(
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the dataset.",
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
    )
    dataBadFiles = pexConfig.ListField(
        doc=("Names of raw science files (no path; wildcards allowed) "
             "to not ingest, supersedes ``dataFiles``."),
        dtype=str,
        default=[],
    )

    # ---- Calibration products ----
    calibIngester = pexConfig.ConfigurableField(
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
        target=IngestCalibsTask,
    )
    calibFiles = pexConfig.ListField(
        doc="Names of calib files (no path; wildcards allowed) to ingest from the dataset.",
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
    )
    calibBadFiles = pexConfig.ListField(
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
        dtype=str,
        default=[],
    )
    calibValidity = pexConfig.Field(
        doc="Calibration validity period (days). Assumed equal for all calib types.",
        dtype=int,
        default=9999,
    )

    # ---- Defects ----
    textDefectPath = pexConfig.Field(
        doc="Path to top level of the defect tree. This is a directory with a directory per sensor.",
        dtype=str,
        default='',
    )
    defectIngester = pexConfig.ConfigurableField(
        doc="Task used to ingest defects.",
        target=IngestCuratedCalibsTask,
    )

    # ---- Reference catalogs ----
    refcats = pexConfig.DictField(
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
        keytype=str,
        itemtype=str,
        default={},
    )

113 

114 

class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("defectIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        The steps run in a fixed order: raw images, calibrations, defects,
        reference catalogs, and finally the dataset's config files.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestDefects(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Ingestion is skipped if the data repository already contains a
        ``registry.sqlite3`` file.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
            return

        self.log.info("Ingesting raw images...")
        # Files matching ``dataBadFiles`` are not filtered out here; they are
        # handed to the ingester, which receives them through ``--badFile``.
        dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
        if not dataFiles:
            raise RuntimeError("No raw files found at %s." % dataset.rawLocation)
        self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                           dataFiles, self.config.dataBadFiles)
        self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : iterable of `str`
            The filenames to ingest. May contain wildcards.
        badFiles : iterable of `str`
            The filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty, or if the ingester reports that
            the raw files are not all unique.
        """
        if not dataFiles:
            # The 1-tuple keeps the message well-formed even when ``dataFiles``
            # is itself an (empty) tuple, which ``%`` would otherwise unpack.
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % (dataFiles,))

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Ingestion is skipped if the calibration repository already contains a
        ``calibRegistry.sqlite3`` file.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
            return

        self.log.info("Ingesting calibration files...")
        calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                            self.config.calibFiles, self.config.calibBadFiles)
        if not calibDataFiles:
            raise RuntimeError("No calib files found at %s." % dataset.calibLocation)
        self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
        self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
            workspace.dataRepo, workspace.calibRepo))

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : iterable of `str`
            The filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty, or if the ingester reports
            that the calibration files are not all unique.
        """
        if not calibDataFiles:
            # The 1-tuple keeps the message well-formed even when
            # ``calibDataFiles`` is itself an (empty) tuple.
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % (calibDataFiles,))

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestDefects(self, dataset, workspace):
        """Ingest the defect files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all defects from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Ingestion is skipped if the calibration repository already contains a
        ``defects`` entry. The defects are read from the location given by
        ``config.textDefectPath``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if the ingester reports that the defect files are not all
            unique.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "defects")):
            self.log.info("Defects were previously ingested, skipping...")
            return

        self.log.info("Ingesting defects...")
        self._doIngestDefects(workspace.dataRepo, workspace.calibRepo, self.config.textDefectPath)
        self.log.info("Defects are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestDefects(self, repo, calibRepo, defectPath):
        """Ingest defects stored in text form.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        defectPath : `str`
            Path to the defects in standard text form. This is probably a path
            in ``obs_decam_data``.

        Raises
        ------
        RuntimeError
            Raised if the ingester reports that the defect files are not all
            unique.
        """
        defectargs = [repo, defectPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.defectIngester, defectargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all defect files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Ingestion is skipped if the data repository already contains a
        ``ref_cats`` entry.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the
        `refcat Community thread <https://community.lsst.org/t/1523>`_
        for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
            return

        self.log.info("Ingesting reference catalogs...")
        self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
        self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Each entry of ``config.refcats`` is unpacked into
        ``<repo>/ref_cats/<refcat name>``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            with tarfile.open(tarball, "r") as opened:
                # Interpreters with extraction-filter support can refuse
                # archive members that would land outside ``refcatDir``
                # (absolute paths, ``..`` components, escaping links). Older
                # interpreters lack the keyword, so fall back to the plain call.
                if hasattr(tarfile, "data_filter"):
                    opened.extractall(refcatDir, filter="data")
                else:
                    opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Copying is skipped if the config directory already contains anything.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing the config directory.
        """
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
            return

        self.log.info("Storing data-specific configs...")
        self._doCopyConfigs(workspace.configDir, dataset.configLocation)
        self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Every ``*.py`` file found in ``source`` or any of its subdirectories
        is copied directly into ``destination``.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)

411 

412 

def ingestDataset(dataset, workspace):
    """Load a dataset into the Butler repositories of a workspace.

    This is a convenience wrapper that builds a `DatasetIngestTask` using the
    config appropriate for ``dataset`` and runs it. The original data
    directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.Workspace`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    logger = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    DatasetIngestTask(config=_getConfig(dataset)).run(dataset, workspace)
    logger.info("Data ingested")

434 

435 

def _getConfig(dataset):
    """Build the ingestion config that applies to a given dataset.

    A default config is created and then updated from every override file
    named after the task's default name that exists in, in order: the
    ``obs`` package's ``config`` directory, its camera-specific
    subdirectory, and the dataset's own config directory. Files loaded
    later are applied on top of earlier ones.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : `DatasetIngestConfig`
        The config for running `DatasetIngestTask` on ``dataset``.
    """
    fileName = DatasetIngestTask._DefaultName + ".py"
    obsRoot = lsst.utils.getPackageDir(dataset.obsPackage)

    config = DatasetIngestTask.ConfigClass()
    searchDirs = (
        os.path.join(obsRoot, 'config'),
        os.path.join(obsRoot, 'config', dataset.camera),
        dataset.configLocation,
    )
    # Both the generator and filter() are lazy, so each existence check
    # happens immediately before the corresponding load.
    candidates = (os.path.join(directory, fileName) for directory in searchDirs)
    for overridePath in filter(os.path.exists, candidates):
        config.load(overridePath)
    return config

462 

463 

def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    The arguments are parsed with the task's own argument parser, and the
    parsed result is passed to ``task.run``.

    Parameters
    ----------
    task : `lsst.pipe.tasks.IngestTask`
        The task to run.
    args : list of command-line arguments, split using Python conventions
        The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``.

    Raises
    ------
    ValueError
        Raised if the argument parser rejects ``args`` (that is, if it tries
        to exit the program). The original `SystemExit` is chained as the
        cause.
    """
    argumentParser = task.ArgumentParser(name=task.getName())
    try:
        parsedCmd = argumentParser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied
        # Wrapping ``args`` in a 1-tuple makes ``%`` print it as a single
        # value; a bare tuple of arguments would be unpacked and raise
        # TypeError, hiding the real problem.
        raise ValueError("Invalid ingestion arguments: %s" % (args,)) from e
    task.run(parsedCmd)

481 

482 

def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of files (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``. Each entry is a path that starts with
        ``basePath``.
    """
    excludePatterns = list(exclude) if exclude is not None else []

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    if not excludePatterns:
        return allFiles

    # ``exclude`` holds bare filenames while ``allFiles`` holds full paths.
    # fnmatch must match the entire string, so compare against the basename;
    # otherwise a pattern without a leading wildcard can never match.
    return {
        path for path in allFiles
        if not any(fnmatch.fnmatch(os.path.basename(path), pattern) for pattern in excludePatterns)
    }

512 

513 

@contextmanager
def _tempChDir(newDir):
    """Temporarily make ``newDir`` the current working directory.

    The previous working directory is restored when the ``with`` block
    ends, whether it finishes normally or raises. Exceptions raised by
    ``_tempChDir`` itself are a different matter (see Raises).

    Nothing is yielded, so this context manager cannot be used with
    "with ... as" statements.

    Parameters
    ----------
    newDir : `str`
        The directory to change to for the duration of a ``with`` statement.

    Raises
    ------
    OSError
        Raised if either the program cannot change to ``newDir``, or if it
        cannot undo the change. Failing to change to ``newDir`` is
        exception-safe (no side effects), but failing to undo is
        not recoverable.
    """
    # Record where we are before moving, as an absolute path.
    previousDir = os.path.abspath(os.curdir)
    os.chdir(newDir)
    try:
        yield
    finally:
        os.chdir(previousDir)