Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# 

2# This file is part of ap_verify. 

3# 

4# Developed for the LSST Data Management System. 

5# This product includes software developed by the LSST Project 

6# (http://www.lsst.org). 

7# See the COPYRIGHT file at the top-level directory of this distribution 

8# for details of code ownership. 

9# 

10# This program is free software: you can redistribute it and/or modify 

11# it under the terms of the GNU General Public License as published by 

12# the Free Software Foundation, either version 3 of the License, or 

13# (at your option) any later version. 

14# 

15# This program is distributed in the hope that it will be useful, 

16# but WITHOUT ANY WARRANTY; without even the implied warranty of 

17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

18# GNU General Public License for more details. 

19# 

20# You should have received a copy of the GNU General Public License 

21# along with this program. If not, see <http://www.gnu.org/licenses/>. 

22# 

23 

24"""Data ingestion for ap_verify. 

25 

26This module handles ingestion of a dataset into an appropriate repository, so 

27that pipeline code need not be aware of the dataset framework. 

28""" 

29 

30__all__ = ["DatasetIngestConfig", "ingestDataset"] 

31 

32import fnmatch 

33import os 

34import shutil 

35import tarfile 

36from glob import glob 

37import sqlite3 

38 

39import lsst.utils 

40import lsst.log 

41import lsst.pex.config as pexConfig 

42import lsst.pipe.base as pipeBase 

43 

44from lsst.pipe.tasks.ingest import IngestTask 

45from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask 

46from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask 

47 

48 

class DatasetIngestConfig(pexConfig.Config):
    """Configuration for `DatasetIngestTask` and the ingesters it delegates to.

    The subtask targets appropriate for a given instrument are described in
    the documentation of the corresponding ``obs`` package.

    `DatasetIngestTask` is not meant to be launched from the command line;
    everything it needs is dictated by the dataset being ingested. For that
    reason this config also carries values that would otherwise be supplied
    as command-line arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    # ----- raw science data -----
    dataIngester = pexConfig.ConfigurableField(
        doc="Task used to perform raw data ingestion.",
        target=IngestTask,
    )
    dataFiles = pexConfig.ListField(
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the dataset.",
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
    )
    dataBadFiles = pexConfig.ListField(
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
        dtype=str,
        default=[],
    )

    # ----- calibration products -----
    calibIngester = pexConfig.ConfigurableField(
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
        target=IngestCalibsTask,
    )
    calibFiles = pexConfig.ListField(
        doc="Names of calib files (no path; wildcards allowed) to ingest from the dataset.",
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
    )
    calibBadFiles = pexConfig.ListField(
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
        dtype=str,
        default=[],
    )
    calibValidity = pexConfig.Field(
        doc="Calibration validity period (days). Assumed equal for all calib types.",
        dtype=int,
        default=9999,
    )

    # ----- defects -----
    # A value of None (the default) switches defect ingestion off.
    textDefectPath = pexConfig.Field(
        doc="Path to top level of the defect tree. This is a directory with a directory per sensor. "
            "Set to None to disable defect ingestion.",
        dtype=str,
        optional=True,
        default=None,
    )
    defectIngester = pexConfig.ConfigurableField(
        doc="Task used to ingest defects.",
        target=IngestCuratedCalibsTask,
    )

    # ----- reference catalogs -----
    refcats = pexConfig.DictField(
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
        keytype=str,
        itemtype=str,
        default={},
    )

114 

115 

class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("defectIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Raws, calibs, defects, and refcats are ingested in that order, and the
        dataset's configs are copied into the workspace last.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestDefects(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Ingestion is skipped if ``registry.sqlite3`` is already present in the
        data repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
            return

        self.log.info("Ingesting raw images...")
        # Bad files are not filtered out here; they are forwarded to the
        # ingester through its --badFile argument instead.
        dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
        if not dataFiles:
            raise RuntimeError("No raw files found at %s." % (dataset.rawLocation,))

        self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                           dataFiles, self.config.dataBadFiles)
        self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : collection of `str`
            The filenames to ingest (any non-empty list, set, or tuple).
            May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty, or if the ingester reports an
            `sqlite3.IntegrityError` (i.e., the raw files are not unique).
        """
        if not dataFiles:
            # Wrap the operand in a 1-tuple so that an empty tuple is reported
            # with this message rather than breaking the %-formatting.
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)."
                               % (dataFiles,))

        args = [repo, "--calib", calibRepo, "--mode", "link", *dataFiles]
        if badFiles:
            args += ['--badFile', *badFiles]
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Ingestion is skipped if ``calibRegistry.sqlite3`` is already present
        in the calibration repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
            return

        self.log.info("Ingesting calibration files...")
        # Unlike raws, bad calib files are removed from the file list up front.
        calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                            self.config.calibFiles, self.config.calibBadFiles)
        if not calibDataFiles:
            raise RuntimeError("No calib files found at %s." % (dataset.calibLocation,))

        self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
        self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
            workspace.dataRepo, workspace.calibRepo))

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : collection of `str`
            The filenames to ingest (any non-empty list, set, or tuple).
            Supported files vary by instrument but may include flats, biases,
            darks, fringes, or sky. May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty, or if the ingester reports
            an `sqlite3.IntegrityError` (i.e., the calib files are not unique).
        """
        if not calibDataFiles:
            # 1-tuple operand: see the matching note in _doIngestRaws.
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % (calibDataFiles,))

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity),
                *calibDataFiles]
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestDefects(self, dataset, workspace):
        """Ingest the defect files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain the defects found under ``config.textDefectPath``, if
        that path is set. Butler operations on the repository shall not be
        able to modify ``dataset``.

        Nothing is done if the calibration repository already has a
        ``defects`` entry, or if ``config.textDefectPath`` is unset or empty.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if the defect ingester reports non-unique defect files.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "defects")):
            self.log.info("Defects were previously ingested, skipping...")
            return
        if not self.config.textDefectPath:
            # Defect ingestion is disabled for this dataset.
            return

        self.log.info("Ingesting defects...")
        self._doIngestDefects(workspace.dataRepo, workspace.calibRepo, self.config.textDefectPath)
        self.log.info("Defects are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestDefects(self, repo, calibRepo, defectPath):
        """Ingest defect images.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        defectPath : `str`
            Path to the defects in standard text form. This is probably a path in ``obs_*_data``.

        Raises
        ------
        RuntimeError
            Raised if the ingester reports an `sqlite3.IntegrityError`
            (i.e., the defect files are not unique).
        """
        defectargs = [repo, defectPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.defectIngester, defectargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all defect files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Ingestion is skipped if the data repository already has a
        ``ref_cats`` entry.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the [refcat Community thread](https://community.lsst.org/t/1523)
        for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
            return

        self.log.info("Ingesting reference catalogs...")
        self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
        self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Each tarball named in ``config.refcats`` is unpacked into
        ``<repo>/ref_cats/<refcat name>``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarballName in self.config.refcats.items():
            tarballPath = os.path.join(refcats, tarballName)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): members are extracted without any filtering, so
            # the archives are assumed to come from a trusted dataset.
            with tarfile.open(tarballPath, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``. Copying is skipped if the
        config directory already has any contents.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing the config directory.
        """
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
            return

        self.log.info("Storing data-specific configs...")
        self._doCopyConfigs(workspace.configDir, dataset.configLocation)
        self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Every ``*.py`` file found under ``source`` (searched recursively) is
        copied directly into ``destination``.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)

412 

413 

def ingestDataset(dataset, workspace):
    """Ingest the contents of a dataset into a Butler repository.

    This is a convenience wrapper: it builds a `DatasetIngestTask` from the
    config associated with ``dataset`` and runs it. The original data
    directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.Workspace`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    logger = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingestConfig = _getConfig(dataset)
    DatasetIngestTask(config=ingestConfig).run(dataset, workspace)
    logger.info("Data ingested")

435 

436 

def _getConfig(dataset):
    """Return the ingestion config associated with a specific dataset.

    Override files named after the task's default name are looked up, in
    order, in the ``obs`` package's ``config`` directory, in its
    camera-specific subdirectory, and in the dataset's own config location.
    Every file that exists is loaded, so later locations are applied on top
    of earlier ones.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : `DatasetIngestConfig`
        The config for running `DatasetIngestTask` on ``dataset``.
    """
    overrideName = DatasetIngestTask._DefaultName + ".py"
    obsConfigDir = os.path.join(lsst.utils.getPackageDir(dataset.obsPackage), 'config')

    result = DatasetIngestTask.ConfigClass()
    searchDirs = (
        obsConfigDir,
        os.path.join(obsConfigDir, dataset.camera),
        dataset.configLocation,
    )
    for directory in searchDirs:
        candidate = os.path.join(directory, overrideName)
        if not os.path.exists(candidate):
            continue
        result.load(candidate)
    return result

463 

464 

465def _runIngestTask(task, args): 

466 """Run an ingestion task on a set of inputs. 

467 

468 Parameters 

469 ---------- 

470 task : `lsst.pipe.tasks.IngestTask` 

471 The task to run. 

472 args : list of command-line arguments, split using Python conventions 

473 The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``. 

474 """ 

475 argumentParser = task.ArgumentParser(name=task.getName()) 

476 try: 

477 parsedCmd = argumentParser.parse_args(config=task.config, args=args) 

478 except SystemExit as e: 

479 # SystemExit is not an appropriate response when the arguments aren't user-supplied 

480 raise ValueError("Invalid ingestion arguments: %s" % args) from e 

481 task.run(parsedCmd) 

482 

483 

484def _findMatchingFiles(basePath, include, exclude=None): 

485 """Recursively identify files matching one set of patterns and not matching another. 

486 

487 Parameters 

488 ---------- 

489 basePath : `str` 

490 The path on disk where the files in ``include`` are located. 

491 include : iterable of `str` 

492 A collection of files (with wildcards) to include. Must not 

493 contain paths. 

494 exclude : iterable of `str`, optional 

495 A collection of filenames (with wildcards) to exclude. Must not 

496 contain paths. If omitted, all files matching ``include`` are returned. 

497 

498 Returns 

499 ------- 

500 files : `set` of `str` 

501 The files in ``basePath`` or any subdirectory that match ``include`` 

502 but not ``exclude``. 

503 """ 

504 _exclude = exclude if exclude is not None else [] 

505 

506 allFiles = set() 

507 for pattern in include: 

508 allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True)) 

509 

510 for pattern in _exclude: 

511 excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)] 

512 allFiles.difference_update(excludedFiles) 

513 return allFiles