Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# 

2# This file is part of ap_verify. 

3# 

4# Developed for the LSST Data Management System. 

5# This product includes software developed by the LSST Project 

6# (http://www.lsst.org). 

7# See the COPYRIGHT file at the top-level directory of this distribution 

8# for details of code ownership. 

9# 

10# This program is free software: you can redistribute it and/or modify 

11# it under the terms of the GNU General Public License as published by 

12# the Free Software Foundation, either version 3 of the License, or 

13# (at your option) any later version. 

14# 

15# This program is distributed in the hope that it will be useful, 

16# but WITHOUT ANY WARRANTY; without even the implied warranty of 

17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

18# GNU General Public License for more details. 

19# 

20# You should have received a copy of the GNU General Public License 

21# along with this program. If not, see <http://www.gnu.org/licenses/>. 

22# 

23 

24"""Data ingestion for ap_verify. 

25 

26This module handles ingestion of an ap_verify dataset into an appropriate repository, so 

27that pipeline code need not be aware of the dataset framework. 

28""" 

29 

30__all__ = ["DatasetIngestConfig", "Gen3DatasetIngestConfig", "ingestDataset", "ingestDatasetGen3"] 

31 

32import fnmatch 

33import os 

34import re 

35import shutil 

36import tarfile 

37from glob import glob 

38import sqlite3 

39 

40import lsst.utils 

41import lsst.log 

42import lsst.pex.config as pexConfig 

43import lsst.pipe.base as pipeBase 

44 

45import lsst.daf.butler 

46import lsst.obs.base 

47from lsst.pipe.tasks.ingest import IngestTask 

48from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask 

49from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask 

50 

51 

class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    # Exclusions win over inclusions when a file matches both lists.
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    # Passed to the calib ingester as ``--validity``; see _doIngestCalibs.
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor."
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    # Each tarball named here is extracted into ``<repo>/ref_cats/<name>`` by
    # DatasetIngestTask._doIngestRefcats.
    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )

118 

119 

class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        # Subtasks are configured via DatasetIngestConfig's ConfigurableFields.
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # The registry file doubles as a "previously ingested" marker.
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            # Bad-file filtering is delegated to IngestTask via --badFile below,
            # so no ``exclude`` argument is passed here.
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        # IngestTask is driven through command-line-style arguments (see _runIngestTask).
        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            # Duplicate data IDs violate the registry's uniqueness constraints.
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in curatedCalibPaths. Butler
        operations on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.
        """
        # NOTE(review): unlike raws/calibs, there is no "previously ingested"
        # check here — re-running will re-ingest every curated path.
        for curated in self.config.curatedCalibPaths:
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.
        """

        curatedargs = [repo, curatedPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the [refcat Community thread](https://community.lsst.org/t/1523)
        for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): extractall assumes the dataset's tarballs are
            # trusted; a malicious archive could write outside refcatDir.
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing the config directory.
        """
        # A non-empty config directory is taken as evidence of a prior copy.
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            # copy2 preserves file metadata (timestamps) along with content.
            shutil.copy2(configFile, destination)

405 

406 

class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    ) if False else ingester  # noqa: F821
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    # Exclusions win over inclusions when a file matches both lists.
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

435 

436 

class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # Suffix is de-facto convention for distinguishing Gen 2 and Gen 3 config overrides
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def run(self):
        """Ingest the contents of a dataset into a Butler repository.
        """
        self._ensureRaws()
        self._defineVisits()
        self._copyConfigs()

    def _ensureRaws(self):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        if rawCollections:
            # An existing raw run collection means ingestion already happened.
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles)
                self.log.info("Images are now ingested in {0}".format(self.workspace.repo))
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            self.ingester.run(dataFiles, run=None)  # expect ingester to name a new collection
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDimensions(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")

        # Only define visits for exposures that don't have one yet, so the
        # step is idempotent across reruns.
        exposureKeys = list(exposures)[0].graph
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDimensions(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits)
        else:
            self.log.info("Visits were previously defined, skipping...")

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset.
        """
        # A non-empty config directory is taken as evidence of a prior copy.
        if os.listdir(self.workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in {0}".format(self.workspace.configDir))

562 

563 

def ingestDataset(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    # The dataset supplies the ingestion config; see _getConfig for the
    # override search order.
    ingester = DatasetIngestTask(config=_getConfig(DatasetIngestTask, dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")

585 

586 

def ingestDatasetGen3(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository will be created, if it
        does not already exist.
    """
    # NOTE(review): logger name matches the Gen 2 entry point
    # (ingestDataset); presumably intentional so both ingestion paths log
    # under one channel — confirm.
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run()
    log.info("Data ingested")

605 

606 

def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    # Can't use dataset.instrument.applyConfigOverrides for this, because the
    # dataset might not have Gen 3 support.
    fileName = task._DefaultName + ".py"
    obsDir = lsst.utils.getPackageDir(dataset.obsPackage)
    # Later directories take precedence: obs-wide, then camera-specific,
    # then the dataset's own overrides.
    searchDirs = (
        os.path.join(obsDir, 'config'),
        os.path.join(obsDir, 'config', dataset.camera),
        dataset.configLocation,
    )

    config = task.ConfigClass()
    for candidate in (os.path.join(directory, fileName) for directory in searchDirs):
        if os.path.exists(candidate):
            config.load(candidate)
    return config

637 

638 

639def _runIngestTask(task, args): 

640 """Run an ingestion task on a set of inputs. 

641 

642 Parameters 

643 ---------- 

644 task : `lsst.pipe.tasks.IngestTask` 

645 The task to run. 

646 args : list of command-line arguments, split using Python conventions 

647 The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``. 

648 """ 

649 argumentParser = task.ArgumentParser(name=task.getName()) 

650 try: 

651 parsedCmd = argumentParser.parse_args(config=task.config, args=args) 

652 except SystemExit as e: 

653 # SystemExit is not an appropriate response when the arguments aren't user-supplied 

654 raise ValueError("Invalid ingestion arguments: %s" % args) from e 

655 task.run(parsedCmd) 

656 

657 

658def _findMatchingFiles(basePath, include, exclude=None): 

659 """Recursively identify files matching one set of patterns and not matching another. 

660 

661 Parameters 

662 ---------- 

663 basePath : `str` 

664 The path on disk where the files in ``include`` are located. 

665 include : iterable of `str` 

666 A collection of files (with wildcards) to include. Must not 

667 contain paths. 

668 exclude : iterable of `str`, optional 

669 A collection of filenames (with wildcards) to exclude. Must not 

670 contain paths. If omitted, all files matching ``include`` are returned. 

671 

672 Returns 

673 ------- 

674 files : `set` of `str` 

675 The files in ``basePath`` or any subdirectory that match ``include`` 

676 but not ``exclude``. 

677 """ 

678 _exclude = exclude if exclude is not None else [] 

679 

680 allFiles = set() 

681 for pattern in include: 

682 allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True)) 

683 

684 for pattern in _exclude: 

685 excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)] 

686 allFiles.difference_update(excludedFiles) 

687 return allFiles