Coverage for python/lsst/ap/verify/ingestion.py: 25%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

214 statements  

1# 

2# This file is part of ap_verify. 

3# 

4# Developed for the LSST Data Management System. 

5# This product includes software developed by the LSST Project 

6# (http://www.lsst.org). 

7# See the COPYRIGHT file at the top-level directory of this distribution 

8# for details of code ownership. 

9# 

10# This program is free software: you can redistribute it and/or modify 

11# it under the terms of the GNU General Public License as published by 

12# the Free Software Foundation, either version 3 of the License, or 

13# (at your option) any later version. 

14# 

15# This program is distributed in the hope that it will be useful, 

16# but WITHOUT ANY WARRANTY; without even the implied warranty of 

17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

18# GNU General Public License for more details. 

19# 

20# You should have received a copy of the GNU General Public License 

21# along with this program. If not, see <http://www.gnu.org/licenses/>. 

22# 

23 

24"""Data ingestion for ap_verify. 

25 

26This module handles ingestion of an ap_verify dataset into an appropriate repository, so 

27that pipeline code need not be aware of the dataset framework. 

28""" 

29 

30__all__ = ["DatasetIngestConfig", "Gen3DatasetIngestConfig", "ingestDataset", "ingestDatasetGen3"] 

31 

32import fnmatch 

33import os 

34import re 

35import shutil 

36import tarfile 

37from glob import glob 

38import sqlite3 

39import logging 

40 

41import lsst.utils 

42import lsst.pex.config as pexConfig 

43import lsst.pipe.base as pipeBase 

44 

45import lsst.daf.butler 

46import lsst.obs.base 

47from lsst.pipe.tasks.ingest import IngestTask 

48from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask 

49from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask 

50 

51_LOG = logging.getLogger(__name__) 

52 

53 

class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    # Subtask that performs the actual raw-frame ingestion.
    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    # Subtask that ingests conventional (non-curated) calibration frames.
    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    # Deliberately huge default so fixed test datasets never age out of validity.
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor."
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )

    def setDefaults(self):
        """Adjust subtask defaults for repeatable, idempotent ingestion."""
        # Can't easily check for prior curated ingestion, so make it not matter
        self.curatedCalibIngester.clobber = True

124 

125 

class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        """Construct the task and instantiate its configured ingestion subtasks."""
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        # Order matters: raws must exist before calibs reference the repo,
        # and configs are copied last so a failed ingest is re-runnable.
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # Prior-ingestion check is keyed solely on the registry file existing;
        # a partially-built registry therefore also triggers a skip.
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                # Bad-file exclusion is delegated to the ingest task via --badFile.
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        # --mode link avoids copying the (potentially large) dataset files.
        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            # Registry uniqueness violation means duplicate data IDs in the input.
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            # Unlike raws, bad calib files are filtered out here rather than
            # being passed to the ingest task.
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in curatedCalibPaths. Butler
        operations on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.
        """
        # Logs once per configured path; no short-circuit on prior ingestion.
        for curated in self.config.curatedCalibPaths:
            # Can't easily check for prior ingestion; workaround in config
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.
        """

        curatedargs = [repo, curatedPath, "--calib", calibRepo, "--ignore-ingested"]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the [refcat Community thread](https://community.lsst.org/t/1523)
        for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): extractall without a member filter trusts the
            # tarball's paths; safe only because datasets are curated inputs.
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing the config directory.
        """
        # Any file in configDir counts as "already copied".
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)

412 

413 

class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    # Subtask that performs the actual raw-frame ingestion (Gen 3).
    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Subtask that groups ingested exposures into visits.
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

442 

443 

class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # Suffix is de-facto convention for distinguishing Gen 2 and Gen 3 config overrides
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), dataset=self.dataset, workspace=self.workspace)

    def run(self, processes=1):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to ingest.
        """
        self._ensureRaws(processes=processes)
        self._defineVisits(processes=processes)
        self._copyConfigs()

    def _ensureRaws(self, processes):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to ingest, if ingestion must be run.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        # Query for existing raws only when a matching collection exists;
        # queryDatasets would fail on an empty collection list.
        rawData = list(self.workspace.workButler.registry.queryDatasets(
            'raw',
            collections=rawCollections,
            dataId={"instrument": self.dataset.instrument.getName()})) \
            if rawCollections else []

        if rawData:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles, processes=processes)
                self.log.info("Images are now ingested in {0}".format(self.workspace.repo))
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles, processes):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        processes : `int`
            The number of processes to use to ingest.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            # run=None because expect ingester to name a new collection
            self.ingester.run(dataFiles, run=None, processes=processes)
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self, processes):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to define visits.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")

        # Project exposure+visit IDs down to exposure keys so they compare
        # equal to the exposure-only IDs queried above.
        exposureKeys = list(exposures)[0].graph
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits, processes=processes)
        else:
            self.log.info("Visits were previously defined, skipping...")

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset, and the
        pipelines directory in the workspace contains all pipeline files
        from the dataset.
        """
        # Skip check is keyed on pipelineDir (the last directory written),
        # so an interrupted copy is retried on the next run.
        if os.listdir(self.workspace.pipelineDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in %s.", self.workspace.configDir)
            for pipelineFile in _findMatchingFiles(self.dataset.pipelineLocation, ['*.yaml']):
                shutil.copy2(pipelineFile, self.workspace.pipelineDir)
            # NOTE(review): message says "Configs" but this reports the
            # pipeline files copied above — probably should say "Pipelines".
            self.log.info("Configs are now stored in %s.", self.workspace.pipelineDir)

602 

603 

def ingestDataset(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = _LOG.getChild("ingestDataset")

    # The ingestion task is fully configured by the dataset's own overrides.
    ingester = DatasetIngestTask(config=_getConfig(DatasetIngestTask, dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")

625 

626 

def ingestDatasetGen3(dataset, workspace, processes=1):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository is created, if it does
        not already exist.
    processes : `int`
        The number of processes to use to ingest.
    """
    log = _LOG.getChild("ingestDataset")

    # The task constructor creates the repository if needed (see
    # Gen3DatasetIngestTask.__init__), so construction itself has side effects.
    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run(processes=processes)
    log.info("Data ingested")

647 

648 

def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    # Can't use dataset.instrument.applyConfigOverrides for this, because the
    # dataset might not have Gen 3 support.
    overrideName = task._DefaultName + ".py"
    obsDir = lsst.utils.getPackageDir(dataset.obsPackage)

    # Overrides are applied in increasing order of specificity: obs package,
    # then camera, then the dataset itself.
    searchDirs = [
        os.path.join(obsDir, 'config'),
        os.path.join(obsDir, 'config', dataset.camera),
        dataset.configLocation,
    ]

    config = task.ConfigClass()
    for searchDir in searchDirs:
        candidate = os.path.join(searchDir, overrideName)
        if os.path.exists(candidate):
            config.load(candidate)
    return config

679 

680 

def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.IngestTask`
        The task to run.
    args : list of command-line arguments, split using Python conventions
        The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``.
    """
    parser = task.ArgumentParser(name=task.getName())
    try:
        parsed = parser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied
        raise ValueError("Invalid ingestion arguments: %s" % args) from e
    task.run(parsed)

698 

699 

700def _findMatchingFiles(basePath, include, exclude=None): 

701 """Recursively identify files matching one set of patterns and not matching another. 

702 

703 Parameters 

704 ---------- 

705 basePath : `str` 

706 The path on disk where the files in ``include`` are located. 

707 include : iterable of `str` 

708 A collection of files (with wildcards) to include. Must not 

709 contain paths. 

710 exclude : iterable of `str`, optional 

711 A collection of filenames (with wildcards) to exclude. Must not 

712 contain paths. If omitted, all files matching ``include`` are returned. 

713 

714 Returns 

715 ------- 

716 files : `set` of `str` 

717 The files in ``basePath`` or any subdirectory that match ``include`` 

718 but not ``exclude``. 

719 """ 

720 _exclude = exclude if exclude is not None else [] 

721 

722 allFiles = set() 

723 for pattern in include: 

724 allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True)) 

725 

726 for pattern in _exclude: 

727 excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)] 

728 allFiles.difference_update(excludedFiles) 

729 return allFiles