Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# 

2# This file is part of ap_verify. 

3# 

4# Developed for the LSST Data Management System. 

5# This product includes software developed by the LSST Project 

6# (http://www.lsst.org). 

7# See the COPYRIGHT file at the top-level directory of this distribution 

8# for details of code ownership. 

9# 

10# This program is free software: you can redistribute it and/or modify 

11# it under the terms of the GNU General Public License as published by 

12# the Free Software Foundation, either version 3 of the License, or 

13# (at your option) any later version. 

14# 

15# This program is distributed in the hope that it will be useful, 

16# but WITHOUT ANY WARRANTY; without even the implied warranty of 

17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

18# GNU General Public License for more details. 

19# 

20# You should have received a copy of the GNU General Public License 

21# along with this program. If not, see <http://www.gnu.org/licenses/>. 

22# 

23 

24"""Data ingestion for ap_verify. 

25 

26This module handles ingestion of an ap_verify dataset into an appropriate repository, so 

27that pipeline code need not be aware of the dataset framework. 

28""" 

29 

30__all__ = ["DatasetIngestConfig", "Gen3DatasetIngestConfig", "ingestDataset", "ingestDatasetGen3"] 

31 

32import fnmatch 

33import os 

34import re 

35import shutil 

36import tarfile 

37from glob import glob 

38import sqlite3 

39 

40import lsst.utils 

41import lsst.log 

42import lsst.pex.config as pexConfig 

43import lsst.pipe.base as pipeBase 

44 

45import lsst.daf.butler 

46import lsst.obs.base 

47from lsst.pipe.tasks.ingest import IngestTask 

48from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask 

49from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask 

50 

51 

class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    # Subtask that performs the actual raw-image ingestion (Gen 2).
    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    # Subtask that ingests calibration products (flats, biases, ...).
    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    # Passed to the calib ingester as --validity; the large default in effect
    # makes calibs valid indefinitely.
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor."
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    # Keys are refcat names (become directory names under ref_cats/); values
    # are tarball filenames relative to the dataset's refcats location.
    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )

118 

119 

class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        # Subtask targets come from the config (possibly overridden per
        # dataset); each must match the dataset's obs package.
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        # Each step below is individually idempotent: it checks for evidence
        # of a previous run and skips itself if found.
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # An existing registry is treated as evidence that raws were
        # previously ingested; the repository is then left untouched.
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            # Bad files are not filtered here; they are passed to the ingest
            # task, which excludes them via its --badFile argument.
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        # Build an emulated command line for IngestTask; --mode link avoids
        # copying the (possibly large) raw files.
        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            # A registry uniqueness violation means duplicate data IDs.
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # An existing calib registry marks a previous ingestion run.
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            # Unlike raws, bad calib files are filtered out here rather than
            # passed to the ingest task.
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in curatedCalibPaths. Butler
        operations on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.
        """
        # Note: unlike the other steps, there is no previously-ingested check
        # here; each configured path is (re)ingested, and the log messages
        # fire once per path.
        for curated in self.config.curatedCalibPaths:
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.
        """

        curatedargs = [repo, curatedPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the [refcat Community thread](https://community.lsst.org/t/1523)
        for more details.
        """
        # The ref_cats directory doubles as the previously-ingested marker.
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        # config.refcats maps refcat name -> tarball filename (relative to
        # ``refcats``); each tarball is unpacked under repo/ref_cats/<name>.
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): extractall performs no member-path sanitization,
            # so the tarball contents are trusted — confirm dataset tarballs
            # are always from a trusted source.
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing the config directory.
        """
        # Any file already in configDir is taken as evidence that configs
        # were copied by an earlier run.
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        # copy2 preserves file metadata (timestamps) along with contents.
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)

405 

406 

class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    # Gen 3 raw ingestion; receives a butler at subtask construction time.
    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Groups ingested exposures into visits after ingestion.
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

435 

436 

class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # Suffix is de-facto convention for distinguishing Gen 2 and Gen 3 config overrides
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        # Both subtasks need a butler, so they can only be built after the
        # repository exists.
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), dataset=self.dataset, workspace=self.workspace)

    def run(self, processes=1):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        processes : `int`
            The number processes to use to ingest.
        """
        self._ensureRaws(processes=processes)
        self._defineVisits(processes=processes)
        self._copyConfigs()

    def _ensureRaws(self, processes):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Parameters
        ----------
        processes : `int`
            The number processes to use to ingest, if ingestion must be run.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        # The existence of the default raw-ingest run collection marks a
        # previous ingestion; skip in that case.
        if rawCollections:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles, processes=processes)
                self.log.info("Images are now ingested in {0}".format(self.workspace.repo))
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles, processes):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        processes : `int`
            The number processes to use to ingest.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            # run=None because expect ingester to name a new collection
            self.ingester.run(dataFiles, run=None, processes=processes)
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            # A conflicting definition means a file's data ID already exists.
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self, processes):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Parameters
        ----------
        processes : `int`
            The number processes to use to define visits.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")

        # Project exposure+visit data IDs down to the exposure-only
        # dimensions so the set difference below is well-defined.
        exposureKeys = list(exposures)[0].graph
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits, processes=processes)
        else:
            self.log.info("Visits were previously defined, skipping...")

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset.
        """
        # Any file already in configDir means configs were copied earlier.
        if os.listdir(self.workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in {0}".format(self.workspace.configDir))

584 

585 

def ingestDataset(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    # The ingestion config is assembled from obs-package and dataset overrides.
    ingester = DatasetIngestTask(config=_getConfig(DatasetIngestTask, dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")

607 

608 

def ingestDatasetGen3(dataset, workspace, processes=1):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository is to be created, if it does
        not already exist.
    processes : `int`
        The number processes to use to ingest.
    """
    # NOTE(review): logger name matches the Gen 2 entry point's
    # ("ingestDataset") — confirm whether a "Gen3" suffix was intended.
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run(processes=processes)
    log.info("Data ingested")

629 

630 

def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    # Can't use dataset.instrument.applyConfigOverrides for this, because the
    # dataset might not have Gen 3 support.
    overrideFile = task._DefaultName + ".py"
    packageDir = lsst.utils.getPackageDir(dataset.obsPackage)

    config = task.ConfigClass()
    # Overrides are applied in increasing order of specificity: obs package,
    # then camera, then the dataset itself; later loads win.
    searchDirs = (
        os.path.join(packageDir, 'config'),
        os.path.join(packageDir, 'config', dataset.camera),
        dataset.configLocation,
    )
    for directory in searchDirs:
        candidate = os.path.join(directory, overrideFile)
        if os.path.exists(candidate):
            config.load(candidate)
    return config

661 

662 

663def _runIngestTask(task, args): 

664 """Run an ingestion task on a set of inputs. 

665 

666 Parameters 

667 ---------- 

668 task : `lsst.pipe.tasks.IngestTask` 

669 The task to run. 

670 args : list of command-line arguments, split using Python conventions 

671 The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``. 

672 """ 

673 argumentParser = task.ArgumentParser(name=task.getName()) 

674 try: 

675 parsedCmd = argumentParser.parse_args(config=task.config, args=args) 

676 except SystemExit as e: 

677 # SystemExit is not an appropriate response when the arguments aren't user-supplied 

678 raise ValueError("Invalid ingestion arguments: %s" % args) from e 

679 task.run(parsedCmd) 

680 

681 

682def _findMatchingFiles(basePath, include, exclude=None): 

683 """Recursively identify files matching one set of patterns and not matching another. 

684 

685 Parameters 

686 ---------- 

687 basePath : `str` 

688 The path on disk where the files in ``include`` are located. 

689 include : iterable of `str` 

690 A collection of files (with wildcards) to include. Must not 

691 contain paths. 

692 exclude : iterable of `str`, optional 

693 A collection of filenames (with wildcards) to exclude. Must not 

694 contain paths. If omitted, all files matching ``include`` are returned. 

695 

696 Returns 

697 ------- 

698 files : `set` of `str` 

699 The files in ``basePath`` or any subdirectory that match ``include`` 

700 but not ``exclude``. 

701 """ 

702 _exclude = exclude if exclude is not None else [] 

703 

704 allFiles = set() 

705 for pattern in include: 

706 allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True)) 

707 

708 for pattern in _exclude: 

709 excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)] 

710 allFiles.difference_update(excludedFiles) 

711 return allFiles