Coverage for python/lsst/ap/verify/ingestion.py: 25%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

210 statements  

1# 

2# This file is part of ap_verify. 

3# 

4# Developed for the LSST Data Management System. 

5# This product includes software developed by the LSST Project 

6# (http://www.lsst.org). 

7# See the COPYRIGHT file at the top-level directory of this distribution 

8# for details of code ownership. 

9# 

10# This program is free software: you can redistribute it and/or modify 

11# it under the terms of the GNU General Public License as published by 

12# the Free Software Foundation, either version 3 of the License, or 

13# (at your option) any later version. 

14# 

15# This program is distributed in the hope that it will be useful, 

16# but WITHOUT ANY WARRANTY; without even the implied warranty of 

17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

18# GNU General Public License for more details. 

19# 

20# You should have received a copy of the GNU General Public License 

21# along with this program. If not, see <http://www.gnu.org/licenses/>. 

22# 

23 

24"""Data ingestion for ap_verify. 

25 

26This module handles ingestion of an ap_verify dataset into an appropriate repository, so 

27that pipeline code need not be aware of the dataset framework. 

28""" 

29 

30__all__ = ["DatasetIngestConfig", "Gen3DatasetIngestConfig", "ingestDataset", "ingestDatasetGen3"] 

31 

32import fnmatch 

33import os 

34import re 

35import shutil 

36import tarfile 

37from glob import glob 

38import sqlite3 

39 

40import lsst.utils 

41import lsst.log 

42import lsst.pex.config as pexConfig 

43import lsst.pipe.base as pipeBase 

44 

45import lsst.daf.butler 

46import lsst.obs.base 

47from lsst.pipe.tasks.ingest import IngestTask 

48from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask 

49from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask 

50 

51 

class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    # Default of 9999 days effectively makes every calib valid forever.
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor."
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )

    def setDefaults(self):
        # Can't easily check for prior curated ingestion, so make it not matter
        self.curatedCalibIngester.clobber = True

122 

123 

class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # The registry file is used as the marker of a previous ingestion.
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            # Bad files are not filtered here; they are passed to the ingest
            # task via --badFile in _doIngestRaws instead.
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            # Duplicate data IDs violate the registry's uniqueness constraint.
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # The calib registry file is used as the marker of a previous ingestion.
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in curatedCalibPaths. Butler
        operations on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.
        """
        for curated in self.config.curatedCalibPaths:
            # Can't easily check for prior ingestion; workaround in config
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.
        """

        curatedargs = [repo, curatedPath, "--calib", calibRepo, "--ignore-ingested"]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the [refcat Community thread](https://community.lsst.org/t/1523)
        for more details.
        """
        # The ref_cats directory is used as the marker of a previous ingestion.
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): extractall on an untrusted archive can write
            # outside refcatDir (path traversal); safe only because the
            # tarballs come from the curated ap_verify dataset.
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing the config directory.
        """
        # A non-empty config directory is used as the marker of a previous copy.
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)

410 

411 

class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

440 

441 

class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # Suffix is de-facto convention for distinguishing Gen 2 and Gen 3 config overrides
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), dataset=self.dataset, workspace=self.workspace)

    def run(self, processes=1):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to ingest.
        """
        self._ensureRaws(processes=processes)
        self._defineVisits(processes=processes)
        self._copyConfigs()

    def _ensureRaws(self, processes):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to ingest, if ingestion must be run.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        # Any existing raw dataset for this instrument means ingestion
        # already happened; queryDatasets fails on an empty collection list.
        rawData = list(self.workspace.workButler.registry.queryDatasets(
            'raw',
            collections=rawCollections,
            dataId={"instrument": self.dataset.instrument.getName()})) \
            if rawCollections else []

        if rawData:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles, processes=processes)
                self.log.info("Images are now ingested in {0}".format(self.workspace.repo))
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles, processes):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        processes : `int`
            The number of processes to use to ingest.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            # run=None because expect ingester to name a new collection
            self.ingester.run(dataFiles, run=None, processes=processes)
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self, processes):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to define visits.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")

        # Identify exposures that do not yet belong to any visit, so that
        # visit definition is only run on the new ones.
        exposureKeys = list(exposures)[0].graph
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits, processes=processes)
        else:
            self.log.info("Visits were previously defined, skipping...")

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset.
        """
        # A non-empty config directory is used as the marker of a previous copy.
        if os.listdir(self.workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in {0}".format(self.workspace.configDir))

595 

596 

def ingestDataset(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = DatasetIngestTask(config=_getConfig(DatasetIngestTask, dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")

618 

619 

def ingestDatasetGen3(dataset, workspace, processes=1):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository is to be created, if it
        does not already exist.
    processes : `int`
        The number of processes to use to ingest.
    """
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run(processes=processes)
    log.info("Data ingested")

640 

641 

def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    # Can't use dataset.instrument.applyConfigOverrides for this, because the
    # dataset might not have Gen 3 support.
    overrideName = task._DefaultName + ".py"
    obsDir = lsst.utils.getPackageDir(dataset.obsPackage)

    # Later files override earlier ones: obs defaults, then camera-specific
    # defaults, then dataset-specific settings.
    searchDirs = (
        os.path.join(obsDir, 'config'),
        os.path.join(obsDir, 'config', dataset.camera),
        dataset.configLocation,
    )

    config = task.ConfigClass()
    for searchDir in searchDirs:
        candidate = os.path.join(searchDir, overrideName)
        if os.path.exists(candidate):
            config.load(candidate)
    return config

672 

673 

674def _runIngestTask(task, args): 

675 """Run an ingestion task on a set of inputs. 

676 

677 Parameters 

678 ---------- 

679 task : `lsst.pipe.tasks.IngestTask` 

680 The task to run. 

681 args : list of command-line arguments, split using Python conventions 

682 The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``. 

683 """ 

684 argumentParser = task.ArgumentParser(name=task.getName()) 

685 try: 

686 parsedCmd = argumentParser.parse_args(config=task.config, args=args) 

687 except SystemExit as e: 

688 # SystemExit is not an appropriate response when the arguments aren't user-supplied 

689 raise ValueError("Invalid ingestion arguments: %s" % args) from e 

690 task.run(parsedCmd) 

691 

692 

693def _findMatchingFiles(basePath, include, exclude=None): 

694 """Recursively identify files matching one set of patterns and not matching another. 

695 

696 Parameters 

697 ---------- 

698 basePath : `str` 

699 The path on disk where the files in ``include`` are located. 

700 include : iterable of `str` 

701 A collection of files (with wildcards) to include. Must not 

702 contain paths. 

703 exclude : iterable of `str`, optional 

704 A collection of filenames (with wildcards) to exclude. Must not 

705 contain paths. If omitted, all files matching ``include`` are returned. 

706 

707 Returns 

708 ------- 

709 files : `set` of `str` 

710 The files in ``basePath`` or any subdirectory that match ``include`` 

711 but not ``exclude``. 

712 """ 

713 _exclude = exclude if exclude is not None else [] 

714 

715 allFiles = set() 

716 for pattern in include: 

717 allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True)) 

718 

719 for pattern in _exclude: 

720 excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)] 

721 allFiles.difference_update(excludedFiles) 

722 return allFiles