#
# This file is part of ap_verify.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""Data ingestion for ap_verify.

This module handles ingestion of an ap_verify dataset into an appropriate
repository, so that pipeline code need not be aware of the dataset framework.
"""

__all__ = ["DatasetIngestConfig", "Gen3DatasetIngestConfig", "ingestDataset", "ingestDatasetGen3"]

import fnmatch
import os
import re
import shutil
import tarfile
from glob import glob
import sqlite3

import lsst.utils
import lsst.log
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase

import lsst.daf.butler
import lsst.obs.base
from lsst.pipe.tasks.ingest import IngestTask
from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask

class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor."
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )
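
# Illustrative sketch (not executed): an ap_verify dataset can override the
# fields above in its own ``datasetIngest.py`` config override file, which
# ``_getConfig`` below loads from the dataset's config directory (the Gen 3
# task reads ``gen3DatasetIngest.py`` the same way). The file names and values
# shown here are hypothetical examples, not defaults:
#
#     # <dataset>/config/datasetIngest.py
#     config.dataFiles = ["raw_*.fits.fz"]
#     config.dataBadFiles = ["raw_bad_*.fits.fz"]
#     config.calibValidity = 9999
#     config.refcats = {"gaia": "gaia_refcat.tar.gz"}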


class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of an ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We assume ingest tasks always give an absolute path to the Butler.
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in ``curatedCalibPaths``.
        Butler operations on the repository shall not be able to
        modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.
        """
        for curated in self.config.curatedCalibPaths:
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.
        """
        curatedargs = [repo, curatedPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the `refcat Community thread
        <https://community.lsst.org/t/1523>`_ for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing the config directory.
        """
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)
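
# After DatasetIngestTask.run() completes, the Gen 2 workspace contains roughly
# the following (a sketch inferred from the checks above; exact contents depend
# on the dataset and the obs package):
#
#     <workspace.dataRepo>/registry.sqlite3        # raw ingestion registry
#     <workspace.dataRepo>/ref_cats/<refcatName>/  # extracted refcat shards
#     <workspace.calibRepo>/calibRegistry.sqlite3  # calib ingestion registry
#     <workspace.configDir>/*.py                   # copied dataset config files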


class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of an ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    _DefaultName = "gen3DatasetIngest"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)

    def run(self):
        """Ingest the contents of a dataset into a Butler repository.
        """
        self._ensureRaws()
        self._copyConfigs()

    def _ensureRaws(self):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        if rawCollections:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles)
                self.log.info("Images are now ingested in {0}".format(self.workspace.repo))
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            self.ingester.run(dataFiles, run=None)  # expect ingester to name a new collection
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset.
        """
        if os.listdir(self.workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in {0}".format(self.workspace.configDir))


def ingestDataset(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = DatasetIngestTask(config=_getConfig(DatasetIngestTask, dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")
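
# Minimal usage sketch (illustrative, not executed here): assumes an installed
# ap_verify dataset and a Gen 2 workspace directory; the dataset name and path
# below are hypothetical.
#
#     from lsst.ap.verify.dataset import Dataset
#     from lsst.ap.verify.workspace import WorkspaceGen2
#
#     dataset = Dataset("HiTS2015")
#     workspace = WorkspaceGen2("/path/to/workspace")
#     ingestDataset(dataset, workspace)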


def ingestDatasetGen3(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository is to be created, if it
        does not already exist.
    """
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run()
    log.info("Data ingested")
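
# The Gen 3 flow mirrors the sketch after ``ingestDataset`` above, but uses a
# WorkspaceGen3 (hypothetical path):
#
#     from lsst.ap.verify.workspace import WorkspaceGen3
#
#     workspace = WorkspaceGen3("/path/to/workspace")
#     ingestDatasetGen3(dataset, workspace)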


def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed.
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    # Can't use dataset.instrument.applyConfigOverrides for this, because the
    # dataset might not have Gen 3 support.
    overrideFile = task._DefaultName + ".py"
    packageDir = lsst.utils.getPackageDir(dataset.obsPackage)

    config = task.ConfigClass()
    for path in [
        os.path.join(packageDir, 'config'),
        os.path.join(packageDir, 'config', dataset.camera),
        dataset.configLocation,
    ]:
        overridePath = os.path.join(path, overrideFile)
        if os.path.exists(overridePath):
            config.load(overridePath)
    return config
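
# Worked example (hypothetical package and camera names): for DatasetIngestTask
# (_DefaultName "datasetIngest") and a dataset with obsPackage="obs_decam" and
# camera="decam", the overrides are applied in this order, later files winning:
#
#     <obs_decam>/config/datasetIngest.py
#     <obs_decam>/config/decam/datasetIngest.py
#     <dataset.configLocation>/datasetIngest.py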


def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.IngestTask`
        The task to run.
    args : list of command-line arguments, split using Python conventions
        The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``.
    """
    argumentParser = task.ArgumentParser(name=task.getName())
    try:
        parsedCmd = argumentParser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied
        raise ValueError("Invalid ingestion arguments: %s" % args) from e
    task.run(parsedCmd)
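
# Example of the argument lists this helper receives (built in
# DatasetIngestTask._doIngestRaws above; the paths are placeholders):
#
#     _runIngestTask(self.dataIngester,
#                    ["/workspace/data", "--calib", "/workspace/calibs",
#                     "--mode", "link",
#                     "/dataset/raw/image1.fits", "/dataset/raw/image2.fits",
#                     "--badFile", "image_bad_*.fits"])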


def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of filenames (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``.
    """
    _exclude = exclude if exclude is not None else []

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    for pattern in _exclude:
        excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
        allFiles.difference_update(excludedFiles)
    return allFiles
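
# Example (illustrative filenames): given basePath/raw/v1.fits and
# basePath/raw/v1_bad.fits on disk,
#
#     _findMatchingFiles(basePath, include=["*.fits"], exclude=["*_bad.fits"])
#
# returns {os.path.join(basePath, "raw", "v1.fits")}.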