Coverage for python/lsst/ap/verify/ingestion.py : 25%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# This file is part of ap_verify.
3#
4# Developed for the LSST Data Management System.
5# This product includes software developed by the LSST Project
6# (http://www.lsst.org).
7# See the COPYRIGHT file at the top-level directory of this distribution
8# for details of code ownership.
9#
10# This program is free software: you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation, either version 3 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program. If not, see <http://www.gnu.org/licenses/>.
22#
24"""Data ingestion for ap_verify.
26This module handles ingestion of an ap_verify dataset into an appropriate repository, so
27that pipeline code need not be aware of the dataset framework.
28"""
30__all__ = ["DatasetIngestConfig", "Gen3DatasetIngestConfig", "ingestDataset", "ingestDatasetGen3"]
32import fnmatch
33import os
34import re
35import shutil
36import tarfile
37from glob import glob
38import sqlite3
40import lsst.utils
41import lsst.log
42import lsst.pex.config as pexConfig
43import lsst.pipe.base as pipeBase
45import lsst.daf.butler
46import lsst.obs.base
47from lsst.pipe.tasks.ingest import IngestTask
48from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
49from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask
class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor."
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )
class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # The registry file's existence is used as the "already ingested" marker.
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            # Duplicate registry entries surface as IntegrityError from the subtask.
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                               self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in curatedCalibPaths. Butler
        operations on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.
        """
        # NOTE: unlike raws/calibs, there is no "already ingested" skip check here.
        for curated in self.config.curatedCalibPaths:
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.
        """
        curatedargs = [repo, curatedPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the [refcat Community thread](https://community.lsst.org/t/1523)
        for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): extractall trusts the tarball contents; presumably the
            # ap_verify dataset is a curated, trusted input — confirm before
            # accepting tarballs from elsewhere.
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing the config directory.
        """
        # A non-empty config directory is used as the "already copied" marker.
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)
class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )
class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # Suffix is de-facto convention for distinguishing Gen 2 and Gen 3 config overrides
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), dataset=self.dataset, workspace=self.workspace)

    def run(self, processes=1):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        processes : `int`
            The number processes to use to ingest.
        """
        self._ensureRaws(processes=processes)
        self._defineVisits(processes=processes)
        self._copyConfigs()

    def _ensureRaws(self, processes):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Parameters
        ----------
        processes : `int`
            The number processes to use to ingest, if ingestion must be run.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        if rawCollections:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles, processes=processes)
                self.log.info("Images are now ingested in {0}".format(self.workspace.repo))
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles, processes):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        processes : `int`
            The number processes to use to ingest.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            # run=None because expect ingester to name a new collection
            self.ingester.run(dataFiles, run=None, processes=processes)
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self, processes):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Parameters
        ----------
        processes : `int`
            The number processes to use to define visits.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")

        # Project exposure+visit data IDs down to the exposure-only dimensions so
        # they can be compared against the plain exposure query above.
        exposureKeys = list(exposures)[0].graph
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits, processes=processes)
        else:
            self.log.info("Visits were previously defined, skipping...")

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset.
        """
        if os.listdir(self.workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in {0}".format(self.workspace.configDir))
def ingestDataset(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = DatasetIngestTask(config=_getConfig(DatasetIngestTask, dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")
def ingestDatasetGen3(dataset, workspace, processes=1):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository is to be created, if it
        does not already exist.
    processes : `int`
        The number of processes to use to ingest.
    """
    # Log under this function's own name; the original logger name
    # ("...ingestDataset") was a copy-paste from the Gen 2 entry point.
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDatasetGen3")

    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run(processes=processes)
    log.info("Data ingested")
def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed.
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    # Can't use dataset.instrument.applyConfigOverrides for this, because the
    # dataset might not have Gen 3 support.
    overrideName = task._DefaultName + ".py"
    obsDir = lsst.utils.getPackageDir(dataset.obsPackage)

    # Overrides are applied in increasing order of specificity:
    # obs package, then camera, then the dataset itself.
    searchDirs = (
        os.path.join(obsDir, 'config'),
        os.path.join(obsDir, 'config', dataset.camera),
        dataset.configLocation,
    )

    config = task.ConfigClass()
    for candidate in (os.path.join(directory, overrideName) for directory in searchDirs):
        if os.path.exists(candidate):
            config.load(candidate)
    return config
def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.IngestTask`
        The task to run.
    args : list of command-line arguments, split using Python conventions
        The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``.
    """
    parser = task.ArgumentParser(name=task.getName())
    try:
        parsed = parser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied
        raise ValueError("Invalid ingestion arguments: %s" % args) from e
    task.run(parsed)
682def _findMatchingFiles(basePath, include, exclude=None):
683 """Recursively identify files matching one set of patterns and not matching another.
685 Parameters
686 ----------
687 basePath : `str`
688 The path on disk where the files in ``include`` are located.
689 include : iterable of `str`
690 A collection of files (with wildcards) to include. Must not
691 contain paths.
692 exclude : iterable of `str`, optional
693 A collection of filenames (with wildcards) to exclude. Must not
694 contain paths. If omitted, all files matching ``include`` are returned.
696 Returns
697 -------
698 files : `set` of `str`
699 The files in ``basePath`` or any subdirectory that match ``include``
700 but not ``exclude``.
701 """
702 _exclude = exclude if exclude is not None else []
704 allFiles = set()
705 for pattern in include:
706 allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))
708 for pattern in _exclude:
709 excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
710 allFiles.difference_update(excludedFiles)
711 return allFiles