Coverage for python/lsst/ap/verify/ingestion.py: 27%
#
# This file is part of ap_verify.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""Data ingestion for ap_verify.

This module handles ingestion of an ap_verify dataset into an appropriate
repository, so that pipeline code need not be aware of the dataset framework.
"""

__all__ = ["DatasetIngestConfig", "Gen3DatasetIngestConfig", "ingestDataset", "ingestDatasetGen3"]

import fnmatch
import os
import re
import shutil
import tarfile
from glob import glob
import sqlite3
import logging

import lsst.utils
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase

import lsst.daf.butler
import lsst.obs.base
from lsst.pipe.tasks.ingest import IngestTask
from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask

_LOG = logging.getLogger(__name__)


class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor."
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )

    def setDefaults(self):
        # Can't easily check for prior curated ingestion, so make it not matter
        self.curatedCalibIngester.clobber = True
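
# A dataset customizes DatasetIngestConfig through a config override file
# (``config/datasetIngest.py`` in the dataset package; see ``_getConfig``
# below). A minimal sketch of such an override, with hypothetical file
# patterns and refcat names:
#
#     config.dataFiles = ["instcal*.fits.fz"]
#     config.calibBadFiles = ["*fringe*.fits"]
#     config.refcats = {"gaia": "gaia_refcat.tar.gz"}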


class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of an ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute paths to the Butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in %s", workspace.dataRepo)
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to %s are now ingested in %s",
                              workspace.dataRepo, workspace.calibRepo)
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in ``curatedCalibPaths``.
        Butler operations on the repository shall not be able to modify
        ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.
        """
        for curated in self.config.curatedCalibPaths:
            # Can't easily check for prior ingestion; workaround is in config
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in %s", workspace.calibRepo)

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.
        """
        curatedargs = [repo, curatedPath, "--calib", calibRepo, "--ignore-ingested"]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the refcat thread on the LSST Community forum
        (https://community.lsst.org/t/1523) for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in %s", workspace.dataRepo)

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)
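
    # For example, given a config entry ``refcats={"gaia": "gaia.tar.gz"}``
    # (names hypothetical), the tarball ``<refcats>/gaia.tar.gz`` is unpacked
    # into ``<repo>/ref_cats/gaia/``, where a ``refObjLoader`` can find it.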

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing the config directory.
        """
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in %s", workspace.configDir)

    def _doCopyConfigs(self, destination, source):
        """Place config files inside a particular directory.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)


class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )


class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of an ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # Suffix is de facto convention for distinguishing Gen 2 and Gen 3 config overrides
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), dataset=self.dataset, workspace=self.workspace)

    def run(self, processes=1):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        processes : `int`
            The number of processes to use for ingestion.
        """
        self._ensureRaws(processes=processes)
        self._defineVisits(processes=processes)
        self._copyConfigs()

    def _ensureRaws(self, processes):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Parameters
        ----------
        processes : `int`
            The number of processes to use for ingestion, if ingestion must
            be run.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        rawData = list(self.workspace.workButler.registry.queryDatasets(
            'raw',
            collections=rawCollections,
            dataId={"instrument": self.dataset.instrument.getName()})) \
            if rawCollections else []

        if rawData:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles, processes=processes)
                self.log.info("Images are now ingested in %s", self.workspace.repo)
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles, processes):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        processes : `int`
            The number of processes to use for ingestion.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            # run=None because we expect the ingester to name a new collection
            self.ingester.run(dataFiles, run=None, processes=processes)
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self, processes):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to define visits.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")

        exposureKeys = list(exposures)[0].graph
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits, processes=processes)
        else:
            self.log.info("Visits were previously defined, skipping...")
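
    # The set difference above works on data IDs: querying ["exposure",
    # "visit"] returns only exposures already linked to a visit, so projecting
    # those IDs down to the exposure dimensions and subtracting them from all
    # known exposures leaves exactly the exposures that still need visits.
    # A toy sketch of the same idea (values hypothetical):
    #
    #     exposures = {1, 2, 3}                 # all ingested exposures
    #     withVisits = {1, 2}                   # exposures already in a visit
    #     needVisits = exposures - withVisits   # {3} -> pass to visitDefiner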

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset, and the
        pipelines directory in the workspace contains all pipeline files
        from the dataset.
        """
        if os.listdir(self.workspace.pipelineDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in %s.", self.workspace.configDir)
            for pipelineFile in _findMatchingFiles(self.dataset.pipelineLocation, ['*.yaml']):
                shutil.copy2(pipelineFile, self.workspace.pipelineDir)
            self.log.info("Pipelines are now stored in %s.", self.workspace.pipelineDir)


def ingestDataset(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = _LOG.getChild("ingestDataset")

    ingester = DatasetIngestTask(config=_getConfig(DatasetIngestTask, dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")
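
# A minimal sketch of calling ingestDataset (dataset name and workspace path
# are hypothetical; Dataset and WorkspaceGen2 construction may differ):
#
#     from lsst.ap.verify.dataset import Dataset
#     from lsst.ap.verify.workspace import WorkspaceGen2
#
#     dataset = Dataset("my_dataset")
#     workspace = WorkspaceGen2("/path/to/workspace")
#     ingestDataset(dataset, workspace)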


def ingestDatasetGen3(dataset, workspace, processes=1):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository is to be created, if it
        does not already exist.
    processes : `int`
        The number of processes to use for ingestion.
    """
    log = _LOG.getChild("ingestDataset")

    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run(processes=processes)
    log.info("Data ingested")
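
# The Gen 3 equivalent, reusing the hypothetical names from the sketch above:
#
#     from lsst.ap.verify.workspace import WorkspaceGen3
#
#     workspace = WorkspaceGen3("/path/to/workspace")
#     ingestDatasetGen3(dataset, workspace, processes=4)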


def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed.
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    # Can't use dataset.instrument.applyConfigOverrides for this, because the
    # dataset might not have Gen 3 support.
    overrideFile = task._DefaultName + ".py"
    packageDir = lsst.utils.getPackageDir(dataset.obsPackage)

    config = task.ConfigClass()
    for path in [
        os.path.join(packageDir, 'config'),
        os.path.join(packageDir, 'config', dataset.camera),
        dataset.configLocation,
    ]:
        overridePath = os.path.join(path, overrideFile)
        if os.path.exists(overridePath):
            config.load(overridePath)
    return config
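
# Later loads override earlier ones, so precedence (lowest to highest) is the
# obs package's generic config, its camera-specific config, then the dataset's
# own config. For a hypothetical obs package and camera, the files consulted
# for DatasetIngestTask would be:
#
#     <obs_package>/config/datasetIngest.py
#     <obs_package>/config/<camera>/datasetIngest.py
#     <dataset>/config/datasetIngest.py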


def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.ingest.IngestTask`
        The task to run.
    args : `list` of `str`
        The command-line arguments for ``task``, already split into individual
        words. Must be compatible with ``task.ArgumentParser``.
    """
    argumentParser = task.ArgumentParser(name=task.getName())
    try:
        parsedCmd = argumentParser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied
        raise ValueError("Invalid ingestion arguments: %s" % args) from e
    task.run(parsedCmd)


def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of filenames (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``.
    """
    _exclude = exclude if exclude is not None else []

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    for pattern in _exclude:
        excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
        allFiles.difference_update(excludedFiles)
    return allFiles
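
# A minimal usage sketch of _findMatchingFiles (directory layout hypothetical):
# given /data/raw/night1/exp1.fits and /data/raw/night1/exp1_bad.fits,
#
#     _findMatchingFiles("/data/raw", ["*.fits"], exclude=["*_bad.fits"])
#
# returns {"/data/raw/night1/exp1.fits"}.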