Coverage for python/lsst/ap/verify/ingestion.py: 25%
210 statements
« prev ^ index » next coverage.py v7.2.1, created at 2023-03-12 03:57 -0700
« prev ^ index » next coverage.py v7.2.1, created at 2023-03-12 03:57 -0700
1#
2# This file is part of ap_verify.
3#
4# Developed for the LSST Data Management System.
5# This product includes software developed by the LSST Project
6# (http://www.lsst.org).
7# See the COPYRIGHT file at the top-level directory of this distribution
8# for details of code ownership.
9#
10# This program is free software: you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation, either version 3 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program. If not, see <http://www.gnu.org/licenses/>.
22#
24"""Data ingestion for ap_verify.
26This module handles ingestion of an ap_verify dataset into an appropriate repository, so
27that pipeline code need not be aware of the dataset framework.
28"""
30__all__ = ["DatasetIngestConfig", "Gen3DatasetIngestConfig", "ingestDataset", "ingestDatasetGen3"]
32import fnmatch
33import os
34import re
35import shutil
36import tarfile
37from glob import glob
38import sqlite3
40import lsst.utils
41import lsst.log
42import lsst.pex.config as pexConfig
43import lsst.pipe.base as pipeBase
45import lsst.daf.butler
46import lsst.obs.base
47from lsst.pipe.tasks.ingest import IngestTask
48from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
49from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask
class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    # Retargeted per-instrument by the dataset's config overrides.
    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    # Passed to the calib ingester as --validity; a single period is assumed
    # sufficient for every calib type in the dataset.
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor."
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )

    def setDefaults(self):
        # Can't easily check for prior curated ingestion, so make it not matter
        self.curatedCalibIngester.clobber = True
class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        # Subtasks are created from the (possibly retargeted) config so each
        # dataset can supply instrument-specific ingesters.
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        # Each step below is individually idempotent: it checks for evidence
        # of prior ingestion and skips itself if found.
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # Presence of the registry is taken as proof of prior raw ingestion.
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            # Exclusion is NOT applied here; dataBadFiles is handed to the
            # ingest task itself via --badFile in _doIngestRaws.
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        # Build a command line for IngestTask; --mode link avoids copying the
        # (potentially large) dataset files into the repository.
        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            # Duplicate data IDs violate the registry's uniqueness constraint.
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # Presence of the calib registry is taken as proof of prior ingestion.
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            # Unlike raws, bad calibs are filtered out here rather than being
            # passed to the ingest task.
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                               self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in curatedCalibPaths. Butler
        operations on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.
        """
        for curated in self.config.curatedCalibPaths:
            # Can't easily check for prior ingestion; workaround in config
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.
        """
        curatedargs = [repo, curatedPath, "--calib", calibRepo, "--ignore-ingested"]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the [refcat Community thread](https://community.lsst.org/t/1523)
        for more details.
        """
        # Presence of the ref_cats directory is taken as proof of prior ingestion.
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): extractall without a member filter; safe only if
            # dataset tarballs are trusted inputs -- confirm.
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing the config directory.
        """
        # Any file in the config directory is taken as proof of a prior copy.
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        # copy2 preserves file metadata (e.g., modification times).
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)
class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )
class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # Suffix is de-facto convention for distinguishing Gen 2 and Gen 3 config overrides
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created,
        # so the repository must be made before the butler-using subtasks.
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), dataset=self.dataset, workspace=self.workspace)

    def run(self, processes=1):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        processes : `int`
            The number processes to use to ingest.
        """
        # Visits can only be defined once raws exist in the repository.
        self._ensureRaws(processes=processes)
        self._defineVisits(processes=processes)
        self._copyConfigs()

    def _ensureRaws(self, processes):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Parameters
        ----------
        processes : `int`
            The number processes to use to ingest, if ingestion must be run.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        # Any existing raw dataset for this instrument means ingestion already ran.
        rawData = list(self.workspace.workButler.registry.queryDatasets(
            'raw',
            collections=rawCollections,
            dataId={"instrument": self.dataset.instrument.getName()})) \
            if rawCollections else []

        if rawData:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles, processes=processes)
                self.log.info("Images are now ingested in {0}".format(self.workspace.repo))
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles, processes):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        processes : `int`
            The number processes to use to ingest.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            # run=None because expect ingester to name a new collection
            self.ingester.run(dataFiles, run=None, processes=processes)
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self, processes):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Parameters
        ----------
        processes : `int`
            The number processes to use to define visits.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")

        # Only define visits for exposures that don't already have one;
        # subset to exposure-only keys so the two data ID sets are comparable.
        exposureKeys = list(exposures)[0].graph
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits, processes=processes)
        else:
            self.log.info("Visits were previously defined, skipping...")

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset.
        """
        # Any file in the config directory is taken as proof of a prior copy.
        if os.listdir(self.workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in {0}".format(self.workspace.configDir))
def ingestDataset(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    # The dataset's config overrides determine all ingestion behavior.
    ingester = DatasetIngestTask(config=_getConfig(DatasetIngestTask, dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")
def ingestDatasetGen3(dataset, workspace, processes=1):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository will be created, if it
        does not already exist.
    processes : `int`
        The number of processes to use to ingest.
    """
    # NOTE(review): logger name reuses "ingestDataset"; presumably deliberate
    # so Gen 2 and Gen 3 ingestion share one log channel -- confirm.
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    # The dataset's config overrides determine all ingestion behavior.
    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run(processes=processes)
    log.info("Data ingested")
def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    # Can't use dataset.instrument.applyConfigOverrides for this, because the
    # dataset might not have Gen 3 support.
    packageDir = lsst.utils.getPackageDir(dataset.obsPackage)
    overrideFile = task._DefaultName + ".py"
    # Later directories override earlier ones: obs package defaults first,
    # then camera-specific, then the dataset's own configs.
    searchDirs = (
        os.path.join(packageDir, 'config'),
        os.path.join(packageDir, 'config', dataset.camera),
        dataset.configLocation,
    )

    config = task.ConfigClass()
    for candidate in (os.path.join(directory, overrideFile) for directory in searchDirs):
        if os.path.exists(candidate):
            config.load(candidate)
    return config
def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.IngestTask`
        The task to run.
    args : list of command-line arguments, split using Python conventions
        The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``.
    """
    parser = task.ArgumentParser(name=task.getName())
    try:
        parsed = parser.parse_args(config=task.config, args=args)
    except SystemExit as bail:
        # The parser calls sys.exit on bad input; convert that to an exception,
        # because these arguments are program-generated rather than user-supplied.
        raise ValueError("Invalid ingestion arguments: %s" % args) from bail
    else:
        task.run(parsed)
693def _findMatchingFiles(basePath, include, exclude=None):
694 """Recursively identify files matching one set of patterns and not matching another.
696 Parameters
697 ----------
698 basePath : `str`
699 The path on disk where the files in ``include`` are located.
700 include : iterable of `str`
701 A collection of files (with wildcards) to include. Must not
702 contain paths.
703 exclude : iterable of `str`, optional
704 A collection of filenames (with wildcards) to exclude. Must not
705 contain paths. If omitted, all files matching ``include`` are returned.
707 Returns
708 -------
709 files : `set` of `str`
710 The files in ``basePath`` or any subdirectory that match ``include``
711 but not ``exclude``.
712 """
713 _exclude = exclude if exclude is not None else []
715 allFiles = set()
716 for pattern in include:
717 allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))
719 for pattern in _exclude:
720 excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
721 allFiles.difference_update(excludedFiles)
722 return allFiles