Coverage for python/lsst/ap/verify/ingestion.py : 25%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# This file is part of ap_verify.
3#
4# Developed for the LSST Data Management System.
5# This product includes software developed by the LSST Project
6# (http://www.lsst.org).
7# See the COPYRIGHT file at the top-level directory of this distribution
8# for details of code ownership.
9#
10# This program is free software: you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation, either version 3 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program. If not, see <http://www.gnu.org/licenses/>.
22#
24"""Data ingestion for ap_verify.
26This module handles ingestion of an ap_verify dataset into an appropriate repository, so
27that pipeline code need not be aware of the dataset framework.
28"""
30__all__ = ["DatasetIngestConfig", "Gen3DatasetIngestConfig", "ingestDataset", "ingestDatasetGen3"]
32import fnmatch
33import os
34import re
35import shutil
36import tarfile
37from glob import glob
38import sqlite3
40import lsst.utils
41import lsst.log
42import lsst.pex.config as pexConfig
43import lsst.pipe.base as pipeBase
45import lsst.daf.butler
46import lsst.obs.base
47from lsst.pipe.tasks.ingest import IngestTask
48from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
49from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask
class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    # Gen 2 subtask that ingests raw science frames.
    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    # Subtask and file patterns for standard (non-curated) calibrations.
    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    # Curated calibs (e.g., defects) are ingested by a separate task from a
    # text-based tree rather than from FITS files.
    curatedCalibPaths = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
            "Each path should be a directory which contains one subdirectory per sensor."
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest curated calibs.",
    )

    # Refcats are copied by hand (see DatasetIngestTask._doIngestRefcats), so
    # there is no ingester subtask for them.
    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )
class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of an ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        # Create all configurable subtasks up front so configuration problems
        # surface at construction time rather than mid-ingestion.
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # Presence of the registry file is used as the "already ingested"
        # marker; NOTE(review): a partially-failed previous ingest that left
        # this file behind would be skipped here — confirm acceptable.
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            # Bad files are not filtered here; they are passed to the ingest
            # task via --badFile in _doIngestRaws instead.
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        # Build a command line for the ingest task; --mode link avoids
        # copying the (potentially large) raw files.
        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            # The sqlite registry enforces uniqueness; duplicate inputs
            # surface as IntegrityError.
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            # Unlike raws, bad calib files are excluded up front because the
            # calib ingester takes no --badFile argument here.
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in curatedCalibPaths. Butler
        operations on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.
        """
        # NOTE(review): both log messages repeat once per configured path;
        # confirm whether they were meant to bracket the whole loop instead.
        for curated in self.config.curatedCalibPaths:
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.
        """
        curatedargs = [repo, curatedPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the [refcat Community thread](https://community.lsst.org/t/1523)
        for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): extractall() trusts member paths inside the
            # tarball; safe only while dataset tarballs are curated. Consider
            # tarfile's filter= argument (Python 3.12+) if that changes.
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
            The location containing the config directory.
        """
        # A non-empty config directory is taken to mean a previous run
        # already copied the configs.
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            # copy2 preserves file metadata (e.g., timestamps) as well.
            shutil.copy2(configFile, destination)
class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    # Gen 3 subtask that ingests raw science frames.
    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Subtask that groups ingested exposures into visits after ingestion.
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns should be user input, but put them in a config so
    # the ap_verify dataset can configure them
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )
class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of an ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # Suffix is de-facto convention for distinguishing Gen 2 and Gen 3 config overrides
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def run(self):
        """Ingest the contents of a dataset into a Butler repository.
        """
        self._ensureRaws()
        self._defineVisits()
        self._copyConfigs()

    def _ensureRaws(self):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        # A matching collection means a previous run already ingested raws.
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        if rawCollections:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles)
                self.log.info("Images are now ingested in {0}".format(self.workspace.repo))
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            self.ingester.run(dataFiles, run=None)  # expect ingester to name a new collection
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            # Conflicting definitions indicate a file was ingested twice.
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")

        # Restrict visit-bearing data IDs to the exposure dimensions so the
        # set difference below compares like with like.
        exposureKeys = list(exposures)[0].graph
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits)
        else:
            self.log.info("Visits were previously defined, skipping...")

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset.
        """
        if os.listdir(self.workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in {0}".format(self.workspace.configDir))
def ingestDataset(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen2`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = DatasetIngestTask(config=_getConfig(DatasetIngestTask, dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")
def ingestDatasetGen3(dataset, workspace):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository is to be created, if it
        does not already exist.
    """
    # Logger named after this function; it previously reused the Gen 2
    # ``ingestDataset`` name, apparently a copy-paste slip.
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDatasetGen3")

    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run()
    log.info("Data ingested")
def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed.
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    # Can't use dataset.instrument.applyConfigOverrides for this, because the
    # dataset might not have Gen 3 support.
    obsDir = lsst.utils.getPackageDir(dataset.obsPackage)
    # Later entries override earlier ones: obs package, then camera-specific,
    # then the dataset's own configs.
    searchDirs = (
        os.path.join(obsDir, 'config'),
        os.path.join(obsDir, 'config', dataset.camera),
        dataset.configLocation,
    )
    overrideFile = task._DefaultName + ".py"

    config = task.ConfigClass()
    for candidate in (os.path.join(searchDir, overrideFile) for searchDir in searchDirs):
        if os.path.exists(candidate):
            config.load(candidate)
    return config
639def _runIngestTask(task, args):
640 """Run an ingestion task on a set of inputs.
642 Parameters
643 ----------
644 task : `lsst.pipe.tasks.IngestTask`
645 The task to run.
646 args : list of command-line arguments, split using Python conventions
647 The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``.
648 """
649 argumentParser = task.ArgumentParser(name=task.getName())
650 try:
651 parsedCmd = argumentParser.parse_args(config=task.config, args=args)
652 except SystemExit as e:
653 # SystemExit is not an appropriate response when the arguments aren't user-supplied
654 raise ValueError("Invalid ingestion arguments: %s" % args) from e
655 task.run(parsedCmd)
658def _findMatchingFiles(basePath, include, exclude=None):
659 """Recursively identify files matching one set of patterns and not matching another.
661 Parameters
662 ----------
663 basePath : `str`
664 The path on disk where the files in ``include`` are located.
665 include : iterable of `str`
666 A collection of files (with wildcards) to include. Must not
667 contain paths.
668 exclude : iterable of `str`, optional
669 A collection of filenames (with wildcards) to exclude. Must not
670 contain paths. If omitted, all files matching ``include`` are returned.
672 Returns
673 -------
674 files : `set` of `str`
675 The files in ``basePath`` or any subdirectory that match ``include``
676 but not ``exclude``.
677 """
678 _exclude = exclude if exclude is not None else []
680 allFiles = set()
681 for pattern in include:
682 allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))
684 for pattern in _exclude:
685 excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
686 allFiles.difference_update(excludedFiles)
687 return allFiles