Coverage for python/lsst/ap/verify/ingestion.py: 27% (93 statements)

#
# This file is part of ap_verify.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
24"""Data ingestion for ap_verify.
26This module handles ingestion of an ap_verify dataset into an appropriate repository, so
27that pipeline code need not be aware of the dataset framework.
28"""
30__all__ = ["Gen3DatasetIngestConfig", "ingestDatasetGen3"]
32import fnmatch
33import os
34import re
35import shutil
36from glob import glob
37import logging
39import lsst.utils
40import lsst.pex.config as pexConfig
41import lsst.pipe.base as pipeBase
43import lsst.daf.butler
44import lsst.obs.base
46_LOG = logging.getLogger(__name__)


class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns should be user input, but put them in a config
    # so the ap_verify dataset can configure them.
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )
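
# A hedged sketch of how a dataset's config override file might adjust these
# patterns (the filenames below are hypothetical, not taken from any real
# dataset):
#
#     config.dataFiles = ["*.fits.fz"]
#     config.dataBadFiles = ["raw_bad_amp_*.fits.fz"]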


class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of an ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # The suffix is a de facto convention for distinguishing Gen 2 and Gen 3 config overrides
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), dataset=self.dataset, workspace=self.workspace)

    def run(self, processes=1):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to ingest.
        """
        self._ensureRaws(processes=processes)
        self._defineVisits(processes=processes)
        self._copyConfigs()

    def _ensureRaws(self, processes):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to ingest, if ingestion must be run.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: the regex is a workaround for DM-25945
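        # Look for raws that a previous run already ingested into the
        # instrument's default raw collection; if no such collection exists
        # yet, skip the dataset query entirely.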
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        rawData = list(self.workspace.workButler.registry.queryDatasets(
            'raw',
            collections=rawCollections,
            dataId={"instrument": self.dataset.instrument.getName()})) \
            if rawCollections else []

        if rawData:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles, processes=processes)
                self.log.info("Images are now ingested in %s", self.workspace.repo)
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles, processes):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        processes : `int`
            The number of processes to use to ingest.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            # run=None because we expect the ingester to name a new collection.
            # HACK: update_exposure_records=True to modernize exposure records
            # from old ap_verify datasets. Since the exposure records are
            # generated from the same files, the only changes should be
            # schema-related.
            self.ingester.run(dataFiles, run=None, processes=processes, update_exposure_records=True)
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self, processes):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to define visits.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")
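
        # All data IDs returned by a query share the same dimensions, so take
        # them from any one result; visit-tagged data IDs are then projected
        # down to exposure dimensions so the set difference below is meaningful.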
        exposureKeys = list(exposures)[0].dimensions
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits)
        else:
            self.log.info("Visits were previously defined, skipping...")

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset, and the
        pipelines directory in the workspace contains all pipeline files
        from the dataset.
        """
        if os.listdir(self.workspace.pipelineDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in %s.", self.workspace.configDir)
            for pipelineFile in _findMatchingFiles(self.dataset.pipelineLocation, ['*.yaml']):
                shutil.copy2(pipelineFile, self.workspace.pipelineDir)
            self.log.info("Pipelines are now stored in %s.", self.workspace.pipelineDir)


def ingestDatasetGen3(dataset, workspace, processes=1):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository will be created, if it
        does not already exist.
    processes : `int`
        The number of processes to use to ingest.
    """
    log = _LOG.getChild("ingestDataset")

    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run(processes=processes)
    log.info("Data ingested")
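
# A minimal usage sketch; the dataset name and workspace path are hypothetical,
# and the constructor signatures are assumptions based on the parameter docs
# above:
#
#     from lsst.ap.verify.dataset import Dataset
#     from lsst.ap.verify.workspace import WorkspaceGen3
#
#     dataset = Dataset("my_dataset")                  # hypothetical name
#     workspace = WorkspaceGen3("/path/to/workspace")  # hypothetical path
#     ingestDatasetGen3(dataset, workspace, processes=4)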


def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed.
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    config = task.ConfigClass()
    dataset.instrument.applyConfigOverrides(task._DefaultName, config)
    return config
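
# Note (an assumption about the override lookup): applyConfigOverrides selects
# override files by task name, so an instrument or dataset would supply a file
# such as datasetIngest-gen3.py, matching _DefaultName above; the exact search
# path is determined by applyConfigOverrides itself.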


def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of filenames (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``.
    """
    _exclude = exclude if exclude is not None else []

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    for pattern in _exclude:
        excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
        allFiles.difference_update(excludedFiles)
    return allFiles
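
# A usage sketch for _findMatchingFiles, with hypothetical paths and patterns:
#
#     raws = _findMatchingFiles("/data/dataset/raw", ["*.fits", "*.fz"],
#                               exclude=["*_flat.fits"])
#     # -> the set of matching file paths under /data/dataset/raw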