Coverage for python/lsst/ap/verify/ingestion.py : 26%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# This file is part of ap_verify.
3#
4# Developed for the LSST Data Management System.
5# This product includes software developed by the LSST Project
6# (http://www.lsst.org).
7# See the COPYRIGHT file at the top-level directory of this distribution
8# for details of code ownership.
9#
10# This program is free software: you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation, either version 3 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program. If not, see <http://www.gnu.org/licenses/>.
22#
24"""Data ingestion for ap_verify.
26This module handles ingestion of a dataset into an appropriate repository, so
27that pipeline code need not be aware of the dataset framework.
28"""
30__all__ = ["DatasetIngestConfig", "ingestDataset"]
32import fnmatch
33import os
34import shutil
35import tarfile
36from glob import glob
37import sqlite3
39import lsst.utils
40import lsst.log
41import lsst.pex.config as pexConfig
42import lsst.pipe.base as pipeBase
44from lsst.pipe.tasks.ingest import IngestTask
45from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
46from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask
class DatasetIngestConfig(pexConfig.Config):
    """Configuration for `DatasetIngestTask`.

    Consult the documentation of the relevant ``obs`` package for the
    subtask targets appropriate to a given instrument.

    `DatasetIngestTask` is not meant to be run from the command line, and
    everything it needs is fixed by the choice of dataset. For that reason
    this config also carries values that would otherwise be given as
    command-line arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    # ---- raw science data ----
    dataIngester = pexConfig.ConfigurableField(
        doc="Task used to perform raw data ingestion.",
        target=IngestTask,
    )
    dataFiles = pexConfig.ListField(
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the dataset.",
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
    )
    dataBadFiles = pexConfig.ListField(
        doc=("Names of raw science files (no path; wildcards allowed) to not ingest, "
             "supersedes ``dataFiles``."),
        dtype=str,
        default=[],
    )

    # ---- calibration frames ----
    calibIngester = pexConfig.ConfigurableField(
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
        target=IngestCalibsTask,
    )
    calibFiles = pexConfig.ListField(
        doc="Names of calib files (no path; wildcards allowed) to ingest from the dataset.",
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
    )
    calibBadFiles = pexConfig.ListField(
        doc=("Names of calib files (no path; wildcards allowed) to not ingest, "
             "supersedes ``calibFiles``."),
        dtype=str,
        default=[],
    )
    calibValidity = pexConfig.Field(
        doc="Calibration validity period (days). Assumed equal for all calib types.",
        dtype=int,
        default=9999,
    )

    # ---- curated calibs ----
    curatedCalibPaths = pexConfig.ListField(
        doc=("Paths to the top level of each curated calib's tree (e.g., defects, crosstalk). "
             "Each path should be a directory which contains one subdirectory per sensor."),
        dtype=str,
        default=[],
    )
    curatedCalibIngester = pexConfig.ConfigurableField(
        doc="Task used to ingest curated calibs.",
        target=IngestCuratedCalibsTask,
    )

    # ---- reference catalogs ----
    refcats = pexConfig.DictField(
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
        keytype=str,
        itemtype=str,
        default={},
    )
class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("curatedCalibIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        The steps run in a fixed order: raws, calibs, curated calibs, refcats,
        and finally the dataset's config files.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestCuratedCalibs(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Ingestion is skipped if ``registry.sqlite3`` already exists in the
        data repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            # The search returns a set; sort it so that the order in which files
            # are handed to the ingester is reproducible from run to run.
            dataFiles = sorted(_findMatchingFiles(dataset.rawLocation, self.config.dataFiles))
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty, or if the ingester reports a
            database integrity error.
        """
        if not dataFiles:
            # Wrap in a 1-tuple so that an empty tuple is reported rather than
            # being consumed as the (empty) list of format arguments.
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)."
                               % (dataFiles,))

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Ingestion is skipped if ``calibRegistry.sqlite3`` already exists in the
        calibration repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            # Sorted for the same reason as the raw files: reproducible ordering.
            calibDataFiles = sorted(_findMatchingFiles(dataset.calibLocation,
                                                       self.config.calibFiles,
                                                       self.config.calibBadFiles))
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty, or if the ingester reports
            a database integrity error.
        """
        if not calibDataFiles:
            # 1-tuple keeps the message well-formed even for an empty tuple.
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % (calibDataFiles,))

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestCuratedCalibs(self, dataset, workspace):
        """Ingest the curated calib files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all curated calibs mentioned in curatedCalibPaths. Butler
        operations on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run. Not used by this
            method; the paths come from the task config.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.
        """
        # One ingest run (and one pair of log messages) per configured path.
        for curated in self.config.curatedCalibPaths:
            self.log.info("Ingesting curated calibs...")
            self._doIngestCuratedCalibs(workspace.dataRepo, workspace.calibRepo, curated)
            self.log.info("Curated calibs are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestCuratedCalibs(self, repo, calibRepo, curatedPath):
        """Ingest curated calib data.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        curatedPath : `str`
            Path to the curated calibs in standard text form. This is probably
            a path in ``obs_*_data``.

        Raises
        ------
        RuntimeError
            Raised if the ingester reports a database integrity error.
        """
        curatedargs = [repo, curatedPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.curatedCalibIngester, curatedargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all curated calib files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Ingestion is skipped if a ``ref_cats`` entry already exists in the
        data repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the
        `refcat Community thread <https://community.lsst.org/t/1523>`_
        for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Each tarball named in ``config.refcats`` is unpacked into
        ``<repo>/ref_cats/<refcat name>``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): extractall() writes whatever member paths the archive
            # declares; this assumes dataset tarballs are trusted -- confirm.
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Copying is skipped if the config directory already contains anything.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing the config directory.
        """
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        # The search is recursive, but every match is copied directly into
        # ``destination`` (subdirectory structure is not reproduced).
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)
def ingestDataset(dataset, workspace):
    """Ingest the contents of a dataset into a Butler repository.

    A `DatasetIngestTask` is built from the config associated with
    ``dataset`` and run once. The original data directory shall not be
    modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.Workspace`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingestConfig = _getConfig(dataset)
    DatasetIngestTask(config=ingestConfig).run(dataset, workspace)
    log.info("Data ingested")
def _getConfig(dataset):
    """Return the ingestion config associated with a specific dataset.

    Starting from a default config, override files named after the task
    (``datasetIngest.py``) are loaded, when present, from the ``obs``
    package's ``config`` directory, then from its camera-specific
    subdirectory, then from the dataset's own config directory.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : `DatasetIngestConfig`
        The config for running `DatasetIngestTask` on ``dataset``.
    """
    overrideFile = DatasetIngestTask._DefaultName + ".py"
    obsConfigDir = os.path.join(lsst.utils.getPackageDir(dataset.obsPackage), 'config')

    config = DatasetIngestTask.ConfigClass()
    # Files are loaded in this order; presumably a later file can override
    # values set by an earlier one.
    searchDirs = (
        obsConfigDir,
        os.path.join(obsConfigDir, dataset.camera),
        dataset.configLocation,
    )
    for directory in searchDirs:
        candidate = os.path.join(directory, overrideFile)
        if not os.path.exists(candidate):
            continue
        config.load(candidate)
    return config
def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    The arguments are parsed with the task's own ``ArgumentParser`` and the
    parsed result is passed to ``task.run``.

    Parameters
    ----------
    task : `lsst.pipe.tasks.IngestTask`
        The task to run.
    args : `list` of `str`
        The command-line arguments for ``task``, split using Python
        conventions. Must be compatible with ``task.ArgumentParser``.

    Raises
    ------
    ValueError
        Raised if the argument parser rejects ``args`` (i.e., tries to exit).
    """
    argumentParser = task.ArgumentParser(name=task.getName())
    try:
        parsedCmd = argumentParser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied.
        # Format via a 1-tuple so that a tuple of arguments is reported as a
        # whole instead of being unpacked as multiple format operands.
        raise ValueError("Invalid ingestion arguments: %s" % (args,)) from e
    task.run(parsedCmd)
def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located. It is
        always taken literally, even if it contains characters that are
        special in glob patterns (``*``, ``?``, ``[``).
    include : iterable of `str`
        A collection of files (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``.
    """
    # Only the ``glob`` function is imported at module level, so fetch
    # ``escape`` here rather than relying on a module-level name.
    from glob import escape

    # Escape the directory so that only the ``include`` patterns are treated
    # as wildcards; otherwise a path such as "data[1]" silently matches nothing.
    searchRoot = os.path.join(escape(basePath), '**')

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(searchRoot, pattern), recursive=True))

    excludePatterns = list(exclude) if exclude is not None else []
    if not excludePatterns:
        return allFiles

    # Exclusion patterns apply to the bare filename, never to the directory part.
    return {
        path for path in allFiles
        if not any(fnmatch.fnmatch(os.path.basename(path), pattern) for pattern in excludePatterns)
    }