Coverage for python/lsst/ap/verify/ingestion.py : 25%

#
# This file is part of ap_verify.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""Data ingestion for ap_verify.

This module handles ingestion of a dataset into an appropriate repository, so
that pipeline code need not be aware of the dataset framework.
"""

__all__ = ["DatasetIngestConfig", "ingestDataset"]

import fnmatch
import os
import shutil
import tarfile
from glob import glob
import sqlite3

import lsst.utils
import lsst.log
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase

from lsst.pipe.tasks.ingest import IngestTask
from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask


class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """
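
    # A minimal sketch of how a dataset might override these settings in its
    # datasetIngest.py config file (the values below are hypothetical, not
    # defaults):
    #
    #     config.dataFiles = ["*.fits.fz"]
    #     config.calibValidity = 999
    #     config.refcats = {"gaia": "gaia_refcat.tar.gz"}
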
    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    textDefectPath = pexConfig.Field(
        dtype=str,
        default='',
        doc="Path to top level of the defect tree. This is a directory with a directory per sensor."
    )
    defectIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest defects.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )


class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("defectIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give an absolute path to the butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestDefects(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)
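
    # Note: each _ingest* step above checks whether its output already exists
    # and skips itself if so, so run() can be re-invoked on a partially
    # populated workspace without re-ingesting data.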

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail
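
    # For illustration only (paths and filenames hypothetical): the argument
    # list built above would look like
    #     ["/workspace/ingested", "--calib", "/workspace/calibingested",
    #      "--mode", "link", "raw_001.fits", "raw_002.fits",
    #      "--badFile", "raw_bad*.fits"]
    # i.e., the same style of command line the underlying IngestTask argument
    # parser expects.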

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail
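
    # Illustrative sketch only (paths hypothetical): the argument list above
    # might be
    #     ["/workspace/ingested", "--calib", "/workspace/calibingested",
    #      "--output", "/workspace/calibingested/dummy",
    #      "--mode", "link", "--validity", "9999", "flat_001.fits", ...]
    # where "--validity" comes from config.calibValidity and the "--output"
    # dummy directory is the DM-11668 workaround noted above.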

    def _ingestDefects(self, dataset, workspace):
        """Ingest the defect files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all defects from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if defect ingestion requested but no defects found.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "defects")):
            self.log.info("Defects were previously ingested, skipping...")
        else:
            self.log.info("Ingesting defects...")
            self._doIngestDefects(workspace.dataRepo, workspace.calibRepo, self.config.textDefectPath)
            self.log.info("Defects are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestDefects(self, repo, calibRepo, defectPath):
        """Ingest defect images.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        defectPath : `str`
            Path to the defects in standard text form. This is probably a path
            in ``obs_decam_data``.

        Raises
        ------
        RuntimeError
            Raised if the defect files are not unique.
        """
        defectargs = [repo, defectPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.defectIngester, defectargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all defect files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the refcat Community thread
        (https://community.lsst.org/t/1523) for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)
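
    # For example (names hypothetical): with config.refcats set to
    # {"gaia": "gaia_refcat.tar.gz"}, the shards in gaia_refcat.tar.gz are
    # unpacked into <repo>/ref_cats/gaia/, the directory whose existence
    # _ingestRefcats checks above.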

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing the config directory.
        """
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular directory.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)


def ingestDataset(dataset, workspace):
    """Ingest the contents of a dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.Workspace`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = DatasetIngestTask(config=_getConfig(dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")
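
# A minimal usage sketch, assuming a dataset name and a workspace path; the
# Dataset and Workspace constructor signatures shown here are assumptions,
# not part of this module:
#
#     from lsst.ap.verify.dataset import Dataset
#     from lsst.ap.verify.workspace import Workspace
#
#     ingestDataset(Dataset("my_dataset"), Workspace("/path/to/workspace"))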


def _getConfig(dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : `DatasetIngestConfig`
        The config for running `DatasetIngestTask` on ``dataset``.
    """
    overrideFile = DatasetIngestTask._DefaultName + ".py"
    packageDir = lsst.utils.getPackageDir(dataset.obsPackage)

    config = DatasetIngestTask.ConfigClass()
    for path in [
        os.path.join(packageDir, 'config'),
        os.path.join(packageDir, 'config', dataset.camera),
        dataset.configLocation,
    ]:
        overridePath = os.path.join(path, overrideFile)
        if os.path.exists(overridePath):
            config.load(overridePath)
    return config
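
# Note on _getConfig: override files are loaded in the order listed above, and
# a later load overrides any field also set by an earlier one, so the
# camera-specific obs config takes precedence over the package-wide one, and
# the dataset's own config (dataset.configLocation) takes precedence over both.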


def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.ingest.IngestTask`
        The task to run.
    args : `list` of `str`
        The command-line arguments for ``task``, already split into individual
        tokens. Must be compatible with ``task.ArgumentParser``.
    """
    argumentParser = task.ArgumentParser(name=task.getName())
    try:
        parsedCmd = argumentParser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied
        raise ValueError("Invalid ingestion arguments: %s" % args) from e
    task.run(parsedCmd)


def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of files (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``.
    """
    _exclude = exclude if exclude is not None else []

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    for pattern in _exclude:
        excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
        allFiles.difference_update(excludedFiles)
    return allFiles
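
# For illustration (paths and patterns hypothetical):
#     _findMatchingFiles("/data/raw", ["*.fits", "*.fz"], exclude=["bad_*.fits"])
# returns every *.fits or *.fz file anywhere under /data/raw whose basename
# does not match bad_*.fits.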