Coverage for python/lsst/ap/verify/ingestion.py : 26%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# This file is part of ap_verify.
3#
4# Developed for the LSST Data Management System.
5# This product includes software developed by the LSST Project
6# (http://www.lsst.org).
7# See the COPYRIGHT file at the top-level directory of this distribution
8# for details of code ownership.
9#
10# This program is free software: you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation, either version 3 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program. If not, see <http://www.gnu.org/licenses/>.
22#
24"""Data ingestion for ap_verify.
26This module handles ingestion of a dataset into an appropriate repository, so
27that pipeline code need not be aware of the dataset framework.
28"""
30__all__ = ["DatasetIngestConfig", "ingestDataset"]
32import fnmatch
33import os
34import shutil
35import tarfile
36from contextlib import contextmanager
37from glob import glob
38import sqlite3
40import lsst.utils
41import lsst.log
42import lsst.pex.config as pexConfig
43import lsst.pipe.base as pipeBase
45from lsst.pipe.tasks.ingest import IngestTask
46from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
47from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask
class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    # ---- Raw science data ------------------------------------------------
    # Subtask that `DatasetIngestTask._doIngestRaws` drives through its
    # argument parser.
    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    # Patterns searched for recursively under the dataset's raw location.
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the dataset.",
    )
    # Not applied while searching for files; forwarded to the raw ingester
    # after a ``--badFile`` flag when non-empty.
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    # ---- Calibration products --------------------------------------------
    # Subtask that `DatasetIngestTask._doIngestCalibs` drives through its
    # argument parser.
    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    # Patterns searched for recursively under the dataset's calib location.
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the dataset.",
    )
    # Unlike ``dataBadFiles``, these are removed from the search results
    # before the calib ingester is invoked.
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    # Stringified and passed to the calib ingester as ``--validity``.
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.")

    # ---- Defects ---------------------------------------------------------
    # Passed verbatim as the second positional argument of the defect
    # ingester. NOTE(review): the default is an empty string and nothing in
    # this module checks for it before invoking the ingester -- confirm that
    # every dataset overrides this value.
    textDefectPath = pexConfig.Field(
        dtype=str,
        default='',
        doc="Path to top level of the defect tree.  This is a directory with a directory per sensor."
    )
    defectIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest defects.",
    )

    # ---- Reference catalogs ----------------------------------------------
    # Each key names a subdirectory of ``ref_cats`` in the data repository;
    # each value is a tarball filename resolved against the dataset's refcat
    # directory.
    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )
class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    dataset to load and the repositories to ingest to.

    Notes
    -----
    Every ingestion step first checks the workspace for a marker left by a
    previous run (a registry file, a directory, or a non-empty config
    directory) and is skipped when the marker is present.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        # One subtask per kind of product; the targets come from the config.
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("defectIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Raws, calibs, defects, reference catalogs and configs are handled in
        that order.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # We're assuming ingest tasks always give absolute path to butler
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestDefects(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest, or if the raw files are
            not unique.
        """
        # An existing raw registry marks a previous ingestion run.
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
            return

        self.log.info("Ingesting raw images...")
        # ``dataBadFiles`` is deliberately not applied here; it is forwarded
        # to the ingester by `_doIngestRaws`.
        rawFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
        if not rawFiles:
            raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

        self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                           rawFiles, self.config.dataBadFiles)
        self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : collection of `str`
            The filenames to ingest (a `list` or a `set`). May contain
            wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty, or if the ingester reports an
            `sqlite3.IntegrityError` (files are not unique).
        """
        if not dataFiles:
            # The 1-tuple keeps the formatting valid even if ``dataFiles`` is
            # itself an (empty) tuple.
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)."
                               % (dataFiles,))

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest, or if the calibration
            files are not unique.
        """
        # An existing calib registry marks a previous ingestion run.
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
            return

        self.log.info("Ingesting calibration files...")
        calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                            self.config.calibFiles, self.config.calibBadFiles)
        if not calibDataFiles:
            raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

        self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
        self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
            workspace.dataRepo, workspace.calibRepo))

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : collection of `str`
            The filenames to ingest (a `list` or a `set`). Supported files
            vary by instrument but may include flats, biases, darks, fringes,
            or sky. May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty, or if the ingester reports
            an `sqlite3.IntegrityError` (files are not unique).
        """
        if not calibDataFiles:
            # The 1-tuple keeps the formatting valid even if
            # ``calibDataFiles`` is itself an (empty) tuple.
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % (calibDataFiles,))

        # TODO: --output is workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestDefects(self, dataset, workspace):
        """Ingest the defect files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all defects from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run. Not read by this
            method; the defect location comes from ``config.textDefectPath``.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if the defect files are not unique.
        """
        # An existing ``defects`` directory marks a previous ingestion run.
        if os.path.exists(os.path.join(workspace.calibRepo, "defects")):
            self.log.info("Defects were previously ingested, skipping...")
            return

        self.log.info("Ingesting defects...")
        self._doIngestDefects(workspace.dataRepo, workspace.calibRepo, self.config.textDefectPath)
        self.log.info("Defects are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestDefects(self, repo, calibRepo, defectPath):
        """Ingest defect images.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        defectPath : `str`
            Path to the defects in standard text form.  This is probably a path in ``obs_decam_data``.

        Raises
        ------
        RuntimeError
            Raised if the ingester reports an `sqlite3.IntegrityError`
            (defect files are not unique).
        """
        defectargs = [repo, defectPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.defectIngester, defectargs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all defect files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the [refcat Community thread](https://community.lsst.org/t/1523)
        for more details.
        """
        # An existing ``ref_cats`` directory marks a previous ingestion run.
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
            return

        self.log.info("Ingesting reference catalogs...")
        self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
        self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Each entry of ``config.refcats`` is unpacked into
        ``<repo>/ref_cats/<refcat name>``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            # NOTE(review): extractall() does not validate member paths, so
            # the tarballs are trusted to be well-formed dataset content.
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing the config directory.
        """
        # Any existing entry in the config directory counts as "already copied".
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
            return

        self.log.info("Storing data-specific configs...")
        self._doCopyConfigs(workspace.configDir, dataset.configLocation)
        self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular repository.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files. Searched recursively
            for ``*.py`` files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)
def ingestDataset(dataset, workspace):
    """Ingest the contents of a dataset into a Butler repository.

    The dataset's own ingestion config is looked up, a `DatasetIngestTask` is
    built from it, and the task is run on ``workspace``. The original data
    directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.Workspace`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    logger = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    datasetConfig = _getConfig(dataset)
    DatasetIngestTask(config=datasetConfig).run(dataset, workspace)
    logger.info("Data ingested")
def _getConfig(dataset):
    """Return the ingestion config associated with a specific dataset.

    Override files named after the task's default name are loaded, when they
    exist, from the ``obs`` package's ``config`` directory, then from its
    camera-specific subdirectory, then from the dataset's own config
    location. Later files are loaded on top of earlier ones.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : `DatasetIngestConfig`
        The config for running `DatasetIngestTask` on ``dataset``.
    """
    overrideName = DatasetIngestTask._DefaultName + ".py"
    obsRoot = lsst.utils.getPackageDir(dataset.obsPackage)

    config = DatasetIngestTask.ConfigClass()
    searchDirs = (
        os.path.join(obsRoot, 'config'),
        os.path.join(obsRoot, 'config', dataset.camera),
        dataset.configLocation,
    )
    for directory in searchDirs:
        candidate = os.path.join(directory, overrideName)
        if not os.path.exists(candidate):
            continue
        config.load(candidate)
    return config
def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.IngestTask`
        The task to run.
    args : list of command-line arguments, split using Python conventions
        The command-line arguments for ``task``. Must be compatible with ``task.ArgumentParser``.

    Raises
    ------
    ValueError
        Raised if the task's argument parser rejects ``args``.
    """
    argumentParser = task.ArgumentParser(name=task.getName())
    try:
        parsedCmd = argumentParser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied.
        # The 1-tuple keeps the message formatting valid when ``args`` is
        # itself a tuple; a bare ``% args`` would raise TypeError here and
        # hide the real problem.
        raise ValueError("Invalid ingestion arguments: %s" % (args,)) from e
    task.run(parsedCmd)
def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of files (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``. Each entry is prefixed with ``basePath``.
    """
    # Materialize once so a one-shot iterable can be reused for every file.
    excludePatterns = tuple(exclude) if exclude is not None else ()

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    if not excludePatterns:
        return allFiles

    # Exclusion patterns are path-free by contract, so they are compared to
    # the file name only. Matching the full path would let a pattern such as
    # "bad.fits" or "raw_*" silently exclude nothing.
    return {
        path for path in allFiles
        if not any(fnmatch.fnmatch(os.path.basename(path), pattern) for pattern in excludePatterns)
    }
@contextmanager
def _tempChDir(newDir):
    """Run the body of a ``with`` statement inside another working directory.

    The working directory in effect on entry is restored on exit, whether the
    body finishes normally or raises. That guarantee does not cover
    exceptions raised by ``_tempChDir`` itself (see below).

    This context manager cannot be used with "with ... as" statements.

    Parameters
    ----------
    newDir : `str`
        The directory to change to for the duration of a ``with`` statement.

    Raises
    ------
    OSError
        Raised if either the program cannot change to ``newDir``, or if it
        cannot undo the change. Failing to change to ``newDir`` is
        exception-safe (no side effects), but failing to undo is
        not recoverable.
    """
    # Record the origin before moving so that it can always be restored.
    originalDir = os.path.abspath(os.getcwd())
    os.chdir(newDir)
    try:
        yield
    finally:
        os.chdir(originalDir)