Coverage for python/lsst/ap/verify/ingestion.py : 25%

#
# This file is part of ap_verify.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""Data ingestion for ap_verify.

This module handles ingestion of a dataset into an appropriate repository, so
that pipeline code need not be aware of the dataset framework.
"""

__all__ = ["DatasetIngestConfig", "ingestDataset"]

import fnmatch
import os
import shutil
import tarfile
from glob import glob
import sqlite3

import lsst.utils
import lsst.log
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
from lsst.pipe.tasks.ingest import IngestTask
from lsst.pipe.tasks.ingestCalibs import IngestCalibsTask
from lsst.pipe.tasks.ingestCuratedCalibs import IngestCuratedCalibsTask


class DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `DatasetIngestTask`.

    The correct targets for this task's subtasks can be found in the
    documentation of the appropriate ``obs`` package.

    Because `DatasetIngestTask` is not designed to be run from the command line,
    and its arguments are completely determined by the choice of dataset,
    this config includes settings that would normally be passed as command-line
    arguments to `~lsst.pipe.tasks.ingest.IngestTask`.
    """

    dataIngester = pexConfig.ConfigurableField(
        target=IngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) to not ingest, "
            "supersedes ``dataFiles``.",
    )

    calibIngester = pexConfig.ConfigurableField(
        target=IngestCalibsTask,
        doc="Task used to ingest flats, biases, darks, fringes, or sky.",
    )
    calibFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of calib files (no path; wildcards allowed) to ingest from the dataset.",
    )
    calibBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of calib files (no path; wildcards allowed) to not ingest, supersedes ``calibFiles``.",
    )
    calibValidity = pexConfig.Field(
        dtype=int,
        default=9999,
        doc="Calibration validity period (days). Assumed equal for all calib types.",
    )

    textDefectPath = pexConfig.Field(
        dtype=str,
        default=None,
        optional=True,
        doc="Path to top level of the defect tree. This is a directory with a directory per sensor. "
            "Set to None to disable defect ingestion.",
    )
    defectIngester = pexConfig.ConfigurableField(
        target=IngestCuratedCalibsTask,
        doc="Task used to ingest defects.",
    )

    refcats = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc="Map from a refcat name to a tar.gz file containing the sharded catalog. May be empty.",
    )
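
# A minimal sketch of what a dataset's ``datasetIngest.py`` config override
# might contain (the file name follows ``DatasetIngestTask._DefaultName``; the
# refcat name, tarball filename, defect path, and wildcard below are
# illustrative assumptions, not values shipped with any real dataset):
#
#     config.calibValidity = 9999
#     config.refcats = {"gaia": "gaia_shards.tar.gz"}
#     config.textDefectPath = "/path/to/obs_package_data/defects"
#     config.dataBadFiles.append("bad_exposure_*.fits")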


class DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of a dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    dataset to load and the repositories to ingest to.
    """

    ConfigClass = DatasetIngestConfig
    _DefaultName = "datasetIngest"

    def __init__(self, *args, **kwargs):
        pipeBase.Task.__init__(self, *args, **kwargs)
        self.makeSubtask("dataIngester")
        self.makeSubtask("calibIngester")
        self.makeSubtask("defectIngester")

    def run(self, dataset, workspace):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset to be ingested.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The abstract location where ingestion repositories will be created.
            If the repositories already exist, they must support the same
            ``obs`` package as this task's subtasks.
        """
        # Ingest tasks are assumed to always give the Butler an absolute path,
        # so make the calib repository path absolute here.
        dataset.makeCompatibleRepo(workspace.dataRepo, os.path.abspath(workspace.calibRepo))
        self._ingestRaws(dataset, workspace)
        self._ingestCalibs(dataset, workspace)
        self._ingestDefects(dataset, workspace)
        self._ingestRefcats(dataset, workspace)
        self._copyConfigs(dataset, workspace)

    def _ingestRaws(self, dataset, workspace):
        """Ingest the science data for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all science data from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "registry.sqlite3")):
            self.log.info("Raw images were previously ingested, skipping...")
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(dataset.rawLocation, self.config.dataFiles)
            if dataFiles:
                self._doIngestRaws(workspace.dataRepo, workspace.calibRepo,
                                   dataFiles, self.config.dataBadFiles)
                self.log.info("Images are now ingested in {0}".format(workspace.dataRepo))
            else:
                raise RuntimeError("No raw files found at %s." % dataset.rawLocation)

    def _doIngestRaws(self, repo, calibRepo, dataFiles, badFiles):
        """Ingest raw images into a repository.

        ``repo`` shall be populated with *links* to ``dataFiles``.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output calibration repository location on disk.
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        badFiles : `list` of `str`
            A list of filenames to exclude from ingestion. Must not contain paths.
            May contain wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        args = [repo, "--calib", calibRepo, "--mode", "link"]
        args.extend(dataFiles)
        if badFiles:
            args.append('--badFile')
            args.extend(badFiles)
        try:
            _runIngestTask(self.dataIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all raw files are unique") from detail
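
    # For reference, the argument list built above mirrors what one would pass
    # to the gen2 ``ingestImages.py`` driver on the command line; the paths and
    # file names below are purely illustrative:
    #
    #     ["/workspace/ingested", "--calib", "/workspace/calibingested",
    #      "--mode", "link", "raw1.fits", "raw2.fits", "--badFile", "raw_bad.fits"]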

    def _ingestCalibs(self, dataset, workspace):
        """Ingest the calibration files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all calibration data from ``dataset``. Butler operations
        on the repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "calibRegistry.sqlite3")):
            self.log.info("Calibration files were previously ingested, skipping...")
        else:
            self.log.info("Ingesting calibration files...")
            calibDataFiles = _findMatchingFiles(dataset.calibLocation,
                                                self.config.calibFiles, self.config.calibBadFiles)
            if calibDataFiles:
                self._doIngestCalibs(workspace.dataRepo, workspace.calibRepo, calibDataFiles)
                self.log.info("Calibrations corresponding to {0} are now ingested in {1}".format(
                    workspace.dataRepo, workspace.calibRepo))
            else:
                raise RuntimeError("No calib files found at %s." % dataset.calibLocation)

    def _doIngestCalibs(self, repo, calibRepo, calibDataFiles):
        """Ingest calibration images into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        calibDataFiles : `list` of `str`
            A list of filenames to ingest. Supported files vary by instrument
            but may include flats, biases, darks, fringes, or sky. May contain
            wildcards.

        Raises
        ------
        RuntimeError
            Raised if ``calibDataFiles`` is empty.
        """
        if not calibDataFiles:
            raise RuntimeError("No calib files to ingest (expected list of filenames, got %r)."
                               % calibDataFiles)

        # TODO: --output is a workaround for DM-11668
        args = [repo, "--calib", calibRepo, "--output", os.path.join(calibRepo, "dummy"),
                "--mode", "link", "--validity", str(self.config.calibValidity)]
        args.extend(calibDataFiles)
        try:
            _runIngestTask(self.calibIngester, args)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all calibration files are unique") from detail

    def _ingestDefects(self, dataset, workspace):
        """Ingest the defect files for use by LSST.

        After this method returns, the calibration repository in ``workspace``
        shall contain all defects from ``dataset``. Butler operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Raises
        ------
        RuntimeError
            Raised if defect ingestion is requested but no defects are found.
        """
        if os.path.exists(os.path.join(workspace.calibRepo, "defects")):
            self.log.info("Defects were previously ingested, skipping...")
        elif self.config.textDefectPath:
            self.log.info("Ingesting defects...")
            self._doIngestDefects(workspace.dataRepo, workspace.calibRepo, self.config.textDefectPath)
            self.log.info("Defects are now ingested in {0}".format(workspace.calibRepo))

    def _doIngestDefects(self, repo, calibRepo, defectPath):
        """Ingest defects into a calibration repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        calibRepo : `str`
            The output repository location on disk for calibration files. Must
            exist.
        defectPath : `str`
            Path to the defects in standard text form. This is probably a path in ``obs_*_data``.

        Raises
        ------
        RuntimeError
            Raised if the files under ``defectPath`` are not unique.
        """
        defectArgs = [repo, defectPath, "--calib", calibRepo]
        try:
            _runIngestTask(self.defectIngester, defectArgs)
        except sqlite3.IntegrityError as detail:
            raise RuntimeError("Not all defect files are unique") from detail

    def _ingestRefcats(self, dataset, workspace):
        """Ingest the refcats for use by LSST.

        After this method returns, the data repository in ``workspace`` shall
        contain all reference catalogs from ``dataset``. Operations on the
        repository shall not be able to modify ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing all ingestion repositories.

        Notes
        -----
        Refcats are not, at present, registered as part of the repository. They
        are not guaranteed to be visible to anything other than a
        ``refObjLoader``. See the refcat discussion on the LSST Community forum
        (https://community.lsst.org/t/1523) for more details.
        """
        if os.path.exists(os.path.join(workspace.dataRepo, "ref_cats")):
            self.log.info("Refcats were previously ingested, skipping...")
        else:
            self.log.info("Ingesting reference catalogs...")
            self._doIngestRefcats(workspace.dataRepo, dataset.refcatsLocation)
            self.log.info("Reference catalogs are now ingested in {0}".format(workspace.dataRepo))

    def _doIngestRefcats(self, repo, refcats):
        """Place refcats inside a particular repository.

        Parameters
        ----------
        repo : `str`
            The output repository location on disk for raw images. Must exist.
        refcats : `str`
            A directory containing .tar.gz files with LSST-formatted astrometric
            or photometric reference catalog information.
        """
        for refcatName, tarball in self.config.refcats.items():
            tarball = os.path.join(refcats, tarball)
            refcatDir = os.path.join(repo, "ref_cats", refcatName)
            with tarfile.open(tarball, "r") as opened:
                opened.extractall(refcatDir)
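
    # Illustrative example (the refcat name and filename are hypothetical):
    # with ``config.refcats = {"gaia": "gaia_shards.tar.gz"}`` and
    # ``refcats = "/path/to/dataset/refcats"``, the tarball
    # ``/path/to/dataset/refcats/gaia_shards.tar.gz`` is unpacked into
    # ``<repo>/ref_cats/gaia/``, where a ``refObjLoader`` expects to find it.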

    def _copyConfigs(self, dataset, workspace):
        """Give a workspace a copy of all configs associated with the ingested data.

        After this method returns, the config directory in ``workspace`` shall
        contain all config files from ``dataset``.

        Parameters
        ----------
        dataset : `lsst.ap.verify.dataset.Dataset`
            The dataset on which the pipeline will be run.
        workspace : `lsst.ap.verify.workspace.Workspace`
            The location containing the config directory.
        """
        if os.listdir(workspace.configDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            self._doCopyConfigs(workspace.configDir, dataset.configLocation)
            self.log.info("Configs are now stored in {0}".format(workspace.configDir))

    def _doCopyConfigs(self, destination, source):
        """Place configs inside a particular directory.

        Parameters
        ----------
        destination : `str`
            The directory to which the configs must be copied. Must exist.
        source : `str`
            A directory containing Task config files.
        """
        for configFile in _findMatchingFiles(source, ['*.py']):
            shutil.copy2(configFile, destination)


def ingestDataset(dataset, workspace):
    """Ingest the contents of a dataset into a Butler repository.

    The original data directory shall not be modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.Workspace`
        The abstract location where ingestion repositories will be created.
        If the repositories already exist, they must be compatible with
        ``dataset`` (in particular, they must support the relevant
        ``obs`` package).
    """
    # TODO: generalize to support arbitrary URIs (DM-11482)
    log = lsst.log.Log.getLogger("ap.verify.ingestion.ingestDataset")

    ingester = DatasetIngestTask(config=_getConfig(dataset))
    ingester.run(dataset, workspace)
    log.info("Data ingested")
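
# A minimal usage sketch, assuming the dataset and workspace have been set up
# elsewhere; the dataset name, workspace path, and the Dataset/Workspace
# constructor signatures shown here are illustrative assumptions:
#
#     from lsst.ap.verify.dataset import Dataset
#     from lsst.ap.verify.workspace import Workspace
#
#     dataset = Dataset("HiTS2015")
#     workspace = Workspace("/scratch/ap_verify_run")
#     ingestDataset(dataset, workspace)  # repositories are created as needed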


def _getConfig(dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : `DatasetIngestConfig`
        The config for running `DatasetIngestTask` on ``dataset``.
    """
    overrideFile = DatasetIngestTask._DefaultName + ".py"
    packageDir = lsst.utils.getPackageDir(dataset.obsPackage)

    config = DatasetIngestTask.ConfigClass()
    for path in [
        os.path.join(packageDir, 'config'),
        os.path.join(packageDir, 'config', dataset.camera),
        dataset.configLocation,
    ]:
        overridePath = os.path.join(path, overrideFile)
        if os.path.exists(overridePath):
            config.load(overridePath)
    return config
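
# For example (hypothetical paths), a DECam-based dataset would pick up
# overrides in this order, with later files overriding earlier ones:
#
#     <obs_decam>/config/datasetIngest.py
#     <obs_decam>/config/<camera>/datasetIngest.py
#     <dataset>/config/datasetIngest.py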


def _runIngestTask(task, args):
    """Run an ingestion task on a set of inputs.

    Parameters
    ----------
    task : `lsst.pipe.tasks.ingest.IngestTask`
        The task to run.
    args : `list` of `str`
        The command-line arguments for ``task``, split as a shell would split
        them. Must be compatible with ``task.ArgumentParser``.
    """
    argumentParser = task.ArgumentParser(name=task.getName())
    try:
        parsedCmd = argumentParser.parse_args(config=task.config, args=args)
    except SystemExit as e:
        # SystemExit is not an appropriate response when the arguments aren't user-supplied
        raise ValueError("Invalid ingestion arguments: %s" % args) from e
    task.run(parsedCmd)


def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of files (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``.
    """
    _exclude = exclude if exclude is not None else []

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    for pattern in _exclude:
        excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
        allFiles.difference_update(excludedFiles)
    return allFiles
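
# A quick sketch of the matching semantics, assuming a hypothetical directory
# tree under /data/raw containing visit-123.fits and visit-124.fz:
#
#     _findMatchingFiles("/data/raw", ["*.fits", "*.fz"], exclude=["visit-124*"])
#     # -> {"/data/raw/visit-123.fits"}  (exclusion patterns match basenames only)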