lsst.obs.base  19.0.0-24-g940be9f+1
convertRepo.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 __all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"]
24 
25 import os
26 import fnmatch
27 from dataclasses import dataclass
28 from typing import Iterable, Optional, List, Dict
29 
30 from lsst.utils import doImport
31 from lsst.daf.butler import (
32  Butler as Butler3,
33  SkyPixDimension
34 )
35 from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
36 from lsst.pipe.base import Task
37 from lsst.skymap import skyMapRegistry, BaseSkyMap
38 
39 from ..ingest import RawIngestTask
40 from .repoConverter import ConversionSubset
41 from .rootRepoConverter import RootRepoConverter
42 from .calibRepoConverter import CalibRepoConverter
43 from .standardRepoConverter import StandardRepoConverter
44 
45 
@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself, i.e. the instance whose hash is stored in
    ``sha1``.
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """
68 
69 
class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
91 
92 
class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` and ``instrument`` properties are conveniences that read
    and write the fields of the same name on the ``raws`` sub-config, so the
    two always stay in sync.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default, which
    # contradicted ``default=False``; the default itself is left untouched so
    # existing conversions behave exactly as before.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )

    @property
    def transfer(self):
        """Value of ``raws.transfer``; assigning to this property assigns to
        ``raws.transfer``.
        """
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Value of ``raws.instrument``; assigning to this property assigns
        to ``raws.instrument``.
        """
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Override the default of the ``raws`` sub-config so that
        ``raws.transfer`` is `None`.
        """
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
230 
231 
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config: `ConvertRepoConfig`, optional
        Configuration for this task.  If `None`, a default-constructed
        ``ConfigClass`` instance is used (and validated).
    butler3: `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into.  The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        # The signature advertises ``config=None``, so build the default
        # config ourselves before validating; calling ``validate`` on `None`
        # would otherwise fail with an opaque AttributeError.
        if config is None:
            config = self.ConfigClass()
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here.
        config.validate()
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            # No raw ingest subtask, so the instrument has to be constructed
            # directly from its configured name.
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop name so the ``config`` argument is not shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-hash and by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            The skymap object; its ``getSha1()`` result is used as the hash
            key.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName: str
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion, i.e.
            whether its name matches at least one pattern in
            ``config.datasetIncludePatterns`` and no pattern in
            ``config.datasetIgnorePatterns`` (case-sensitive glob matching).
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  If the skymap's hash does not match
            any skymap already known to the task, the skymap is recorded
            under this name; it is also used for error reporting.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the skymap cannot be found by its hash even after it
            has been recorded.  Because unknown skymaps are recorded
            automatically, this is only expected if ``skyMap.getSha1()`` does
            not return a consistent value.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            # Unknown skymaps are adopted under their Gen2 name rather than
            # rejected.
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all skypix dimensions that have been marked as used to the
        given conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.  Otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with.  This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural difference between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with.  The path
            may be relative to ``root`` or absolute.  Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``).  `None` (the default) is
            treated as an empty dictionary.
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with.  The path may
            be relative to ``root`` or absolute.  Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``).  `None` (the default) is treated as an empty
            dictionary.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Make and prep converters for all Gen2 repos.  This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        # Loop variables are deliberately named differently from the ``root``
        # and ``collections`` arguments so those are not clobbered.
        for calibRoot, calibCollections in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collections=calibCollections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for rerunRoot, rerunCollections in reruns.items():
            if not os.path.isabs(rerunRoot):
                rerunRoot = os.path.join(rootConverter.root, rerunRoot)
            converter = StandardRepoConverter(task=self, root=rerunRoot, collections=rerunCollections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Actual database writes (beyond instrument registration) start here;
        # see the note on transactions above.

        # Insert dimensions needed by any converters.  These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()