lsst.obs.base  19.0.0-28-g99824a6
convertRepo.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 __all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"]
24 
25 import os
26 import fnmatch
27 from dataclasses import dataclass
28 from typing import Iterable, Optional, List, Dict
29 
30 from lsst.utils import doImport
31 from lsst.daf.butler import (
32  Butler as Butler3,
33  SkyPixDimension
34 )
35 from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
36 from lsst.pipe.base import Task
37 from lsst.skymap import skyMapRegistry, BaseSkyMap
38 
39 from ..ingest import RawIngestTask
40 from .repoConverter import ConversionSubset
41 from .rootRepoConverter import RootRepoConverter
42 from .calibRepoConverter import CalibRepoConverter
43 from .standardRepoConverter import StandardRepoConverter
44 
45 
@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself; it is the object whose ``register`` method
    is called when used skymaps are registered.
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """
68 
69 
class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skymap" that holds it - "skyMap[name].skyMap" - but that seems
    unavoidable.
    """

    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
91 
92 
class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` and ``instrument`` attributes are not fields of this
    config; they are properties that read and write the attributes of the
    same name on the ``raws`` subtask configuration.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default, which
    # contradicted the actual default below; the text now matches the value.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )

    @property
    def transfer(self):
        """Proxy for ``self.raws.transfer``."""
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Proxy for ``self.raws.instrument``."""
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Override defaults: reset ``raws.transfer`` (via the ``transfer``
        property) to `None`.
        """
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
249 
250 
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config: `ConvertRepoConfig`
        Configuration for this task.
    butler3: `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into.  The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        # ``config`` is declared optional, so build a default instance before
        # validating instead of failing with AttributeError on ``None``.
        if config is None:
            config = self.ConfigClass()
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here.
        config.validate()
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            # No raw ingest subtask; construct the instrument directly from
            # the configured class name.
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop name so the ``config`` argument is not shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-hash and by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            SkyMap instance; its ``getSha1()`` result is used as the hash key.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str):
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName: str
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  It is used for error reporting and,
            when no known skymap has the same hash, as the name under which
            ``skyMap`` is recorded.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the specified skymap cannot be found.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            # Unknown hash: record the skymap under the name it was given.
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            # Only reachable if the skymap was recorded above under a
            # different hash than the one just computed.
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the tracts of this skymap that
            overlap the visits in the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        given subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the pixelization IDs that
            overlap the visits in the subset.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with.  This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural difference between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with.  The path
            may be relative to ``root`` or absolute.  Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``).
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with.  The path may
            be relative to ``root`` or absolute.  Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``).
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            # Allow registration to fail on the assumption that this means
            # we are reusing a butler, but leave a trace in the log rather
            # than discarding the error silently.
            try:
                self.instrument.register(self.registry)
            except Exception as err:
                self.log.warn("Instrument registration failed (%s: %s); continuing on the "
                              "assumption that it is already registered.",
                              type(err).__name__, err)

        # Make and prep converters for all Gen2 repos.  This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        # Loop variables are named distinctly from the ``root`` and
        # ``collections`` arguments so those are not rebound.
        for calibRoot, calibCollections in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collections=calibCollections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for rerunRoot, rerunCollections in reruns.items():
            if not os.path.isabs(rerunRoot):
                rerunRoot = os.path.join(rootConverter.root, rerunRoot)
            converter = StandardRepoConverter(task=self, root=rerunRoot, collections=rerunCollections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Actual database writes start here.  We can't wrap these sanely in
        # transactions (yet) because we keep initializing new Butler instances
        # just so we can write into new runs/collections, and transactions
        # are managed at the Butler level (DM-21246 should let us fix this).

        # Insert dimensions needed by any converters.  These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            # Failures here remain non-fatal, as before, but are now reported
            # instead of being silently swallowed.
            try:
                converter.insertDimensionData()
            except Exception as err:
                self.log.warn("Inserting dimension data failed for %s (%s: %s); continuing.",
                              type(converter).__name__, type(err).__name__, err)

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()