lsst.obs.base  19.0.0-22-g282de62
convertRepo.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 __all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"]
24 
25 import os
26 import fnmatch
27 from dataclasses import dataclass
28 from typing import Iterable, Optional, List, Dict
29 
30 from lsst.utils import doImport
31 from lsst.daf.butler import (
32  Butler as Butler3,
33  SkyPixDimension
34 )
35 from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
36 from lsst.pipe.base import Task
37 from lsst.skymap import skyMapRegistry, BaseSkyMap
38 
39 from ..ingest import RawIngestTask
40 from .repoConverter import ConversionSubset
41 from .rootRepoConverter import RootRepoConverter
42 from .calibRepoConverter import CalibRepoConverter
43 from .standardRepoConverter import StandardRepoConverter
44 
45 
@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap instance itself, as constructed from configuration.
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """
68 
69 
class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
91 
92 
class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` and ``instrument`` properties are conveniences that
    forward to the same-named fields of the ``raws`` subtask configuration.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default; the actual
    # default is (and remains) False, so the text is corrected to match.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )

    @property
    def transfer(self):
        """Transfer mode; forwarded to ``self.raws.transfer``."""
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Instrument setting; forwarded to ``self.raws.instrument``."""
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Set default field values; the transfer mode defaults to `None`."""
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
214 
215 
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config: `ConvertRepoConfig`
        Configuration for this task.
    butler3: `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into. The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository. Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            # No raw ingest subtask; construct the instrument directly from
            # the configured class name instead.
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        # Index the configured skymaps both by hash (to recognize skymaps
        # found in Gen2 repos) and by name.
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop variable so the ``config`` argument is not
        # shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
            self._configuredSkyMapsBySha1[struct.sha1] = struct
            self._configuredSkyMapsByName[struct.name] = struct
        self._usedSkyPix = set()

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName: str
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if no configured skymap has the same SHA1 hash as
            ``skyMap``.
        """
        sha1 = skyMap.getSha1()
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            raise LookupError(f"SkyMap with sha1={sha1} not included in configuration.") from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the tracts of this skymap that
            overlap the visits in the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        given subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the pixelization IDs that
            overlap the visits in the subset.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository. This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with. This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural difference between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with. The path
            may be relative to ``root`` or absolute. Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``). `None` (default) is treated
            as an empty dictionary.
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with. The path may
            be relative to ``root`` or absolute. Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``). `None` (default) is treated as an empty
            dictionary.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert. If not provided, all visits
            in the Gen2 root repository will be converted.
        """

        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Make and prep converters for all Gen2 repos. This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        # The loops below use their own variable names so that the ``root``
        # and ``collections`` arguments are not rebound.
        for calibRoot, calibCollections in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collections=calibCollections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for rerunRoot, rerunCollections in reruns.items():
            if not os.path.isabs(rerunRoot):
                rerunRoot = os.path.join(rootConverter.root, rerunRoot)
            converter = StandardRepoConverter(task=self, root=rerunRoot, collections=rerunCollections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Actual database writes start here. We can't wrap these sanely in
        # transactions (yet) because we keep initializing new Butler instances
        # just so we can write into new runs/collections, and transactions
        # are managed at the Butler level (DM-21246 should let us fix this).

        # Insert dimensions needed by any converters. These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subset object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()