lsst.obs.base  19.0.0-20-g6de566f+6
convertRepo.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 __all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"]
24 
25 import os
26 import fnmatch
27 from dataclasses import dataclass
28 from typing import Iterable, Optional, List, Dict
29 
30 from lsst.utils import doImport
31 from lsst.daf.butler import (
32  Butler as Butler3,
33  SkyPixDimension
34 )
35 from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
36 from lsst.pipe.base import Task
37 from lsst.skymap import skyMapRegistry, BaseSkyMap
38 
39 from ..ingest import RawIngestTask
40 from .repoConverter import ConversionSubset
41 from .rootRepoConverter import RootRepoConverter
42 from .calibRepoConverter import CalibRepoConverter
43 from .standardRepoConverter import StandardRepoConverter
44 
45 
@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself, as constructed from configuration; ``sha1``
    is the hash of this instance.
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """
68 
69 
class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
91 
92 
class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` and ``instrument`` properties are conveniences that
    forward to the same-named fields of the ``raws`` subtask configuration,
    so the two can never disagree.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
        }
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    # NOTE(review): the glob "*~?" requires exactly one character after the
    # tilde, so plain editor backups such as "foo~" are NOT matched by it.
    # Confirm whether "*~" was intended before changing the default.
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default, which
    # contradicted ``default=False``; the default itself is unchanged.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits. Ignored unless a list of visits is passed to "
        "run().",
        dtype=bool,
        default=False,
    )

    @property
    def transfer(self):
        """Transfer mode, forwarded to ``self.raws.transfer``."""
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Instrument setting, forwarded to ``self.raws.instrument``."""
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Reset the (forwarded) transfer mode to `None`."""
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
213 
214 
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into. The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository. Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        # The raw-ingest subtask is only constructed when the "raw" dataset
        # type is selected for conversion; otherwise the instrument is
        # imported and instantiated from its configured name.
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        # Index every configured skymap both by hash (to recognize skymaps
        # found in Gen2 repositories) and by Gen3 name.
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
            self._configuredSkyMapsBySha1[struct.sha1] = struct
            self._configuredSkyMapsByName[struct.name] = struct
        self._usedSkyPix = set()

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        # A name is included when it matches at least one include pattern and
        # no ignore pattern; matching is case-sensitive.
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if no configured skymap has the same hash as ``skyMap``.
        """
        sha1 = skyMap.getSha1()
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            raise LookupError(f"SkyMap with sha1={sha1} not included in configuration.") from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]) -> None:
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the tracts of this skymap that
            overlap the visits in the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension) -> None:
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]) -> None:
        """Add all SkyPix dimensions that have been marked as used to the
        given conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the pixelization IDs that
            overlap the visits in the subset.

        Notes
        -----
        This does nothing unless ``subset`` is provided and
        ``config.relatedOnly`` is `True`.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None) -> None:
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository. This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with. This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural difference between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with. The path
            may be relative to ``root`` or absolute. Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``). `None` (default) is treated
            as an empty dictionary.
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with. The path may
            be relative to ``root`` or absolute. Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``). `None` (default) is treated as an empty
            dictionary.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert. If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}

        subset = None
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        elif self.config.relatedOnly:
            self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                          "no filtering will be done.")

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Make and prep converters for all Gen2 repos. This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        # Distinct loop variable names are used below so the ``root`` and
        # ``collections`` arguments are not silently rebound.
        for calibRoot, calibCollections in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collections=calibCollections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for rerunRoot, rerunCollections in reruns.items():
            if not os.path.isabs(rerunRoot):
                rerunRoot = os.path.join(rootConverter.root, rerunRoot)
            converter = StandardRepoConverter(task=self, root=rerunRoot, collections=rerunCollections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # The remaining database writes start here; the transaction caveat
        # described before instrument registration applies to them as well.

        # Insert dimensions needed by any converters. These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()
483  converter.ingest()