lsst.obs.base  19.0.0-26-g830ab5e+1
convertRepo.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"]

import os
import fnmatch
from dataclasses import dataclass
from typing import Iterable, Optional, List, Dict

from lsst.utils import doImport
from lsst.daf.butler import (
    Butler as Butler3,
    SkyPixDimension
)
from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
from lsst.pipe.base import Task
from lsst.skymap import skyMapRegistry, BaseSkyMap

from ..ingest import RawIngestTask
from .repoConverter import ConversionSubset
from .rootRepoConverter import RootRepoConverter
from .calibRepoConverter import CalibRepoConverter
from .standardRepoConverter import StandardRepoConverter


@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The `BaseSkyMap` instance itself.
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """


class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
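
# The following sketch is illustrative and not part of the original module: it
# shows how an obs-package config file might fill in one entry of the
# ``skyMaps`` dict defined in `ConvertRepoConfig` below, assuming standard
# `lsst.pex.config.RegistryField` selection semantics and that a skymap type
# named "rings" is registered in ``skyMapRegistry``. The entry name
# "hsc_rings_v1" and the ``numRings`` value are hypothetical.
#
#     config.skyMaps["hsc_rings_v1"] = ConvertRepoSkyMapConfig()
#     config.skyMaps["hsc_rings_v1"].skyMap.name = "rings"
#     config.skyMaps["hsc_rings_v1"].skyMap["rings"].numRings = 120
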
class ConvertRepoConfig(Config):
    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there.",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the "
        "Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted.",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    relatedOnly = Field(
        "If True, only convert datasets that are related to the ingested "
        "visits. Ignored unless a list of visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )

    @property
    def transfer(self):
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
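
# Illustrative sketch, not from the original source: ``transfer`` and
# ``instrument`` above are pass-through properties onto the ``raws`` subtask
# config, and the dataset include/ignore patterns are ordinary ``fnmatch``
# globs. The instrument class path, reference catalog name, and dataset-type
# pattern below are hypothetical.
#
#     config = ConvertRepoConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"  # forwards to config.raws.instrument
#     config.transfer = "symlink"                               # forwards to config.raws.transfer
#     config.refCats.append("gaia_dr2_20200414")
#     config.datasetIgnorePatterns.append("*_camera")
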
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into. The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository. Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        for name, config in self.config.skyMaps.items():
            instance = config.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()
    def _populateSkyMapDicts(self, name, instance):
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str):
        """Return `True` if configuration indicates that the given dataset
        type should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )
    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the Gen2 skymap, for error reporting.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the specified skymap cannot be found.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name
    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the tracts of this skymap that
            overlap the visits in the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)
    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)
    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Register all sky pixelization dimensions that have been marked as
        used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given, it will be updated with the pixelization IDs that
            overlap the visits in the subset.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)
    def run(self, root: str, collections: List[str], *,
            calibs: Dict[str, List[str]] = None,
            reruns: Dict[str, List[str]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository. This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with. This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural differences between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with. The path
            may be relative to ``root`` or absolute. Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``).
        reruns : `dict`
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with. The path may
            be relative to ``root`` or absolute. Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``).
        visits : iterable of `int`, optional
            The integer IDs of visits to convert. If not provided, all visits
            in the Gen2 root repository will be converted.
        """

        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Make and prep converters for all Gen2 repos. This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        for root, collections in calibs.items():
            if not os.path.isabs(root):
                root = os.path.join(rootConverter.root, root)
            converter = CalibRepoConverter(task=self, root=root, collections=collections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for root, collections in reruns.items():
            if not os.path.isabs(root):
                root = os.path.join(rootConverter.root, root)
            converter = StandardRepoConverter(task=self, root=root, collections=collections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Actual database writes start here. We can't wrap these sanely in
        # transactions (yet) because we keep initializing new Butler instances
        # just so we can write into new runs/collections, and transactions
        # are managed at the Butler level (DM-21246 should let us fix this).

        # Insert dimensions needed by any converters. These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()
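
# A hedged end-to-end usage sketch, not part of the original module: the
# repository paths, collection names, instrument class path, and visit IDs are
# hypothetical, and the import path for this module is assumed to be
# ``lsst.obs.base.gen2to3``.
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base.gen2to3 import ConvertRepoConfig, ConvertRepoTask
#
#     butler3 = Butler("/path/to/gen3/repo", run="shared/example")
#     config = ConvertRepoConfig()
#     config.instrument = "lsst.obs.example.ExampleInstrument"
#     task = ConvertRepoTask(config=config, butler3=butler3)
#     task.run(root="/path/to/gen2/repo",
#              collections=["shared/example"],
#              calibs={"CALIB": ["calib/example"]},
#              reruns={"rerun/coadds": ["shared/example/coadds"]},
#              visits=[903334, 903336])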