lsst.obs.base  19.0.0-33-g58bbfa5
convertRepo.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 __all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig", "Rerun"]
24 
25 import os
26 import fnmatch
27 from dataclasses import dataclass
28 from typing import Iterable, Optional, List, Dict
29 
30 from lsst.utils import doImport
31 from lsst.daf.butler import (
32  Butler as Butler3,
33  CollectionType,
34  SkyPixDimension
35 )
36 from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
37 from lsst.pipe.base import Task
38 from lsst.skymap import skyMapRegistry, BaseSkyMap
39 
40 from ..ingest import RawIngestTask
41 from .repoConverter import ConversionSubset
42 from .rootRepoConverter import RootRepoConverter
43 from .calibRepoConverter import CalibRepoConverter
44 from .standardRepoConverter import StandardRepoConverter
45 
46 
@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself (`lsst.skymap.BaseSkyMap`).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """
69 
70 
@dataclass
class Rerun:
    """Description of one Gen2 processing-output ("rerun") repository that
    should be converted.
    """

    path: str
    """Location of the Gen2 repository; may be absolute or relative to the
    root repository (`str`).
    """

    runName: str
    """`~lsst.daf.butler.CollectionType.RUN` collection that receives the
    converted datasets (`str`).
    """

    chainName: Optional[str]
    """`~lsst.daf.butler.CollectionType.CHAINED` collection that combines this
    repository's datasets with those of its parent repositories
    (`str`, optional).
    """

    parents: List[str]
    """Names of the collections associated with parent repositories; they are
    used to define the chained collection (`list` [ `str` ]).

    Ignored if `chainName` is `None`.  Runs used in the root repo are
    automatically included.
    """
99 
100 
class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
122 
123 
class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` and ``instrument`` properties are not independent
    settings; they read from and write to the fields of the same name on the
    ``raws`` subtask configuration.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default, which
    # contradicted ``default=False`` below.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )

    @property
    def transfer(self):
        """Transfer setting, forwarded from ``self.raws.transfer``.
        """
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Instrument setting, forwarded from ``self.raws.instrument``.
        """
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Override the default of ``transfer`` (and hence of
        ``raws.transfer``) to `None`.
        """
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
280 
281 
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config: `ConvertRepoConfig`
        Configuration for this task.  If `None`, a default-constructed
        ``ConfigClass`` instance is used (and validated).
    butler3: `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into.  The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        if config is None:
            # The signature advertises ``config=None``; without this the
            # validate() call below would fail with AttributeError on None.
            config = self.ConfigClass()
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here.
        config.validate()
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            # No raw ingest requested: skip the subtask and build the
            # instrument directly from its configured class name.
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop name so the ``config`` argument is not shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-hash and by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name of the skymap used in Gen3 data IDs.
        instance : `lsst.skymap.BaseSkyMap`
            The skymap instance; its ``getSha1()`` result is the hash key.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str):
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName: str
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  It is used for error reporting and,
            when no known skymap has the same hash, as the name under which
            the skymap is recorded.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the skymap still cannot be found by hash after the
            attempt to record it; this is a defensive check only.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            # A skymap with an unknown hash is recorded on the fly under its
            # Gen2 name rather than being rejected.
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.  Otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the
            `~lsst.daf.butler.CollectionType.RUN` collection that converted
            datasets within it should be inserted into.  Relative paths are
            interpreted relative to the root repository.  `None` (default) is
            treated as an empty mapping.
        reruns : `list` of `Rerun`
            Specifications for rerun (processing output) collections to
            convert.  Relative paths are interpreted relative to the root
            repository.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            # Allow registration to fail on the assumption that this means
            # we are reusing a butler, but leave a trace in the log instead of
            # discarding the error silently.
            try:
                self.instrument.register(self.registry)
            except Exception as err:
                self.log.info("Instrument registration failed (assuming it is already "
                              "registered): %s", err)

        # Make and prep converters for all Gen2 repos.  This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        for calibRoot, run in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, run=run,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for spec in reruns:
            runRoot = spec.path
            if not os.path.isabs(runRoot):
                runRoot = os.path.join(rootConverter.root, runRoot)
            converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Most database writes start here (see the note on transactions
        # above).

        # Insert dimensions needed by any converters.  These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            # Failures are tolerated (best effort, as before) but are now
            # reported so that a real problem is not hidden.
            try:
                converter.insertDimensionData()
            except Exception as err:
                self.log.warn("Inserting dimension data failed for %s (root=%s); continuing: %s",
                              type(converter).__name__, getattr(converter, "root", None), err)

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()

        # Add chained collections for reruns.
        for spec in reruns:
            if spec.chainName is not None:
                self.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED)
                # Order: this rerun's own run, then its declared parents,
                # then the collections of the root repository.
                chain = [spec.runName]
                chain.extend(spec.parents)
                chain.extend(rootConverter.getCollectionChain())
                self.log.info("Defining %s from chain %s.", spec.chainName, chain)
                self.registry.setCollectionChain(spec.chainName, chain)
569  self.butler3.registry.setCollectionChain(spec.chainName, chain)