lsst.obs.base  19.0.0-16-g8258e2a+1
convertRepo.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 __all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"]
24 
25 import os
26 import fnmatch
27 from dataclasses import dataclass
28 from typing import Iterable, Optional, List, Dict
29 
30 from lsst.utils import doImport
31 from lsst.daf.butler import (
32  Butler as Butler3,
33  SkyPixDimension
34 )
35 from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
36 from lsst.pipe.base import Task
37 from lsst.skymap import skyMapRegistry, BaseSkyMap
38 
39 from ..ingest import RawIngestTask
40 from .repoConverter import ConversionSubset
41 from .rootRepoConverter import RootRepoConverter
42 from .calibRepoConverter import CalibRepoConverter
43 from .standardRepoConverter import StandardRepoConverter
44 
45 
@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself (the one whose hash is stored in ``sha1``).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """
68 
69 
class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
91 
92 
class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` and ``instrument`` attributes are not independent
    fields; they are properties that read and write the fields of the same
    name on the ``raws`` subtask configuration.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
        }
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    # NOTE(review): in glob syntax "?" matches exactly one character, so
    # "*~?" does not match plain editor backups whose names end in "~".
    # Confirm whether "*~" was intended before changing it.
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default; the actual
    # default is (and remains) False, so the text was corrected rather than
    # the behavior.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )

    @property
    def transfer(self):
        """Transfer setting, forwarded to/from ``self.raws.transfer``."""
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Instrument setting, forwarded to/from ``self.raws.instrument``."""
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Set ``transfer`` (i.e. ``self.raws.transfer``) to `None`."""
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
212 
213 
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into. The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    **kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository. Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            # The raw-ingest subtask owns the instrument instance when raws
            # are being converted.
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            # No raw ingest: build the instrument directly from its
            # configured (importable) name.
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        # Index the configured skymaps both by hash (to recognize skymaps
        # found in Gen2 repos) and by Gen3 name.
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
            self._configuredSkyMapsBySha1[struct.sha1] = struct
            self._configuredSkyMapsByName[struct.name] = struct
        self._usedSkyPix = set()

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion; true
            when the name matches at least one of
            ``config.datasetIncludePatterns`` and none of
            ``config.datasetIgnorePatterns`` (case-sensitive glob matching).
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns) and
            not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                    for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if no skymap with the same hash as ``skyMap`` was included
            in ``config.skyMaps``.
        """
        sha1 = skyMap.getSha1()
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            raise LookupError(f"SkyMap with sha1={sha1} not included in configuration.") from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]) -> None:
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given and ``config.relatedOnly`` is `True`, each used skymap
            is also added to it via `ConversionSubset.addSkyMap`.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension) -> None:
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]) -> None:
        """Add all SkyPix dimensions that have been marked as used to the
        conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given and ``config.relatedOnly`` is `True`, each used SkyPix
            dimension is added to it via `ConversionSubset.addSkyPix`;
            otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository. This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with. This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural difference between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with. The path
            may be relative to ``root`` or absolute. Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``).
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with. The path may
            be relative to ``root`` or absolute. Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``).
        visits : iterable of `int`, optional
            The integer IDs of visits to convert. If not provided, all visits
            in the Gen2 root repository will be converted.
        """

        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # We can't wrap database writes sanely in transactions (yet) because we
        # keep initializing new Butler instances just so we can write into new
        # runs/collections, and transactions are managed at the Butler level.
        # DM-21246 should let us fix this, assuming we actually want to keep
        # the transaction open that long.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Make and prep converters for all Gen2 repos. This should not modify
        # the Registry database or filesystem at all, though it may query it.
        # The prep() calls here will be some of the slowest ones, because
        # that's when we walk the filesystem.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        # Loop variables are deliberately distinct from the ``root`` and
        # ``collections`` arguments so those are not rebound mid-method.
        for calibRoot, calibCollections in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collections=calibCollections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for rerunRoot, rerunCollections in reruns.items():
            if not os.path.isabs(rerunRoot):
                rerunRoot = os.path.join(rootConverter.root, rerunRoot)
            converter = StandardRepoConverter(task=self, root=rerunRoot, collections=rerunCollections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Database writes by the converters start here (see the note on
        # transactions above).

        # Insert dimensions needed by any converters. These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) ConversionSubset object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()