lsst.obs.base  19.0.0-21-gaaa92db
repoConverter.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension

@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name,
                                                           "instrument": self.instrument,
                                                           "visit": visit}):
                tracts.add(dataId["tract"])

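    # Usage sketch: populating a subset from a skymap, assuming a Gen3
    # `Registry` named ``registry`` and a skymap named "hsc_rings_v1"
    # (hypothetical values):
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
    #     subset.addSkyMap(registry, "hsc_rings_v1")
    #     # subset.tracts["hsc_rings_v1"] now holds the IDs of all tracts
    #     # that overlap either visit.
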
    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension for which pixel ranges should be computed.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges

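    # Usage sketch: computing skypix ranges, assuming the registry's dimension
    # universe defines an "htm7" skypix dimension (hypothetical choice):
    #
    #     dimension = registry.dimensions["htm7"]
    #     subset.addSkyPix(registry, dimension)
    #     # subset.skypix[dimension] is now a `RangeSet` covering every htm7
    #     # pixel that overlaps any visit region.
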
    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True

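    # Usage sketch: `isRelated` is the predicate `RepoConverter.findDatasets`
    # passes to `RepoWalker.walk`, but it can also be called directly on any
    # expanded data ID (hypothetical values):
    #
    #     dataId = registry.expandDataId(instrument="HSC", visit=903334)
    #     if subset.isRelated(dataId):
    #         pass  # include the corresponding dataset in the conversion
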
    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized. Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """

class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.
    collections : `list` of `str`
        Gen3 collections with which all converted datasets should be
        associated.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `insertDimensionData`, and `ingest`). These delegate to
    several abstract methods that subclasses must implement. In some cases,
    subclasses may reimplement the public methods as well, but are expected to
    delegate to ``super()`` either at the beginning or end of their own
    implementation.
    """

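    # Usage sketch: one call ordering consistent with the guarantees
    # documented on the methods below (``converter`` is a hypothetical
    # concrete subclass instance):
    #
    #     converter.prep()
    #     converter.insertDimensionData()
    #     converter.findDatasets()
    #     converter.expandDataIds()
    #     converter.ingest()
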
    def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = root
        self.subset = subset
        self._collections = list(collections)
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset type is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

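    # Override sketch: a subclass that converts raw frames and defects through
    # its own specialized code might mark them as special so the generic
    # walker logic skips them (hypothetical dataset type names):
    #
    #     def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
    #         return datasetTypeName in ("raw", "defects")
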
    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion from this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

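    # Override sketch: a subclass wrapping a Gen2 `CameraMapper` might simply
    # yield that mapper's dataset mappings; ``self.mapper`` and its
    # ``mappings`` dict are assumptions about the subclass, not part of this
    # base class:
    #
    #     def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
    #         yield from self.mapper.mappings.items()
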
    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The full paths of directories to skip, relative to the repository
            root.
        """
        return []

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates. These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset. Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            if skip:
                walkerInput = RepoWalker.Skip(
                    template=template,
                    keys=mapping.keys(),
                    message=message,
                )
            else:
                assert message is None
                walkerInput = self.makeRepoWalkerTarget(
                    datasetTypeName=datasetTypeName,
                    template=template,
                    keys=mapping.keys(),
                    storageClass=storageClass,
                )
            walkerInputs.append(walkerInput)
        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)

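    # Note on the ignore patterns above: `fnmatch.translate` turns each
    # shell-style glob into a regular expression, and the terms are then OR'd
    # into a single compiled regex. For example:
    #
    #     fnmatch.translate("*.log")  # -> something like '(?s:.*\\.log)\\Z'
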
    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested into
        the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested. Paths should be
            absolute.
        """
        yield from ()

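    # Override sketch: a subclass with datasets that cannot be found by
    # filename-template matching could yield them here directly (the path and
    # ``someRef`` below are hypothetical):
    #
    #     def iterDatasets(self) -> Iterator[FileDataset]:
    #         yield FileDataset(path="/absolute/path/to/file.fits",
    #                           refs=[someRef])
    #         yield from super().iterDatasets()
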
    def findDatasets(self):
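        """Fill the internal mapping from dataset type to `FileDataset` by
        scanning the Gen2 repository.

        This combines the datasets yielded by `iterDatasets` with those found
        on disk by the `RepoWalker` constructed in `prep`, filtering with
        ``self.subset.isRelated`` when a subset was provided.
        """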
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                log=self.task.log,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def handleDataIdExpansionFailure(self, dataset: FileDataset, err: LookupError):
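        """Handle a failure to expand the data ID of a dataset.

        The base class implementation logs a warning and returns `False`,
        which causes `expandDataIds` to drop the dataset. Subclasses may
        override this method to return `True` for failures that should not
        prevent the dataset from being ingested anyway.
        """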
        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
        return False

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do. If they wish to handle expected
        failures in data ID expansion, they should override
        `handleDataIdExpansionFailure` instead.

        This involves queries to the registry, but not writes. It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                        expanded.append(dataset)
                    except LookupError as err:
                        if self.handleDataIdExpansionFailure(dataset, err):
                            expanded.append(dataset)
            datasetsForType[:] = expanded

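    # Override sketch: a subclass that expects certain expansion failures
    # could tolerate them instead of dropping the dataset (the error-message
    # test below is a hypothetical condition):
    #
    #     def handleDataIdExpansionFailure(self, dataset: FileDataset,
    #                                      err: LookupError) -> bool:
    #         if "htm7" in str(err):
    #             return True  # keep the dataset despite the failure
    #         return super().handleDataIdExpansionFailure(dataset, err)
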
    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            self.task.log.info("Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
            try:
                collections = self.getCollections(datasetType.name)
            except LookupError as err:
                self.task.log.warn(str(err))
                continue
            try:
                self.task.registry.registerRun(collections[0])
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer,
                                         run=collections[0])
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err
            for collection in collections[1:]:
                self.task.registry.associate(collection,
                                             [ref for dataset in datasetsForType for ref in dataset.refs])

    def getCollections(self, datasetTypeName: str) -> List[str]:
        """Return the set of collections a particular dataset type should be
        associated with.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        collections : `list` of `str`
            Collections the dataset should be associated with. The first
            item in the list is the run the dataset should be added to
            initially.
        """
        if datasetTypeName in self.task.config.collections:
            return [self.task.config.collections[datasetTypeName]] + self._collections
        elif self._collections:
            return self._collections
        else:
            raise LookupError(f"No collection configured for dataset type {datasetTypeName}.")

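    # Resolution sketch: with a (hypothetical) configuration of
    # ``config.collections = {"deepCoadd": "coadds"}`` and converter
    # collections ``["shared/hsc"]``:
    #
    #     self.getCollections("deepCoadd")  # -> ["coadds", "shared/hsc"]
    #     self.getCollections("calexp")     # -> ["shared/hsc"]
    #
    # In each case the first element is the run the datasets are initially
    # added to.
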
    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The inferred storage class, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

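    # Lookup-order sketch: for a hypothetical mapping with
    # ``python="lsst.afw.image.ExposureF"`` and ``persistable="ExposureF"``,
    # the task configuration is consulted for the dataset type name, then
    # "lsst.afw.image.ExposureF", then "ExposureF", then the unqualified
    # class name "ExposureF", before falling back to the butler's own storage
    # class registry:
    #
    #     storageClass = self._guessStorageClass("calexp", mapping)
    #     # -> a `StorageClass` instance, or None if nothing matched
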
    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """