lsst.obs.base  19.0.0-25-g78ff95b
repoConverter.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import itertools
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name,
                                                           "instrument": self.instrument,
                                                           "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension for which overlapping index ranges should be
            computed.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
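
    # Illustrative note (not from the original source): for a skypix dimension
    # such as HTM at some level, ``envelope(region)`` returns an
    # `lsst.sphgeom.RangeSet` of pixel-index ranges covering the region; the
    # union over all visit regions is stored so `isRelated` can later test
    # membership with `RangeSet.intersects`.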

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True
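
    # Hypothetical usage sketch (not part of the class): given a Gen3
    # ``registry`` from an existing butler, a caller might build a subset for
    # a few visits and filter candidate data IDs with it. The instrument,
    # visit, and skymap names below are made up.
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
    #     subset.addSkyMap(registry, "hsc_rings_v1")
    #     keep = [dataId for dataId in candidates if subset.isRelated(dataId)]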

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized. Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.
    collections : `list` of `str`
        Gen3 collections with which all converted datasets should be
        associated.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `findDatasets`, `insertDimensionData`, `expandDataIds`, and
    `ingest`). These delegate to several abstract methods that subclasses
    must implement. In some cases, subclasses may reimplement the public
    methods as well, but are expected to delegate to ``super()`` either at
    the beginning or end of their own implementation.
    """
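
    # Call-order sketch (inferred from the guarantees stated in the method
    # docstrings below; the actual orchestration lives in `ConvertRepoTask`):
    #
    #     converter.prep()
    #     converter.findDatasets()
    #     converter.insertDimensionData()
    #     converter.expandDataIds()
    #     converter.ingest()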

    def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = root
        self.subset = subset
        self._collections = list(collections)
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            Paths of the directories to skip, relative to the repository
            root.
        """
        return []
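
    # Hypothetical override sketch (directory names made up): a subclass whose
    # repo nests a calibration repo and rerun directories might skip them:
    #
    #     def getSpecialDirectories(self) -> List[str]:
    #         return super().getSpecialDirectories() + ["CALIB", "rerun"]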

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates. These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset. Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the gen2 template
            # is just `.fits`.
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template + extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template + extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                    )
                    self.task.log.debug("Adding template to walker: %s", template)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
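        # Illustrative note (not from the original source): fnmatch.translate
        # turns each shell-style glob into a regular expression; e.g. in
        # Python 3.8, fnmatch.translate("*.log") == r"(?s:.*\.log)\Z", so a
        # config list like ["*.log", "README*"] becomes one alternation regex.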
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested. Paths should be
            absolute.
        """
        yield from ()

    def findDatasets(self):
        """Walk the repository to find datasets to be converted, and populate
        the internal mapping from dataset type to `FileDataset` instances.

        `prep` must be called before this method.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                log=self.task.log,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes. It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Remove skipped datasets from multi-extension
                        # FileDatasets; we strip off the `None`s after the loop.
                        dataset.refs[i] = None
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)
            datasetsForType[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            self.task.log.info("Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
            try:
                collections = self.getCollections(datasetType.name)
            except LookupError as err:
                self.task.log.warn(str(err))
                continue
            try:
                self.task.registry.registerRun(collections[0])
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer,
                                         run=collections[0])
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err
            for collection in collections[1:]:
                self.task.registry.associate(collection,
                                             [ref for dataset in datasetsForType for ref in dataset.refs])

    def getCollections(self, datasetTypeName: str) -> List[str]:
        """Return the set of collections a particular dataset type should be
        associated with.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        collections : `list` of `str`
            Collections the dataset should be associated with. The first
            item in the list is the run the dataset should be added to
            initially.
        """
        if datasetTypeName in self.task.config.collections:
            return [self.task.config.collections[datasetTypeName]] + self._collections
        elif self._collections:
            return self._collections
        else:
            raise LookupError(f"No collection configured for dataset type {datasetTypeName}.")
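
    # Worked example (config values made up): with
    # config.collections == {"raw": "HSC/raw"} and converter collections
    # ["shared/hsc"], getCollections("raw") returns ["HSC/raw", "shared/hsc"]:
    # datasets are ingested into the run "HSC/raw" and then associated with
    # "shared/hsc". For a dataset type with no config entry, just
    # ["shared/hsc"] is returned.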

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            Storage class to use for this dataset type, or `None` if one
            could not be determined.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                # ``unqualified`` is always defined here: storageClassName is
                # still `None`, so the branch above that sets it ran whenever
                # ``mapping.python`` is not `None`.
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass
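
    # Lookup-order summary (illustrative; names made up): for a mapping with
    # python="lsst.afw.image.ExposureF" and persistable="ExposureF", the chain
    # above tries config entries for the dataset type name, then
    # "lsst.afw.image.ExposureF", then "ExposureF" (persistable), then
    # "ExposureF" (unqualified), and finally falls back to direct StorageClass
    # registry lookups for the last two names.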

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """
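

# Minimal subclass sketch (hypothetical; the mapper attribute and return
# details are assumptions) showing the three abstract methods a concrete
# converter must implement:
#
#     class MyRepoConverter(RepoConverter):
#
#         def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
#             return False  # nothing handled specially here
#
#         def iterMappings(self):
#             # ``self.mapper`` is an assumed Gen2 CameraMapper attribute.
#             yield from self.mapper.mappings.items()
#
#         def makeRepoWalkerTarget(self, datasetTypeName, template, keys,
#                                  storageClass):
#             ...  # build and return a RepoWalker.Target for this dataset type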