# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

import os
import fnmatch
from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Generic, TypeVar, List, Tuple, Optional, Iterator, Set, Any, Callable, Dict

from lsst.daf.butler import DatasetRef, Butler as Butler3, DataCoordinate
from lsst.sphgeom import RangeSet, Region

from .filePathParser import FilePathParser

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .dataIdExtractor import DataIdExtractor
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension


REPO_ROOT_FILES = ("registry.sqlite3", "_mapper", "repositoryCfg.yaml", "calibRegistry.sqlite3", "_parent")


T = TypeVar("T")

class MostRecentlyUsedStack(Generic[T]):
    """A simple container that maintains a most-recently-used ordering.
    """

    def __init__(self):
        self._elements = []

    def __iter__(self):
        # Iterate in reverse order so we can keep the most recently used
        # element at the end of the list.  We want to use the end rather than
        # the beginning because appending to lists is much more efficient than
        # inserting at the beginning.
        yield from reversed(self._elements)

    def apply(self, func: Callable[[T], Any]) -> Any:
        """Apply a function to elements until it returns a value that coerces
        to `True`, and move the corresponding element to the front of the
        stack.

        Parameters
        ----------
        func : callable
            Callable object.

        Returns
        -------
        value : `object`
            The first value returned by ``func`` that coerces to `True`, or
            `None` if no element produced one.
        """
        for n, element in enumerate(self):
            result = func(element)
            if result:
                break
        else:
            return None
        # Move the element that matched to the back of the internal list
        # (note that n indexes from the back of the internal list).
        if n != 0:
            # i indexes from the front of the internal list.
            i = len(self._elements) - 1 - n
            assert self._elements[i] is element
            del self._elements[i]
            self._elements.append(element)
        return result

    def push(self, element):
        """Add a new element to the front of the stack.
        """
        self._elements.append(element)


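# Illustrative usage of MostRecentlyUsedStack (an editor's sketch, not part
# of the original source): ``apply`` both searches and reorders, so elements
# that match often are tried first on later calls.
#
#     stack = MostRecentlyUsedStack()
#     stack.push("a")
#     stack.push("b")                    # iteration order is now "b", "a"
#     stack.apply(lambda e: e == "a")    # returns True and moves "a" forward
#     list(stack)                        # ["a", "b"]

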
@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name, "visit": visit}):
                tracts.add(dataId["tract"])
        self.task.log.info("Limiting datasets defined on skymap %s to %s tracts.", name, len(tracts))

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization is used to compute the
            included ID ranges.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.join(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized.  Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


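# Illustrative usage of ConversionSubset (an editor's sketch, not part of the
# original source; the instrument, visit, and skymap names are hypothetical):
#
#     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
#     subset.addSkyMap(registry, "hsc_rings_v1")  # records overlapping tracts
#     subset.isRelated(dataId)  # True only for related visits/tracts/skypix

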
class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repository being converted.
    collections : `list` of `str`
        Gen3 collections with which all datasets converted from this
        repository should be associated.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `insertDimensionData`, and `ingest`).  These delegate to
    several abstract methods that subclasses must implement.  In some cases,
    subclasses may reimplement the public methods as well, but are expected to
    delegate to ``super()`` either at the beginning or end of their own
    implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = root
        self.subset = subset
        self._collections = list(collections)
        self._extractors: MostRecentlyUsedStack[DataIdExtractor] = MostRecentlyUsedStack()
        self._skipParsers: MostRecentlyUsedStack[Tuple[FilePathParser, str, str]] = MostRecentlyUsedStack()

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def isDirectorySpecial(self, subdirectory: str) -> bool:
        """Test whether the given directory is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for datasets to convert.

        Parameters
        ----------
        subdirectory : `str`
            Subdirectory.  This is only ever a single subdirectory, and it
            could appear anywhere within a repo root.  (A full path relative
            to the repo root might be more useful, but it is harder to
            implement, and we don't currently need it to identify any special
            directories.)

        Returns
        -------
        special : `bool`
            `True` if the directory is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeDataIdExtractor(self, datasetTypeName: str, parser: FilePathParser,
                            storageClass: StorageClass) -> DataIdExtractor:
        """Construct a `DataIdExtractor` instance appropriate for a particular
        dataset type.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type; typically forwarded directly to
            the `DataIdExtractor` constructor.
        parser : `FilePathParser`
            Object that parses filenames into Gen2 data IDs; typically
            forwarded directly to the `DataIdExtractor` constructor.
        storageClass : `lsst.daf.butler.StorageClass`
            Storage class for this dataset type in the Gen3 butler; typically
            forwarded directly to the `DataIdExtractor` constructor.

        Returns
        -------
        extractor : `DataIdExtractor`
            A new `DataIdExtractor` instance.
        """
        raise NotImplementedError()

    def iterDatasets(self) -> Iterator[Tuple[str, DatasetRef]]:
        """Iterate over all datasets in the repository that should be
        ingested into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        Yields
        ------
        fileNameInRoot : `str`
            Name of the file to be ingested, relative to the repository root.
        ref : `lsst.daf.butler.DatasetRef`
            Reference for the Gen3 dataset, including a complete `DatasetType`
            and data ID.
        """
        for dirPath, subdirNamesInDir, fileNamesInDir in os.walk(self.root, followlinks=True):
            # Remove subdirectories that appear to be repositories themselves
            # from the walking.
            def isRepoRoot(dirName):
                return any(os.path.exists(os.path.join(dirPath, dirName, f))
                           for f in REPO_ROOT_FILES)
            subdirNamesInDir[:] = [d for d in subdirNamesInDir
                                   if not isRepoRoot(d) and not self.isDirectorySpecial(d)]
            # Loop over files in this directory, and ask per-DatasetType
            # extractors if they recognize them and can extract a data ID;
            # if so, ingest.
            dirPathInRoot = dirPath[len(self.root) + len(os.path.sep):]
            for fileNameInDir in fileNamesInDir:
                if any(fnmatch.fnmatchcase(fileNameInDir, pattern)
                       for pattern in self.task.config.fileIgnorePatterns):
                    continue
                fileNameInRoot = os.path.join(dirPathInRoot, fileNameInDir)
                if fileNameInRoot in REPO_ROOT_FILES:
                    continue
                ref = self._extractDatasetRef(fileNameInRoot)
                if ref is not None:
                    if self.subset is None or self.subset.isRelated(ref.dataId):
                        yield fileNameInRoot, ref
                else:
                    self._handleUnrecognizedFile(fileNameInRoot)

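    # Note on the slice assignment in iterDatasets above (an editor's sketch,
    # not part of the original source): os.walk only honors in-place
    # modification of its directory list, so assigning to
    # ``subdirNamesInDir[:]`` prunes the walk, while rebinding the name would
    # not.  For example:
    #
    #     for dirPath, subdirs, files in os.walk(root):
    #         subdirs[:] = [d for d in subdirs if d != "rerun"]  # pruned
    #         # subdirs = [...] would leave the walk unchanged
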
    def prep(self):
        """Prepare the repository by identifying the dataset types to be
        converted and building a `DataIdExtractor` instance for each of them.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.  More often,
        subclasses will specialize the behavior of `prep` simply by overriding
        `iterMappings`, `isDatasetTypeSpecial`, and `makeDataIdExtractor`, to
        which the base implementation delegates.

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData` and
        `ingest`.
        """
        self.task.log.info(f"Preparing other datasets from root {self.root}.")
        for datasetTypeName, mapping in self.iterMappings():
            try:
                parser = FilePathParser.fromMapping(mapping)
            except RuntimeError:
                # No template, so there should be no way we'd get one of these
                # in the Gen2 repo anyway (and if we do, we'll still produce a
                # warning - just a less informative one than we might be able
                # to produce if we had a template).
                continue
            if (not self.task.isDatasetTypeIncluded(datasetTypeName) or
                    self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this dataset type, but we
                # still want to recognize files of that type to avoid warning
                # about them.
                self._skipParsers.push((parser, datasetTypeName, None))
                continue
            storageClass = self._guessStorageClass(datasetTypeName, mapping)
            if storageClass is None:
                # This may be a problem, but only if we actually encounter any
                # files corresponding to this dataset.  Of course, we need
                # to be able to parse those files in order to recognize that
                # situation.
                self._skipParsers.push((parser, datasetTypeName, "no storage class found."))
                continue
            self._extractors.push(self.makeDataIdExtractor(datasetTypeName, parser, storageClass))

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and `ingest`.
        """
        pass

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.  More often,
        subclasses will specialize the behavior of `ingest` simply by
        overriding `iterDatasets` and `isDirectorySpecial`, to which the base
        implementation delegates.

        This method is guaranteed to be called after both `prep` and
        `insertDimensionData`.
        """
        self.task.log.info("Finding datasets in repo %s.", self.root)
        datasets = defaultdict(list)
        for fileNameInRoot, ref in self.iterDatasets():
            datasets[ref.datasetType].append((fileNameInRoot, ref))
        for datasetType, toIngest in datasets.items():
            self.task.registry.registerDatasetType(datasetType)
            self.task.log.info("Ingesting %s %s datasets.", len(toIngest), datasetType.name)
            try:
                butler3, collections = self.getButler(datasetType.name)
            except LookupError as err:
                self.task.log.warn(str(err))
                continue
            try:
                refs = [butler3.ingest(os.path.join(self.root, fileNameInRoot), ref,
                                       transfer=self.task.config.transfer)
                        for fileNameInRoot, ref in toIngest]
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err
            for collection in collections:
                self.task.registry.associate(collection, refs)

    def getButler(self, datasetTypeName: str) -> Tuple[Butler3, List[str]]:
        """Create a new Gen3 Butler appropriate for a particular dataset type.

        This should be used exclusively by subclasses when obtaining a butler
        to use for dataset ingest (`ConvertRepoTask.butler3` should never be
        used directly).

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        butler : `lsst.daf.butler.Butler`
            Gen3 Butler instance appropriate for ingesting the given dataset
            type.
        collections : `list` of `str`
            Collections the dataset should be associated with, in addition to
            the one used to define the `lsst.daf.butler.Run` used in
            ``butler``.
        """
        if datasetTypeName in self.task.config.collections:
            return (
                Butler3(butler=self.task.butler3, run=self.task.config.collections[datasetTypeName]),
                self._collections,
            )
        elif self._collections:
            return (
                Butler3(butler=self.task.butler3, run=self._collections[0]),
                self._collections[1:],
            )
        else:
            raise LookupError(f"No collection configured for dataset type {datasetTypeName}.")

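    # Illustrative behavior of getButler (an editor's sketch, not part of the
    # original source; the collection names below are hypothetical).  With
    # config.collections == {"raw": "raws"} and _collections == ["a", "b"]:
    #
    #     getButler("raw")   # -> (Butler with run="raws", ["a", "b"])
    #     getButler("flat")  # -> (Butler with run="a", ["b"])
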
    def _extractDatasetRef(self, fileNameInRoot: str) -> Optional[DatasetRef]:
        """Extract a `DatasetRef` from a file name.

        This method is for internal use by `RepoConverter` itself (not its
        subclasses).

        Parameters
        ----------
        fileNameInRoot : `str`
            Name of the file to be ingested, relative to the repository root.

        Returns
        -------
        ref : `lsst.daf.butler.DatasetRef` or `None`
            Reference for the Gen3 dataset, including a complete `DatasetType`
            and data ID.  `None` if the converter does not recognize the
            file as one to be converted.
        """
        def closure(extractor):
            try:
                dataId = extractor.apply(fileNameInRoot)
            except LookupError as err:
                raise RuntimeError(f"Error extracting data ID for {extractor.datasetType.name} "
                                   f"on file {fileNameInRoot}.") from err
            if dataId is None:
                return None
            else:
                return DatasetRef(extractor.datasetType, dataId=dataId)
        return self._extractors.apply(closure)

    def _handleUnrecognizedFile(self, fileNameInRoot: str):
        """Generate appropriate warnings (or not) for files not matched by
        `_extractDatasetRef`.

        This method is for internal use by `RepoConverter` itself (not its
        subclasses).

        Parameters
        ----------
        fileNameInRoot : `str`
            Name of the file, relative to the repository root.
        """
        def closure(skipTuple):
            parser, datasetTypeName, message = skipTuple
            if parser(fileNameInRoot) is not None:
                if message is not None:
                    self.task.log.warn("Skipping dataset %s file %s: %s", datasetTypeName,
                                       fileNameInRoot, message)
                return True
            return False
        if not self._skipParsers.apply(closure):
            self.task.log.warn("Skipping unrecognized file %s.", fileNameInRoot)

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            Inferred storage class, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            # Fall back to the unqualified Python type name (e.g. "ExposureF"
            # from "lsst.afw.image.ExposureF").
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

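    # Lookup order implemented by _guessStorageClass above (an editor's
    # summary; the names are hypothetical).  For datasetTypeName="calexp",
    # mapping.python="lsst.afw.image.ExposureF", and
    # mapping.persistable="ExposureF", the candidates tried in order are:
    #
    #     config.storageClasses["calexp"]                       # dataset type
    #     config.storageClasses["lsst.afw.image.ExposureF"]     # full python
    #     config.storageClasses["ExposureF"]                    # persistable
    #     config.storageClasses["ExposureF"]                    # unqualified
    #     butler3.storageClasses.getStorageClass("ExposureF")   # persistable
    #     butler3.storageClasses.getStorageClass("ExposureF")   # unqualified
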
    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """
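

# Typical lifecycle of a RepoConverter subclass, as implied by the docstrings
# above (an editor's sketch; ``MyConverter`` and its arguments are
# hypothetical):
#
#     converter = MyConverter(task=task, root="/path/to/gen2",
#                             collections=["shared"])
#     converter.prep()                  # read-only: build DataIdExtractors
#     converter.insertDimensionData()   # write dimension records, if any
#     converter.ingest()                # register dataset types, ingest files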