from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import itertools
import os.path
import re
from typing import (
    TYPE_CHECKING,
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
)

from lsst.utils import doImport
from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter
57 """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
58 lists of related data ID values that should be included in the conversion.
63 Instrument name used in Gen3 data IDs.
64 visits : `set` of `int`
65 Visit IDs that define the filter.
68 def __init__(self, instrument: str, visits: Set[int]):
76 """Populate the included tract IDs for the given skymap from those that
77 overlap the visits the `ConversionSubset` was initialized with.
81 registry : `lsst.daf.butler.Registry`
82 Registry that can be queried for visit/tract overlaps.
84 SkyMap name used in Gen3 data IDs.
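
        Examples
        --------
        A minimal sketch of the intended call pattern; the registry, visit
        IDs, and skymap name here are hypothetical::

            subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
            subset.addSkyMap(registry, "hsc_rings_v1")
            subset.tracts["hsc_rings_v1"]  # tracts overlapping those visits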
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDataIds(["tract"],
                                                dataId={"skymap": name,
                                                        "instrument": self.instrument,
                                                        "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension used in Gen3 data IDs.
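
        Examples
        --------
        A minimal sketch; the registry is hypothetical, and the dimension is
        whatever skypix dimension the Gen3 data IDs use::

            subset = ConversionSubset(instrument="HSC", visits={903334})
            dimension = registry.dimensions["htm7"]  # a `SkyPixDimension`
            subset.addSkyPix(registry, dimension)
            subset.skypix[dimension]  # `lsst.sphgeom.RangeSet` of pixel IDs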
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
117 """Test whether the given data ID is related to this subset and hence
118 should be included in a repository conversion.
122 dataId : `lsst.daf.butler.DataCoordinate`
128 `True` if this data ID should be included in a repository
133 More formally, this tests that the given data ID is not unrelated;
134 if a data ID does not involve tracts, visits, or skypix dimensions,
135 we always include it.
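
        Examples
        --------
        A sketch of the expected behavior; the data IDs here are
        hypothetical::

            subset = ConversionSubset(instrument="HSC", visits={903334})
            subset.isRelated(visitDataId)  # False unless its visit is 903334
            subset.isRelated(calibDataId)  # True: no visit/tract/skypix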
140 if "visit" in dataId.graph
and dataId[
"visit"]
not in self.
visits:
142 if "tract" in dataId.graph
and dataId[
"tract"]
not in self.
tracts[dataId[
"skymap"]]:
144 for dimension, ranges
in self.
skypix.items():
145 if dimension
in dataId.graph
and not ranges.intersects(dataId[dimension]):
153 """The name of the instrument, as used in Gen3 data IDs (`str`).
157 """The set of visit IDs that should be included in the conversion (`set`
161 regions: Optional[List[Region]]
162 """Regions for all visits (`list` of `lsst.sphgeom.Region`).
164 Set to `None` before it has been initialized. Any code that attempts to
165 use it when it is `None` has a logic bug.
168 tracts: Dict[str, Set[int]]
169 """Tracts that should be included in the conversion, grouped by skymap
170 name (`dict` mapping `str` to `set` of `int`).
173 skypix: Dict[SkyPixDimension, RangeSet]
174 """SkyPix ranges that should be included in the conversion, grouped by
175 dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
180 """An abstract base class for objects that help `ConvertRepoTask` convert
181 datasets from a single Gen2 repository.
185 task : `ConvertRepoTask`
186 Task instance that is using this helper object.
188 Root of the Gen2 repo being converted. Will be converted to an
189 absolute path, resolving symbolic links and ``~``, if necessary.
190 collections : `list` of `str`
191 Gen3 collections with which all converted datasets should be
193 subset : `ConversionSubset, optional
194 Helper object that implements a filter that restricts the data IDs that
199 `RepoConverter` defines the only public API users of its subclasses should
200 use (`prep`, `insertDimensionRecords`, and `ingest`). These delegate to
201 several abstract methods that subclasses must implement. In some cases,
202 subclasses may reimplement the public methods as well, but are expected to
203 delegate to ``super()`` either at the beginning or end of their own
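
    Examples
    --------
    A sketch of the intended calling sequence, assuming a hypothetical
    concrete subclass named ``MyRepoConverter``::

        converter = MyRepoConverter(task=task, root="/path/to/gen2",
                                    run="runs/gen2")
        converter.prep()
        converter.findDatasets()
        converter.insertDimensionData()
        converter.expandDataIds()
        converter.ingest()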
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = os.path.realpath(os.path.expanduser(root))
        self.subset = subset
        self._run = run
        self._repoWalker = None  # Created in prep().
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)
218 """Test whether the given dataset is handled specially by this
219 converter and hence should be ignored by generic base-class logic that
220 searches for dataset types to convert.
224 datasetTypeName : `str`
225 Name of the dataset type to test.
230 `True` if the dataset type is special.
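
        Examples
        --------
        A sketch of a typical override; the dataset type names here are
        hypothetical::

            def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
                # Datasets this converter ingests via iterDatasets().
                return datasetTypeName in ("raw", "defects")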
        """
        raise NotImplementedError()
236 """Iterate over all `CameraMapper` `Mapping` objects that should be
237 considered for conversion by this repository.
239 This this should include any datasets that may appear in the
240 repository, including those that are special (see
241 `isDatasetTypeSpecial`) and those that are being ignored (see
242 `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
243 to identify and hence skip these datasets quietly instead of warning
244 about them as unrecognized.
248 datasetTypeName: `str`
249 Name of the dataset type.
250 mapping : `lsst.obs.base.mapping.Mapping`
251 Mapping object used by the Gen2 `CameraMapper` to describe the
254 raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str,
                             keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
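
        Examples
        --------
        `prep` calls this method for each discovered filename template,
        roughly as::

            walkerInput = self.makeRepoWalkerTarget(
                datasetTypeName=datasetTypeName,
                template=template + extension,
                keys=keys,
                storageClass=storageClass,
                formatter=self.task.config.formatterClasses.get(datasetTypeName),
                targetHandler=targetHandler,
            )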
        """
        raise NotImplementedError()
289 """Return a list of directory paths that should not be searched for
292 These may be directories that simply do not contain datasets (or
293 contain datasets in another repository), or directories whose datasets
294 are handled specially by a subclass.
298 directories : `list` [`str`]
299 The full paths of directories to skip, relative to the repository
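
        Examples
        --------
        A sketch of an override for a repository containing a nested
        calibration repository; the path is hypothetical::

            def getSpecialDirectories(self) -> List[str]:
                return [os.path.join(self.root, "CALIB")]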
305 """Perform preparatory work associated with the dataset types to be
306 converted from this repository (but not the datasets themselves).
310 This should be a relatively fast operation that should not depend on
311 the size of the repository.
313 Subclasses may override this method, but must delegate to the base
314 class implementation at some point in their own logic.
315 More often, subclasses will specialize the behavior of `prep` by
316 overriding other methods to which the base class implementation
317 delegates. These include:
319 - `isDatasetTypeSpecial`
320 - `getSpecialDirectories`
321 - `makeRepoWalkerTarget`
323 This should not perform any write operations to the Gen3 repository.
324 It is guaranteed to be called before `insertDimensionData`.
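
        Examples
        --------
        A sketch of a subclass specializing `prep` while delegating to the
        base class, per the contract above::

            def prep(self):
                # Subclass-specific setup would go here.
                super().prep()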
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            template = mapping.template
            keys = mapping.keys()
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # Skip this dataset type, but still recognize its files so we
                # do not warn about them as unrecognized.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # Only a problem if we actually encounter files of this
                    # dataset type.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk even though the Gen2
            # template ends with just ".fits".
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template + extension,
                        keys=keys,
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template + extension,
                        keys=keys,
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s + %s, for %s", template,
                                        extension, walkerInput.datasetType)
                walkerInputs.append(walkerInput)
        # Translate shell-style ignore patterns to regular expressions;
        # e.g. "*.log" becomes a regex matching any name ending in ".log".
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx,
                                      log=self.task.log.getChild("repoWalker"))
401 """Iterate over datasets in the repository that should be ingested into
404 The base class implementation yields nothing; the datasets handled by
405 the `RepoConverter` base class itself are read directly in
408 Subclasses should override this method if they support additional
409 datasets that are handled some other way.
413 dataset : `FileDataset`
414 Structures representing datasets to be ingested. Paths should be
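
        Examples
        --------
        A sketch of an override; the path and dataset ref here are
        hypothetical::

            def iterDatasets(self) -> Iterator[FileDataset]:
                yield FileDataset(path="/abs/path/to/file.fits", refs=[ref])
                yield from super().iterDatasets()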
        """
        yield from ()

    def findDatasets(self):
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )
434 """Insert any dimension records uniquely derived from this repository
437 Subclasses may override this method, but may not need to; the default
438 implementation does nothing.
440 SkyMap and SkyPix dimensions should instead be handled by calling
441 `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
442 these dimensions are in general shared by multiple Gen2 repositories.
444 This method is guaranteed to be called between `prep` and
450 """Expand the data IDs for all datasets to be inserted.
452 Subclasses may override this method, but must delegate to the base
453 class implementation if they do.
455 This involves queries to the registry, but not writes. It is
456 guaranteed to be called between `insertDimensionData` and `ingest`.
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Mark the ref as skipped; it is filtered out below.
                        dataset.refs[i] = None
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)
            datasetsForType[:] = expanded
479 """Insert converted datasets into the Gen3 repository.
481 Subclasses may override this method, but must delegate to the base
482 class implementation at some point in their own logic.
484 This method is guaranteed to be called after `expandDataIds`.
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            run = self.getRun(datasetType.name)
            if run is None:
                self.task.log.warn(f"No run configured for dataset type {datasetType.name}.")
                continue
            self.task.log.info("Ingesting %s %s datasets into run %s.", len(datasetsForType),
                               datasetType.name, run)
            try:
                self.task.registry.registerRun(run)
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer,
                                         run=run)
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type "
                                  f"{datasetType.name}.") from err

    def getRun(self, datasetTypeName: str) -> str:
        """Return the name of the run to insert instances of the given dataset
        type into in this collection.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
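
        Examples
        --------
        A sketch of a subclass override that routes one dataset type to a
        different run; the names and the special case are hypothetical::

            def getRun(self, datasetTypeName: str) -> str:
                if datasetTypeName == "raw":
                    return "HSC/raw"
                return super().getRun(datasetTypeName)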
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        return self._run

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
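
        Notes
        -----
        The lookup tries, in order: the dataset type name, the fully
        qualified Gen2 python type, the Gen2 ``persistable`` name, and the
        unqualified python type name, first against the task configuration
        and then against the Gen3 butler's storage class registry.  A
        hypothetical config entry would therefore short-circuit the rest::

            config.storageClasses["calexp"] = "ExposureF"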
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
        if storageClass is None and mapping.python is not None:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
            except KeyError:
                pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    # Class attributes that are shadowed by the instance attributes set in
    # __init__; declared here for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """