from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import re
from typing import TYPE_CHECKING, Dict, Iterator, List, MutableMapping, Optional, Set, Tuple, Union

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension
54 """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains 55 lists of related data ID values that should be included in the conversion. 60 Instrument name used in Gen3 data IDs. 61 visits : `set` of `int` 62 Visit IDs that define the filter. 65 def __init__(self, instrument: str, visits: Set[int]):
73 """Populate the included tract IDs for the given skymap from those that 74 overlap the visits the `ConversionSubset` was initialized with. 78 registry : `lsst.daf.butler.Registry` 79 Registry that can be queried for visit/tract overlaps. 81 SkyMap name used in Gen3 data IDs. 86 for dataId
in registry.queryDimensions([
"tract"], expand=
False,
87 dataId={
"skymap": name,
90 tracts.add(dataId[
"tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension used in Gen3 data IDs.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
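
    # For example (hypothetical dimension name): for a skypix dimension such
    # as "htm7", `dimension.pixelization.envelope(region)` returns the ranges
    # of pixel indices that may overlap a visit region; the union over all
    # visit regions is what `isRelated` later tests data IDs against.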
114 """Test whether the given data ID is related to this subset and hence 115 should be included in a repository conversion. 119 dataId : `lsst.daf.butler.DataCoordinate` 125 `True` if this data ID should be included in a repository 130 More formally, this tests that the given data ID is not unrelated; 131 if a data ID does not involve tracts, visits, or skypix dimensions, 132 we always include it. 137 if "visit" in dataId.graph
and dataId[
"visit"]
not in self.
visits:
139 if "tract" in dataId.graph
and dataId[
"tract"]
not in self.
tracts[dataId[
"skymap"]]:
141 for dimension, ranges
in self.
skypix.items():
142 if dimension
in dataId.graph
and not ranges.intersects(dataId[dimension]):
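
    # For example (hypothetical values): with visits={1, 2}, a data ID with
    # visit=3 is excluded; a data ID with visit=1 and a tract in the included
    # set is kept; and a data ID involving none of the filtered dimensions
    # (e.g. one with only instrument and detector) is always included.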
150 """The name of the instrument, as used in Gen3 data IDs (`str`). 154 """The set of visit IDs that should be included in the conversion (`set` 158 regions: Optional[List[Region]]
159 """Regions for all visits (`list` of `lsst.sphgeom.Region`). 161 Set to `None` before it has been initialized. Any code that attempts to 162 use it when it is `None` has a logic bug. 165 tracts: Dict[str, Set[int]]
166 """Tracts that should be included in the conversion, grouped by skymap 167 name (`dict` mapping `str` to `set` of `int`). 170 skypix: Dict[SkyPixDimension, RangeSet]
171 """SkyPix ranges that should be included in the conversion, grouped by 172 dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`). 177 """An abstract base class for objects that help `ConvertRepoTask` convert 178 datasets from a single Gen2 repository. 182 task : `ConvertRepoTask` 183 Task instance that is using this helper object. 185 Root of the Gen2 repo being converted. 186 collections : `list` of `str` 187 Gen3 collections with which all converted datasets should be 189 subset : `ConversionSubset, optional 190 Helper object that implements a filter that restricts the data IDs that 195 `RepoConverter` defines the only public API users of its subclasses should 196 use (`prep`, `insertDimensionRecords`, and `ingest`). These delegate to 197 several abstract methods that subclasses must implement. In some cases, 198 subclasses may reimplement the public methods as well, but are expected to 199 delegate to ``super()`` either at the beginning or end of their own 203 def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
204 subset: Optional[ConversionSubset] =
None):
210 self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)
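
    # A sketch of the expected call sequence for a concrete subclass,
    # following the ordering guarantees documented on the methods below (the
    # subclass name is hypothetical):
    #
    #     converter = SomeRepoConverter(task=task, root=root, collections=["run1"])
    #     converter.prep()                 # before insertDimensionData
    #     converter.findDatasets()         # requires prep()
    #     converter.insertDimensionData()  # between prep and expandDataIds
    #     converter.expandDataIds()        # before ingest
    #     converter.ingest()               # last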
214 """Test whether the given dataset is handled specially by this 215 converter and hence should be ignored by generic base-class logic that 216 searches for dataset types to convert. 220 datasetTypeName : `str` 221 Name of the dataset type to test. 226 `True` if the dataset type is special. 228 raise NotImplementedError()
232 """Iterate over all `CameraMapper` `Mapping` objects that should be 233 considered for conversion by this repository. 235 This this should include any datasets that may appear in the 236 repository, including those that are special (see 237 `isDatasetTypeSpecial`) and those that are being ignored (see 238 `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter 239 to identify and hence skip these datasets quietly instead of warning 240 about them as unrecognized. 244 datasetTypeName: `str` 245 Name of the dataset type. 246 mapping : `lsst.obs.base.mapping.Mapping` 247 Mapping object used by the Gen2 `CameraMapper` to describe the 250 raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()
278 """Return a list of directory paths that should not be searched for 281 These may be directories that simply do not contain datasets (or 282 contain datasets in another repository), or directories whose datasets 283 are handled specially by a subclass. 287 directories : `list` [`str`] 288 The full paths of directories to skip, relative to the repository 294 """Perform preparatory work associated with the dataset types to be 295 converted from this repository (but not the datasets themselves). 299 This should be a relatively fast operation that should not depend on 300 the size of the repository. 302 Subclasses may override this method, but must delegate to the base 303 class implementation at some point in their own logic. 304 More often, subclasses will specialize the behavior of `prep` by 305 overriding other methods to which the base class implementation 306 delegates. These include: 308 - `isDatasetTypeSpecial` 309 - `getSpecialDirectories` 310 - `makeRepoWalkerTarget` 312 This should not perform any write operations to the Gen3 repository. 313 It is guaranteed to be called before `insertDimensionData`. 315 self.
task.log.info(f
"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there is no
                # way instances of it can appear in this repo.
                continue
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of this type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # Only a problem if files of this type actually appear;
                    # record a message so it can be reported if they do.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            if skip:
                walkerInput = RepoWalker.Skip(
                    template=template,
                    keys=mapping.keys(),
                    message=message,
                )
            else:
                assert message is None
                walkerInput = self.makeRepoWalkerTarget(
                    datasetTypeName=datasetTypeName,
                    template=template,
                    keys=mapping.keys(),
                    storageClass=storageClass,
                )
            walkerInputs.append(walkerInput)
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)
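
    # The base-class implementation of `iterDatasets` below yields nothing; a
    # hypothetical subclass override for datasets found some other way might
    # look like:
    #
    #     def iterDatasets(self) -> Iterator[FileDataset]:
    #         yield from self._findSpecialDatasets()  # hypothetical helper
    #         yield from super().iterDatasets()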
None 375 """Iterate over datasets in the repository that should be ingested into 378 The base class implementation yields nothing; the datasets handled by 379 the `RepoConverter` base class itself are read directly in 382 Subclasses should override this method if they support additional 383 datasets that are handled some other way. 387 dataset : `FileDataset` 388 Structures representing datasets to be ingested. Paths should be 394 assert self.
_repoWalker,
"prep() must be called before findDatasets." 395 self.
task.log.info(
"Adding special datasets in repo %s.", self.
root)
397 assert len(dataset.refs) == 1
398 self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
399 self.
task.log.info(
"Finding datasets from files in repo %s.", self.
root)
400 self._fileDatasets.update(
404 predicate=(self.
subset.isRelated
if self.
subset is not None else None)
409 """Insert any dimension records uniquely derived from this repository 412 Subclasses may override this method, but may not need to; the default 413 implementation does nothing. 415 SkyMap and SkyPix dimensions should instead be handled by calling 416 `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because 417 these dimensions are in general shared by multiple Gen2 repositories. 419 This method is guaranteed to be called between `prep` and 425 self.task.log.warn(
"Skipping ingestion for '%s': %s", dataset.path, err)
429 """Expand the data IDs for all datasets to be inserted. 431 Subclasses may override this method, but must delegate to the base 432 class implementation if they do. If they wish to handle expected 433 failures in data ID expansion, they should override 434 `handleDataIdExpansionFailure` instead. 436 This involves queries to the registry, but not writes. It is 437 guaranteed to be called between `insertDimensionData` and `ingest`. 439 for datasetType, datasetsForType
in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                        expanded.append(dataset)
                    except LookupError as err:
                        if self.handleDataIdExpansionFailure(dataset, err):
                            expanded.append(dataset)
            datasetsForType[:] = expanded
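
    # A hypothetical override of `handleDataIdExpansionFailure` that keeps
    # datasets whose data IDs could not be expanded instead of skipping them:
    #
    #     def handleDataIdExpansionFailure(self, dataset, err):
    #         self.task.log.debug("Ingesting '%s' despite %s", dataset.path, err)
    #         return True  # keep the dataset in the list to be ingested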
455 """Insert converted datasets into the Gen3 repository. 457 Subclasses may override this method, but must delegate to the base 458 class implementation at some point in their own logic. 460 This method is guaranteed to be called after `expandDataIds`. 462 for datasetType, datasetsForType
in self._fileDatasets.items():
463 self.
task.registry.registerDatasetType(datasetType)
464 self.
task.log.info(
"Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
467 except LookupError
as err:
468 self.
task.log.warn(str(err))
471 self.
task.registry.registerRun(collections[0])
472 self.
task.butler3.ingest(*datasetsForType, transfer=self.
task.config.transfer,
474 except LookupError
as err:
475 raise LookupError(f
"Error expanding data ID for dataset type {datasetType.name}.")
from err
476 for collection
in collections[1:]:
477 self.
task.registry.associate(collection,
478 [ref
for dataset
in datasetsForType
for ref
in dataset.refs])
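
    # ``transfer`` above is a butler ingest transfer mode (e.g. "copy" or
    # "symlink"); it comes from the task configuration and is applied
    # uniformly to all datasets rather than chosen per dataset type.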
481 """Return the set of collections a particular dataset type should be 486 datasetTypeName : `str` 487 Name of the dataset type. 491 collections : `list` of `str` 492 Collections the dataset should be associated with. The first 493 item in the list is the run the dataset should be added to 496 if datasetTypeName
in self.
task.config.collections:
501 raise LookupError(
"No collection configured for dataset type {datasetTypeName}.")

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName: `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The inferred storage class, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            storageClass = None
        if storageClass is None and mapping.persistable is not None:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                pass
        if storageClass is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
            except KeyError:
                pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """