from __future__ import annotations

__all__ = ["RepoConverter"]

import fnmatch
import re
from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Dict, Iterator, List, MutableMapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler as Butler3, DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension
54 """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains 55 lists of related data ID values that should be included in the conversion. 60 Instrument name used in Gen3 data IDs. 61 visits : `set` of `int` 62 Visit IDs that define the filter. 65 def __init__(self, instrument: str, visits: Set[int]):
73 """Populate the included tract IDs for the given skymap from those that 74 overlap the visits the `ConversionSubset` was initialized with. 78 registry : `lsst.daf.butler.Registry` 79 Registry that can be queried for visit/tract overlaps. 81 SkyMap name used in Gen3 data IDs. 86 for dataId
in registry.queryDimensions([
"tract"], expand=
False,
87 dataId={
"skymap": name,
90 tracts.add(dataId[
"tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose ranges should be added.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
114 """Test whether the given data ID is related to this subset and hence 115 should be included in a repository conversion. 119 dataId : `lsst.daf.butler.DataCoordinate` 125 `True` if this data ID should be included in a repository 130 More formally, this tests that the given data ID is not unrelated; 131 if a data ID does not involve tracts, visits, or skypix dimensions, 132 we always include it. 137 if "visit" in dataId.graph
and dataId[
"visit"]
not in self.
visits:
139 if "tract" in dataId.graph
and dataId[
"tract"]
not in self.
tracts[dataId[
"skymap"]]:
141 for dimension, ranges
in self.
skypix.items():
142 if dimension
in dataId.graph
and not ranges.intersects(dataId[dimension]):
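
    # A minimal usage sketch of the filter above; the registry, skymap name,
    # and visit IDs here are hypothetical, not part of this class's API:
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
    #     subset.addSkyMap(registry, "hsc_rings_v1")
    #     included = [dataId for dataId in candidates if subset.isRelated(dataId)]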
150 """The name of the instrument, as used in Gen3 data IDs (`str`). 154 """The set of visit IDs that should be included in the conversion (`set` 158 regions: Optional[List[Region]]
159 """Regions for all visits (`list` of `lsst.sphgeom.Region`). 161 Set to `None` before it has been initialized. Any code that attempts to 162 use it when it is `None` has a logic bug. 165 tracts: Dict[str, Set[int]]
166 """Tracts that should be included in the conversion, grouped by skymap 167 name (`dict` mapping `str` to `set` of `int`). 170 skypix: Dict[SkyPixDimension, RangeSet]
171 """SkyPix ranges that should be included in the conversion, grouped by 172 dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`). 177 """An abstract base class for objects that help `ConvertRepoTask` convert 178 datasets from a single Gen2 repository. 182 task : `ConvertRepoTask` 183 Task instance that is using this helper object. 185 Root of the Gen2 repo being converted. 186 collections : `list` of `str` 187 Gen3 collections with which all converted datasets should be 189 subset : `ConversionSubset, optional 190 Helper object that implements a filter that restricts the data IDs that 195 `RepoConverter` defines the only public API users of its subclasses should 196 use (`prep`, `insertDimensionRecords`, and `ingest`). These delegate to 197 several abstract methods that subclasses must implement. In some cases, 198 subclasses may reimplement the public methods as well, but are expected to 199 delegate to ``super()`` either at the beginning or end of their own 203 def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
204 subset: Optional[ConversionSubset] =
None):
210 self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)
214 """Test whether the given dataset is handled specially by this 215 converter and hence should be ignored by generic base-class logic that 216 searches for dataset types to convert. 220 datasetTypeName : `str` 221 Name of the dataset type to test. 226 `True` if the dataset type is special. 228 raise NotImplementedError()
232 """Iterate over all `CameraMapper` `Mapping` objects that should be 233 considered for conversion by this repository. 235 This this should include any datasets that may appear in the 236 repository, including those that are special (see 237 `isDatasetTypeSpecial`) and those that are being ignored (see 238 `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter 239 to identify and hence skip these datasets quietly instead of warning 240 about them as unrecognized. 244 datasetTypeName: `str` 245 Name of the dataset type. 246 mapping : `lsst.obs.base.mapping.Mapping` 247 Mapping object used by the Gen2 `CameraMapper` to describe the 250 raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()
278 """Return a list of directory paths that should not be searched for 281 These may be directories that simply do not contain datasets (or 282 contain datasets in another repository), or directories whose datasets 283 are handled specially by a subclass. 287 directories : `list` [`str`] 288 The full paths of directories to skip, relative to the repository 294 """Perform preparatory work associated with the dataset types to be 295 converted from this repository (but not the datasets themselves). 299 This should be a relatively fast operation that should not depend on 300 the size of the repository. 302 Subclasses may override this method, but must delegate to the base 303 class implementation at some point in their own logic. 304 More often, subclasses will specialize the behavior of `prep` by 305 overriding other methods to which the base class implementation 306 delegates. These include: 308 - `isDatasetTypeSpecial` 309 - `getSpecialDirectories` 310 - `makeRepoWalkerTarget` 312 This should not perform any write operations to the Gen3 repository. 313 It is guaranteed to be called before `insertDimensionData`. 315 self.
task.log.info(f
"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there is no
                # way there should be instances of this dataset in this repo.
                continue
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # The user asked not to include this dataset type, but we
                # still want to recognize its files so we can skip them
                # quietly instead of warning about them as unrecognized.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This is only a problem if we actually encounter files
                    # of this type, so defer the message until then.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            if skip:
                walkerInput = RepoWalker.Skip(
                    template=template,
                    keys=mapping.keys(),
                    message=message,
                )
            else:
                assert message is None
                walkerInput = self.makeRepoWalkerTarget(
                    datasetTypeName=datasetTypeName,
                    template=template,
                    keys=mapping.keys(),
                    storageClass=storageClass,
                )
            walkerInputs.append(walkerInput)
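        # For reference: `fnmatch.translate` converts a shell-style pattern
        # into a regular expression, e.g. fnmatch.translate("*.log") yields
        # something like r"(?s:.*\.log)\Z" (the exact form varies between
        # Python versions), so the configured patterns below can be OR'd
        # together into a single regex.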
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets` instead.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
        """
        yield from ()

    def findDatasets(self):
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root, log=self.task.log,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )
409 """Insert any dimension records uniquely derived from this repository 412 Subclasses may override this method, but may not need to; the default 413 implementation does nothing. 415 SkyMap and SkyPix dimensions should instead be handled by calling 416 `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because 417 these dimensions are in general shared by multiple Gen2 repositories. 419 This method is guaranteed to be called between `prep` and 425 self.task.log.warn(
"Skipping ingestion for '%s': %s", dataset.path, err)
429 """Expand the data IDs for all datasets to be inserted. 431 Subclasses may override this method, but must delegate to the base 432 class implementation if they do. If they wish to handle expected 433 failures in data ID expansion, they should override 434 `handleDataIdExpansionFailure` instead. 436 This involves queries to the registry, but not writes. It is 437 guaranteed to be called between `insertDimensionData` and `ingest`. 439 for datasetType, datasetsForType
in self._fileDatasets.items():
440 self.
task.log.info(
"Expanding data IDs for %s %s datasets.", len(datasetsForType),
443 for dataset
in datasetsForType:
444 for i, ref
in enumerate(dataset.refs):
446 dataId = self.
task.registry.expandDataId(ref.dataId)
447 dataset.refs[i] = ref.expanded(dataId)
448 expanded.append(dataset)
449 except LookupError
as err:
451 expanded.append(dataset)
452 datasetsForType[:] = expanded
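
    # A sketch of how a subclass might tolerate an expected expansion failure
    # instead of warning; the error-message test here is a hypothetical
    # heuristic, not part of the base class:
    #
    #     def handleDataIdExpansionFailure(self, dataset, err):
    #         if "visit" in str(err):
    #             self.task.log.debug("Ignoring known-bad visit: %s", err)
    #             return False  # drop the dataset quietly
    #         return super().handleDataIdExpansionFailure(dataset, err)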
455 """Insert converted datasets into the Gen3 repository. 457 Subclasses may override this method, but must delegate to the base 458 class implementation at some point in their own logic. 460 This method is guaranteed to be called after `expandDataIds`. 462 for datasetType, datasetsForType
in self._fileDatasets.items():
463 self.
task.registry.registerDatasetType(datasetType)
464 self.
task.log.info(
"Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
466 butler3, collections = self.
getButler(datasetType.name)
467 except LookupError
as err:
468 self.
task.log.warn(str(err))
471 butler3.ingest(*datasetsForType, transfer=self.
task.config.transfer)
472 except LookupError
as err:
473 raise LookupError(f
"Error expanding data ID for dataset type {datasetType.name}.")
from err
474 for collection
in collections:
475 self.
task.registry.associate(collection,
476 [ref
for dataset
in datasetsForType
for ref
in dataset.refs])
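
    # Taken together, the docstring guarantees above imply a fixed calling
    # sequence; a rough sketch of the driver loop (which lives in
    # `ConvertRepoTask`, not here; `converter` is a hypothetical instance,
    # and the exact placement of findDatasets relative to
    # insertDimensionData is not pinned down by the docstrings):
    #
    #     converter.prep()
    #     converter.insertDimensionData()
    #     converter.findDatasets()
    #     converter.expandDataIds()
    #     converter.ingest()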

    def getButler(self, datasetTypeName: str) -> Tuple[Butler3, List[str]]:
        """Create a new Gen3 Butler appropriate for a particular dataset type.

        This should be used exclusively by subclasses when obtaining a butler
        to use for dataset ingest (`ConvertRepoTask.butler3` should never be
        used directly).

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        butler : `lsst.daf.butler.Butler`
            Gen3 Butler instance appropriate for ingesting the given dataset
            type.
        collections : `list` of `str`
            Collections the dataset should be associated with, in addition to
            the one used to define the `lsst.daf.butler.Run` used in
            ``butler``.
        """
        if datasetTypeName in self.task.config.collections:
            return (
                Butler3(butler=self.task.butler3,
                        run=self.task.config.collections[datasetTypeName]),
                self._collections,
            )
        elif self._collections:
            return (
                Butler3(butler=self.task.butler3, run=self._collections[0]),
                self._collections[1:],
            )
        else:
            raise LookupError(f"No collection configured for dataset type {datasetTypeName}.")

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The inferred storage class, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
            if storageClass is None:
                self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
            else:
                self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """