from __future__ import annotations

__all__ = ["RepoConverter"]

import fnmatch
import itertools
import re
from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Dict, Iterator, List, MutableMapping, Optional, Set, Tuple, Union

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension
54 """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains 55 lists of related data ID values that should be included in the conversion. 60 Instrument name used in Gen3 data IDs. 61 visits : `set` of `int` 62 Visit IDs that define the filter. 65 def __init__(self, instrument: str, visits: Set[int]):
73 """Populate the included tract IDs for the given skymap from those that 74 overlap the visits the `ConversionSubset` was initialized with. 78 registry : `lsst.daf.butler.Registry` 79 Registry that can be queried for visit/tract overlaps. 81 SkyMap name used in Gen3 data IDs. 86 for dataId
in registry.queryDimensions([
"tract"], expand=
False,
87 dataId={
"skymap": name,
90 tracts.add(dataId[
"tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization defines the included IDs.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
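
    # The per-dimension result is a `RangeSet` of pixel-index ranges;
    # `isRelated` below tests membership with `RangeSet.intersects`.  A rough
    # illustration (the pixel values are hypothetical, and the two-argument
    # `RangeSet` constructor is an assumption about the `lsst.sphgeom` API):
    #
    #     >>> from lsst.sphgeom import RangeSet
    #     >>> ranges = RangeSet()                        # empty
    #     >>> ranges = ranges.union(RangeSet(100, 200))  # pixels [100, 200)
    #     >>> ranges.intersects(150)
    #     True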
114 """Test whether the given data ID is related to this subset and hence 115 should be included in a repository conversion. 119 dataId : `lsst.daf.butler.DataCoordinate` 125 `True` if this data ID should be included in a repository 130 More formally, this tests that the given data ID is not unrelated; 131 if a data ID does not involve tracts, visits, or skypix dimensions, 132 we always include it. 137 if "visit" in dataId.graph
and dataId[
"visit"]
not in self.
visits:
139 if "tract" in dataId.graph
and dataId[
"tract"]
not in self.
tracts[dataId[
"skymap"]]:
141 for dimension, ranges
in self.
skypix.items():
142 if dimension
in dataId.graph
and not ranges.intersects(dataId[dimension]):
150 """The name of the instrument, as used in Gen3 data IDs (`str`). 154 """The set of visit IDs that should be included in the conversion (`set` 158 regions: Optional[List[Region]]
159 """Regions for all visits (`list` of `lsst.sphgeom.Region`). 161 Set to `None` before it has been initialized. Any code that attempts to 162 use it when it is `None` has a logic bug. 165 tracts: Dict[str, Set[int]]
166 """Tracts that should be included in the conversion, grouped by skymap 167 name (`dict` mapping `str` to `set` of `int`). 170 skypix: Dict[SkyPixDimension, RangeSet]
171 """SkyPix ranges that should be included in the conversion, grouped by 172 dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`). 177 """An abstract base class for objects that help `ConvertRepoTask` convert 178 datasets from a single Gen2 repository. 182 task : `ConvertRepoTask` 183 Task instance that is using this helper object. 185 Root of the Gen2 repo being converted. 186 collections : `list` of `str` 187 Gen3 collections with which all converted datasets should be 189 subset : `ConversionSubset, optional 190 Helper object that implements a filter that restricts the data IDs that 195 `RepoConverter` defines the only public API users of its subclasses should 196 use (`prep`, `insertDimensionRecords`, and `ingest`). These delegate to 197 several abstract methods that subclasses must implement. In some cases, 198 subclasses may reimplement the public methods as well, but are expected to 199 delegate to ``super()`` either at the beginning or end of their own 203 def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.
    collections : `list` of `str`
        Gen3 collections with which all converted datasets should be
        associated.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `findDatasets`, `insertDimensionData`, `expandDataIds`, and
    `ingest`).  These delegate to several abstract methods that subclasses
    must implement.  In some cases, subclasses may reimplement the public
    methods as well, but are expected to delegate to ``super()`` either at the
    beginning or end of their own implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = root
        self.subset = subset
        self._collections = list(collections)
        self._repoWalker = None  # Created in prep.
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)
214 """Test whether the given dataset is handled specially by this 215 converter and hence should be ignored by generic base-class logic that 216 searches for dataset types to convert. 220 datasetTypeName : `str` 221 Name of the dataset type to test. 226 `True` if the dataset type is special. 228 raise NotImplementedError()
232 """Iterate over all `CameraMapper` `Mapping` objects that should be 233 considered for conversion by this repository. 235 This this should include any datasets that may appear in the 236 repository, including those that are special (see 237 `isDatasetTypeSpecial`) and those that are being ignored (see 238 `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter 239 to identify and hence skip these datasets quietly instead of warning 240 about them as unrecognized. 244 datasetTypeName: `str` 245 Name of the dataset type. 246 mapping : `lsst.obs.base.mapping.Mapping` 247 Mapping object used by the Gen2 `CameraMapper` to describe the 250 raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()
278 """Return a list of directory paths that should not be searched for 281 These may be directories that simply do not contain datasets (or 282 contain datasets in another repository), or directories whose datasets 283 are handled specially by a subclass. 287 directories : `list` [`str`] 288 The full paths of directories to skip, relative to the repository 294 """Perform preparatory work associated with the dataset types to be 295 converted from this repository (but not the datasets themselves). 299 This should be a relatively fast operation that should not depend on 300 the size of the repository. 302 Subclasses may override this method, but must delegate to the base 303 class implementation at some point in their own logic. 304 More often, subclasses will specialize the behavior of `prep` by 305 overriding other methods to which the base class implementation 306 delegates. These include: 308 - `isDatasetTypeSpecial` 309 - `getSpecialDirectories` 310 - `makeRepoWalkerTarget` 312 This should not perform any write operations to the Gen3 repository. 313 It is guaranteed to be called before `insertDimensionData`. 315 self.
task.log.info(f
"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper; there is no way
                # instances of it can appear in this repository.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset; we need to be
                    # able to parse those files to recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template+extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template+extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                    )
                    self.task.log.debug("Adding template to walker: %s", template)
                walkerInputs.append(walkerInput)

        # Skip any directories a subclass declares special; their contents
        # are handled elsewhere (or are not datasets at all).
        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)
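
    # For reference, the ignore-pattern handling above combines shell globs
    # into one alternation regex via the standard library; e.g. (patterns
    # hypothetical):
    #
    #     >>> import fnmatch, re
    #     >>> regex = re.compile("|".join(fnmatch.translate(p)
    #     ...                             for p in ["*.log", "registry.sqlite3"]))
    #     >>> bool(regex.match("ingest.log"))
    #     True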

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested into
        the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
        """
        yield from ()

    def findDatasets(self):
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                log=self.task.log,
                predicate=(self.subset.isRelated if self.subset is not None else None),
            )
        )
417 """Insert any dimension records uniquely derived from this repository 420 Subclasses may override this method, but may not need to; the default 421 implementation does nothing. 423 SkyMap and SkyPix dimensions should instead be handled by calling 424 `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because 425 these dimensions are in general shared by multiple Gen2 repositories. 427 This method is guaranteed to be called between `prep` and 433 """Expand the data IDs for all datasets to be inserted. 435 Subclasses may override this method, but must delegate to the base 436 class implementation if they do. 438 This involves queries to the registry, but not writes. It is 439 guaranteed to be called between `insertDimensionData` and `ingest`. 442 for datasetType, datasetsForType
in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Mark the ref for removal below.
                        dataset.refs[i] = None
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)
            datasetsForType[:] = expanded
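
    # The slice assignment above rebinds ``dataset.refs`` in place;
    # `itertools.filterfalse` drops the refs that were set to `None`:
    #
    #     >>> import itertools
    #     >>> refs = [1, None, 2]
    #     >>> refs[:] = itertools.filterfalse(lambda x: x is None, refs)
    #     >>> refs
    #     [1, 2]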
462 """Insert converted datasets into the Gen3 repository. 464 Subclasses may override this method, but must delegate to the base 465 class implementation at some point in their own logic. 467 This method is guaranteed to be called after `expandDataIds`. 469 for datasetType, datasetsForType
in self._fileDatasets.items():
470 self.
task.registry.registerDatasetType(datasetType)
471 self.
task.log.info(
"Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
474 except LookupError
as err:
475 self.
task.log.warn(str(err))
478 self.
task.registry.registerRun(collections[0])
479 self.
task.butler3.ingest(*datasetsForType, transfer=self.
task.config.transfer,
481 except LookupError
as err:
482 raise LookupError(f
"Error expanding data ID for dataset type {datasetType.name}.")
from err
483 for collection
in collections[1:]:
484 self.
task.registry.associate(collection,
485 [ref
for dataset
in datasetsForType
for ref
in dataset.refs])
488 """Return the set of collections a particular dataset type should be 493 datasetTypeName : `str` 494 Name of the dataset type. 498 collections : `list` of `str` 499 Collections the dataset should be associated with. The first 500 item in the list is the run the dataset should be added to 503 if datasetTypeName
in self.
task.config.collections:
508 raise LookupError(
"No collection configured for dataset type {datasetTypeName}.")

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            # Fall back to the unqualified python type name in the config.
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            storageClass = None
        if storageClass is None and mapping.persistable is not None:
            # Try the Gen2 persistable name directly as a storage class name.
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                pass
        if storageClass is None and mapping.python is not None:
            # Try the unqualified python type name directly as well.
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
            except KeyError:
                pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """
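

# A sketch of the public call sequence a driver like `ConvertRepoTask` is
# expected to follow, given the ordering guarantees documented above
# (``converter`` is an instance of a concrete `RepoConverter` subclass).
def _exampleConversionSequence(converter: RepoConverter) -> None:
    converter.prep()                 # fast; no writes to the Gen3 repo
    converter.findDatasets()         # walk the Gen2 repo for files
    converter.insertDimensionData()  # write dimension records, if any
    converter.expandDataIds()        # registry queries, still no writes
    converter.ingest()               # datasets into the Gen3 repository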