from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import itertools
import re
from typing import TYPE_CHECKING, Dict, Iterator, List, MutableMapping, Optional, Set, Tuple, Union

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from lsst.utils import doImport
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguates from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter
56 """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains 57 lists of related data ID values that should be included in the conversion. 62 Instrument name used in Gen3 data IDs. 63 visits : `set` of `int` 64 Visit IDs that define the filter. 67 def __init__(self, instrument: str, visits: Set[int]):
75 """Populate the included tract IDs for the given skymap from those that 76 overlap the visits the `ConversionSubset` was initialized with. 80 registry : `lsst.daf.butler.Registry` 81 Registry that can be queried for visit/tract overlaps. 83 SkyMap name used in Gen3 data IDs. 88 for dataId
in registry.queryDimensions([
"tract"], expand=
False,
89 dataId={
"skymap": name,
92 tracts.add(dataId[
"tract"])
    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization should be used.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
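
    # The envelope computation above can be exercised directly with
    # lsst.sphgeom (illustrative sketch; the HTM level and region are
    # arbitrary choices, not values used by this module):
    #
    #     from lsst.sphgeom import Angle, Circle, HtmPixelization, LonLat, UnitVector3d
    #     region = Circle(UnitVector3d(LonLat.fromDegrees(150.0, 2.0)),
    #                     Angle.fromDegrees(0.5))
    #     ranges = HtmPixelization(7).envelope(region)  # RangeSet of pixel indices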
116 """Test whether the given data ID is related to this subset and hence 117 should be included in a repository conversion. 121 dataId : `lsst.daf.butler.DataCoordinate` 127 `True` if this data ID should be included in a repository 132 More formally, this tests that the given data ID is not unrelated; 133 if a data ID does not involve tracts, visits, or skypix dimensions, 134 we always include it. 139 if "visit" in dataId.graph
and dataId[
"visit"]
not in self.
visits:
141 if "tract" in dataId.graph
and dataId[
"tract"]
not in self.
tracts[dataId[
"skymap"]]:
143 for dimension, ranges
in self.
skypix.items():
144 if dimension
in dataId.graph
and not ranges.intersects(dataId[dimension]):
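
    # Filtering semantics (illustrative): a data ID that involves none of the
    # filtered dimensions always passes, so global datasets survive any subset.
    #
    #     subset = ConversionSubset("HSC", visits={903334})
    #     # subset.isRelated(cameraDataId)   -> True   (no visit/tract/skypix)
    #     # subset.isRelated(visit903338Id)  -> False  (visit not in subset.visits)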
152 """The name of the instrument, as used in Gen3 data IDs (`str`). 156 """The set of visit IDs that should be included in the conversion (`set` 160 regions: Optional[List[Region]]
161 """Regions for all visits (`list` of `lsst.sphgeom.Region`). 163 Set to `None` before it has been initialized. Any code that attempts to 164 use it when it is `None` has a logic bug. 167 tracts: Dict[str, Set[int]]
168 """Tracts that should be included in the conversion, grouped by skymap 169 name (`dict` mapping `str` to `set` of `int`). 172 skypix: Dict[SkyPixDimension, RangeSet]
173 """SkyPix ranges that should be included in the conversion, grouped by 174 dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`). 179 """An abstract base class for objects that help `ConvertRepoTask` convert 180 datasets from a single Gen2 repository. 184 task : `ConvertRepoTask` 185 Task instance that is using this helper object. 187 Root of the Gen2 repo being converted. 188 collections : `list` of `str` 189 Gen3 collections with which all converted datasets should be 191 subset : `ConversionSubset, optional 192 Helper object that implements a filter that restricts the data IDs that 197 `RepoConverter` defines the only public API users of its subclasses should 198 use (`prep`, `insertDimensionRecords`, and `ingest`). These delegate to 199 several abstract methods that subclasses must implement. In some cases, 200 subclasses may reimplement the public methods as well, but are expected to 201 delegate to ``super()`` either at the beginning or end of their own 205 def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
206 subset: Optional[ConversionSubset] =
None):
212 self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)
216 """Test whether the given dataset is handled specially by this 217 converter and hence should be ignored by generic base-class logic that 218 searches for dataset types to convert. 222 datasetTypeName : `str` 223 Name of the dataset type to test. 228 `True` if the dataset type is special. 230 raise NotImplementedError()
234 """Iterate over all `CameraMapper` `Mapping` objects that should be 235 considered for conversion by this repository. 237 This this should include any datasets that may appear in the 238 repository, including those that are special (see 239 `isDatasetTypeSpecial`) and those that are being ignored (see 240 `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter 241 to identify and hence skip these datasets quietly instead of warning 242 about them as unrecognized. 246 datasetTypeName: `str` 247 Name of the dataset type. 248 mapping : `lsst.obs.base.mapping.Mapping` 249 Mapping object used by the Gen2 `CameraMapper` to describe the 252 raise NotImplementedError()
    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()
287 """Return a list of directory paths that should not be searched for 290 These may be directories that simply do not contain datasets (or 291 contain datasets in another repository), or directories whose datasets 292 are handled specially by a subclass. 296 directories : `list` [`str`] 297 The full paths of directories to skip, relative to the repository 303 """Perform preparatory work associated with the dataset types to be 304 converted from this repository (but not the datasets themselves). 308 This should be a relatively fast operation that should not depend on 309 the size of the repository. 311 Subclasses may override this method, but must delegate to the base 312 class implementation at some point in their own logic. 313 More often, subclasses will specialize the behavior of `prep` by 314 overriding other methods to which the base class implementation 315 delegates. These include: 317 - `isDatasetTypeSpecial` 318 - `getSpecialDirectories` 319 - `makeRepoWalkerTarget` 321 This should not perform any write operations to the Gen3 repository. 322 It is guaranteed to be called before `insertDimensionData`. 324 self.
task.log.info(f
"Preparing other dataset types from root {self.root}.")
325 walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there is no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # The user asked not to include this dataset type, but we
                # still want to recognize its files to avoid warning about
                # them as unrecognized.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset type.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk but whose Gen2
            # template only ends with ".fits".
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template+extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template+extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s", template)
                walkerInputs.append(walkerInput)
        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,
                    keys={},
                    message=None,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested into
        the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
        """
        yield from ()

    def findDatasets(self):
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )
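
    # Illustrative wiring (names hypothetical): a converter constructed with a
    # ConversionSubset only keeps files whose extracted data IDs pass
    # ConversionSubset.isRelated; with subset=None, every matched file is kept.
    #
    #     converter = SomeRepoConverter(task=task, root=root, collections=["runs/ci"],
    #                                   subset=ConversionSubset("HSC", {903334}))
    #     converter.prep()
    #     converter.findDatasets()  # only visit-903334-related datasets survive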
431 """Insert any dimension records uniquely derived from this repository 434 Subclasses may override this method, but may not need to; the default 435 implementation does nothing. 437 SkyMap and SkyPix dimensions should instead be handled by calling 438 `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because 439 these dimensions are in general shared by multiple Gen2 repositories. 441 This method is guaranteed to be called between `prep` and 447 """Expand the data IDs for all datasets to be inserted. 449 Subclasses may override this method, but must delegate to the base 450 class implementation if they do. 452 This involves queries to the registry, but not writes. It is 453 guaranteed to be called between `insertDimensionData` and `ingest`. 456 for datasetType, datasetsForType
in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Mark the ref as skipped; it is filtered out below.
                        dataset.refs[i] = None
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)
            datasetsForType[:] = expanded
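
    # itertools.filterfalse keeps the items for which the predicate is false,
    # so the slice assignment above drops exactly the refs set to `None`
    # (illustrative):
    #
    #     list(itertools.filterfalse(lambda x: x is None, [ref1, None, ref2]))
    #     # -> [ref1, ref2]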
476 """Insert converted datasets into the Gen3 repository. 478 Subclasses may override this method, but must delegate to the base 479 class implementation at some point in their own logic. 481 This method is guaranteed to be called after `expandDataIds`. 483 for datasetType, datasetsForType
in self._fileDatasets.items():
484 self.
task.registry.registerDatasetType(datasetType)
485 self.
task.log.info(
"Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
            try:
                collections = self.getCollections(datasetType.name)
            except LookupError as err:
                self.task.log.warn(str(err))
                continue
            try:
                self.task.registry.registerRun(collections[0])
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer,
                                         run=collections[0])
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type "
                                  f"{datasetType.name}.") from err
            for collection in collections[1:]:
                self.task.registry.associate(collection,
                                             [ref for dataset in datasetsForType
                                              for ref in dataset.refs])
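
    # Typical driver sequence for a concrete converter (illustrative; only the
    # orderings guaranteed by the docstrings above are assumed):
    #
    #     converter.prep()                 # no Gen3 writes
    #     converter.findDatasets()
    #     converter.insertDimensionData()  # dimension records only
    #     converter.expandDataIds()        # registry queries, no writes
    #     converter.ingest()               # datasets written to Gen3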
502 """Return the set of collections a particular dataset type should be 507 datasetTypeName : `str` 508 Name of the dataset type. 512 collections : `list` of `str` 513 Collections the dataset should be associated with. The first 514 item in the list is the run the dataset should be added to 517 if datasetTypeName
in self.
task.config.collections:
522 raise LookupError(
"No collection configured for dataset type {datasetTypeName}.")
    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset type from a combination
        of configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            Inferred storage class, or `None` if one could not be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
        if storageClass is None and mapping.python is not None:
            try:
                unqualified = mapping.python.split(".")[-1]
                storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
            except KeyError:
                pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass
    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """