from __future__ import annotations

__all__ = ["RepoConverter"]

import fnmatch
import os
from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Generic, TypeVar, List, Tuple, Optional, Iterator, Set, Any, Callable, Dict

from lsst.daf.butler import DatasetRef, Butler as Butler3, DataCoordinate
from lsst.sphgeom import RangeSet, Region

from .filePathParser import FilePathParser

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping
    from .dataIdExtractor import DataIdExtractor
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension


REPO_ROOT_FILES = ("registry.sqlite3", "_mapper", "repositoryCfg.yaml",
                   "calibRegistry.sqlite3", "_parent")
51 """A simple container that maintains a most-recently-used ordering. 64 def apply(self, func: Callable[[T], Any]) -> Any:
65 """Apply a function to elements until it returns a value that coerces 66 to `True`, and move the corresponding element to the front of the 77 The first value returned by ``func`` that coerces to `True`. 79 for n, element
in enumerate(self):
80 result = func(element)
96 """Add a new element to the front of the stack. 103 """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains 104 lists of related data ID values that should be included in the conversion. 109 Instrument name used in Gen3 data IDs. 110 visits : `set` of `int` 111 Visit IDs that define the filter. 114 def __init__(self, instrument: str, visits: Set[int]):
122 """Populate the included tract IDs for the given skymap from those that 123 overlap the visits the `ConversionSubset` was initialized with. 127 registry : `lsst.daf.butler.Registry` 128 Registry that can be queried for visit/tract overlaps. 130 SkyMap name used in Gen3 data IDs. 133 self.
tracts[name] = tracts
135 for dataId
in self.registry.queryDimensions([
"tract"], expand=
False,
136 dataId={
"skymap": name,
"visit": visit}):
137 tracts.add(dataId[
"tract"])
138 self.task.log.info(
"Limiting datasets defined on skymap %s to %s tracts.", name, len(tracts))

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension used in Gen3 data IDs.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.join(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
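
    # The loop above unions one pixel-index envelope per visit region.  A
    # standalone equivalent (hypothetical pixelization level and region;
    # `HtmPixelization` is one concrete `lsst.sphgeom` pixelization):
    #
    #     from lsst.sphgeom import Circle, HtmPixelization, UnitVector3d
    #     pixelization = HtmPixelization(7)
    #     region = Circle(UnitVector3d(1.0, 0.0, 0.0), 1e-4)
    #     ranges = RangeSet()
    #     ranges = ranges.join(pixelization.envelope(region))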
162 """Test whether the given data ID is related to this subset and hence 163 should be included in a repository conversion. 167 dataId : `lsst.daf.butler.DataCoordinate` 173 `True` if this data ID should be included in a repository 178 More formally, this tests that the given data ID is not unrelated; 179 if a data ID does not involve tracts, visits, or skypix dimensions, 180 we always include it. 185 if "visit" in dataId.graph
and dataId[
"visit"]
not in self.
visits:
187 if "tract" in dataId.graph
and dataId[
"tract"]
not in self.
tracts[dataId[
"skymap"]]:
189 for dimension, ranges
in self.
skypix.items():
190 if dimension
in dataId.graph
and not ranges.intersects(dataId[dimension]):
198 """The name of the instrument, as used in Gen3 data IDs (`str`). 202 """The set of visit IDs that should be included in the conversion (`set` 206 regions: Optional[List[Region]]
207 """Regions for all visits (`list` of `lsst.sphgeom.Region`). 209 Set to `None` before it has been initialized. Any code that attempts to 210 use it when it is `None` has a logic bug. 213 tracts: Dict[str, Set[int]]
214 """Tracts that should be included in the conversion, grouped by skymap 215 name (`dict` mapping `str` to `set` of `int`). 218 skypix: Dict[SkyPixDimension, RangeSet]
219 """SkyPix ranges that should be included in the conversion, grouped by 220 dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`). 225 """An abstract base class for objects that help `ConvertRepoTask` convert 226 datasets from a single Gen2 repository. 230 task : `ConvertRepoTask` 231 Task instance that is using this helper object. 236 `RepoConverter` defines the only public API users of its subclasses should 237 use (`prep`, `insertDimensionRecords`, and `ingest`). These delegate to 238 several abstract methods that subclasses must implement. In some cases, 239 subclasses may reimplement the public methods as well, but are expected to 240 delegate to ``super()`` either at the beginning or end of their own 244 def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
245 subset: Optional[ConversionSubset] =
None):
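
    # The public lifecycle, per the class notes above (a sketch; how the
    # converters are constructed is up to `ConvertRepoTask`):
    #
    #     for converter in converters:
    #         converter.prep()
    #     for converter in converters:
    #         converter.insertDimensionData()
    #     for converter in converters:
    #         converter.ingest()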
255 """Test whether the given dataset is handled specially by this 256 converter and hence should be ignored by generic base-class logic that 257 searches for dataset types to convert. 261 datasetTypeName : `str` 262 Name of the dataset type to test. 267 `True` if the dataset type is special. 269 raise NotImplementedError()
273 """Test whether the given directory is handled specially by this 274 converter and hence should be ignored by generic base-class logic that 275 searches for datasets to convert. 280 Subdirectory. This is only ever a single subdirectory, and it 281 could appear anywhere within a repo root. (A full path relative 282 to the repo root might be more useful, but it is harder to 283 implement, and we don't currently need it to identify any special 289 `True` if the direct is special. 291 raise NotImplementedError()
295 """Iterate over all `CameraMapper` `Mapping` objects that should be 296 considered for conversion by this repository. 298 This this should include any datasets that may appear in the 299 repository, including those that are special (see 300 `isDatasetTypeSpecial`) and those that are being ignored (see 301 `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter 302 to identify and hence skip these datasets quietly instead of warning 303 about them as unrecognized. 307 datasetTypeName: `str` 308 Name of the dataset type. 309 mapping : `lsst.obs.base.mapping.Mapping` 310 Mapping object used by the Gen2 `CameraMapper` to describe the 313 raise NotImplementedError()

    @abstractmethod
    def makeDataIdExtractor(self, datasetTypeName: str, parser: FilePathParser,
                            storageClass: StorageClass) -> DataIdExtractor:
        """Construct a `DataIdExtractor` instance appropriate for a particular
        dataset type.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type; typically forwarded directly to
            the `DataIdExtractor` constructor.
        parser : `FilePathParser`
            Object that parses filenames into Gen2 data IDs; typically
            forwarded directly to the `DataIdExtractor` constructor.
        storageClass : `lsst.daf.butler.StorageClass`
            Storage class for this dataset type in the Gen3 butler; typically
            forwarded directly to the `DataIdExtractor` constructor.

        Returns
        -------
        extractor : `DataIdExtractor`
            A new `DataIdExtractor` instance.
        """
        raise NotImplementedError()
341 """Iterate over all datasets in the repository that should be 342 ingested into the Gen3 repository. 344 Subclasses may override this method, but must delegate to the base 345 class implementation at some point in their own logic. 349 fileNameInRoot : `str` 350 Name of the file to be ingested, relative to the repository root. 351 ref : `lsst.daf.butler.DatasetRef` 352 Reference for the Gen3 datasets, including a complete `DatasetType` 355 for dirPath, subdirNamesInDir, fileNamesInDir
in os.walk(self.
root, followlinks=
True):
358 def isRepoRoot(dirName):
359 return any(os.path.exists(os.path.join(dirPath, dirName, f))
360 for f
in REPO_ROOT_FILES)
361 subdirNamesInDir[:] = [d
for d
in subdirNamesInDir
366 dirPathInRoot = dirPath[len(self.
root) + len(os.path.sep):]
367 for fileNameInDir
in fileNamesInDir:
368 if any(fnmatch.fnmatchcase(fileNameInDir, pattern)
369 for pattern
in self.
task.config.fileIgnorePatterns):
371 fileNameInRoot = os.path.join(dirPathInRoot, fileNameInDir)
372 if fileNameInRoot
in REPO_ROOT_FILES:
376 if self.
subset is None or self.
subset.isRelated(ref.dataId):
377 yield fileNameInRoot, ref
382 """Prepare the repository by identifying the dataset types to be 383 converted and building `DataIdExtractor` instance for them. 385 Subclasses may override this method, but must delegate to the base 386 class implementation at some point in their own logic. More often, 387 subclasses will specialize the behavior of `prep` simply by overriding 388 `iterMappings`, `isDatasetTypeSpecial`, and `makeDataIdExtractor`, to 389 which the base implementation delegates. 391 This should not perform any write operations to the Gen3 repository. 392 It is guaranteed to be called before `insertDimensionData` and 395 self.
task.log.info(f
"Preparing other datasets from root {self.root}.")
398 parser = FilePathParser.fromMapping(mapping)
405 if (
not self.
task.isDatasetTypeIncluded(datasetTypeName)
or 409 self._skipParsers.push((parser, datasetTypeName,
None))
412 if storageClass
is None:
417 self._skipParsers.push((parser, datasetTypeName,
"no storage class found."))
422 """Insert any dimension records uniquely derived from this repository 425 Subclasses may override this method, but may not need to; the default 426 implementation does nothing. 428 SkyMap and SkyPix dimensions should instead be handled by calling 429 `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because 430 these dimensions are in general shared by multiple Gen2 repositories. 432 This method is guaranteed to be called between `prep` and `ingest`. 437 """Insert converted datasets into the Gen3 repository. 439 Subclasses may override this method, but must delegate to the base 440 class implementation at some point in their own logic. More often, 441 subclasses will specialize the behavior of `ingest` simply by 442 overriding `iterDatasets` and `isDirectorySpecial`, to which the base 443 implementation delegates. 445 This method is guaranteed to be called after both `prep` and 446 `insertDimensionData`. 448 self.task.log.info(
"Finding datasets in repo %s.", self.root)
449 datasets = defaultdict(list)
450 for fileNameInRoot, ref
in self.iterDatasets():
451 datasets[ref.datasetType].append((fileNameInRoot, ref))
452 for datasetType, toIngest
in datasets.items():
453 self.task.registry.registerDatasetType(datasetType)
454 self.task.log.info(
"Ingesting %s %s datasets.", len(toIngest), datasetType.name)
456 butler3, collections = self.getButler(datasetType.name)
457 except LookupError
as err:
458 self.task.log.warn(str(err))
461 refs = [butler3.ingest(os.path.join(self.root, fileNameInRoot), ref,
462 transfer=self.task.config.transfer)
463 for fileNameInRoot, ref
in toIngest]
464 except LookupError
as err:
465 raise LookupError(f
"Error expanding data ID for dataset type {datasetType.name}.")
from err
466 for collection
in collections:
467 self.task.registry.associate(collection, refs)

    def getButler(self, datasetTypeName: str) -> Tuple[Butler3, List[str]]:
        """Create a new Gen3 Butler appropriate for a particular dataset type.

        This should be used exclusively by subclasses when obtaining a butler
        to use for dataset ingest (`ConvertRepoTask.butler3` should never be
        used directly).

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        butler : `lsst.daf.butler.Butler`
            Gen3 Butler instance appropriate for ingesting the given dataset
            type.
        collections : `list` of `str`
            Collections the dataset should be associated with, in addition to
            the one used to define the `lsst.daf.butler.Run` used in
            ``butler``.
        """
        if datasetTypeName in self.task.config.collections:
            return (
                Butler3(butler=self.task.butler3, run=self.task.config.collections[datasetTypeName]),
                self.collections,
            )
        elif self.collections:
            # Fall back to this converter's own collections: the first
            # defines the run, and the rest are used for association.
            return (
                Butler3(butler=self.task.butler3, run=self.collections[0]),
                self.collections[1:],
            )
        else:
            raise LookupError(f"No collection configured for dataset type {datasetTypeName}.")

    def _extractDatasetRef(self, fileNameInRoot: str) -> Optional[DatasetRef]:
        """Extract a `DatasetRef` from a file name.

        This method is for internal use by `RepoConverter` itself (not its
        subclasses).

        Parameters
        ----------
        fileNameInRoot : `str`
            Name of the file to be ingested, relative to the repository root.

        Returns
        -------
        ref : `lsst.daf.butler.DatasetRef` or `None`
            Reference for the Gen3 dataset, including a complete `DatasetType`
            and data ID.  `None` if the converter does not recognize the
            file as one to be converted.
        """
        def closure(extractor):
            try:
                dataId = extractor.apply(fileNameInRoot)
            except LookupError as err:
                raise RuntimeError(f"Error extracting data ID for {extractor.datasetType.name} "
                                   f"on file {fileNameInRoot}.") from err
            if dataId is None:
                return None
            return DatasetRef(extractor.datasetType, dataId=dataId)
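        # A `None` from ``extractor.apply`` means the file does not match
        # that extractor's template; `MostRecentlyUsedStack.apply` then moves
        # on to the next extractor, so the result below is `None` only when
        # no registered dataset type recognizes the file.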
        return self._extractors.apply(closure)

    def _handleUnrecognizedFile(self, fileNameInRoot: str):
        """Generate appropriate warnings (or not) for files not matched by
        `_extractDatasetRef`.

        This method is for internal use by `RepoConverter` itself (not its
        subclasses).

        Parameters
        ----------
        fileNameInRoot : `str`
            Name of the file, relative to the repository root.
        """
        def closure(skipTuple):
            parser, datasetTypeName, message = skipTuple
            if parser(fileNameInRoot) is not None:
                if message is not None:
                    self.task.log.warn("Skipping dataset %s file %s: %s",
                                       datasetTypeName, fileNameInRoot, message)
                return True
            return False
        if not self._skipParsers.apply(closure):
            self.task.log.warn("Skipping unrecognized file %s.", fileNameInRoot)

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            Inferred storage class, or `None` if one could not be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            storageClass = None
        if storageClass is None and mapping.persistable is not None:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                pass
        if storageClass is None and mapping.python is not None:
            try:
                unqualified = mapping.python.split(".")[-1]
                storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
            except KeyError:
                pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """