Coverage for python/lsst/obs/base/gen2to3/repoConverter.py : 18%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name,
                                                           "instrument": self.instrument,
                                                           "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization defines the IDs to include.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
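
        Examples
        --------
        A minimal sketch of the intended use (the instrument name, visit ID,
        and ``refs`` list here are hypothetical)::

            subset = ConversionSubset(instrument="HSC", visits={903334})
            related = [ref for ref in refs if subset.isRelated(ref.dataId)]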
133 """
134 if self.visits is None:
135 # We're not filtering at all.
136 return True
137 if "visit" in dataId.graph and dataId["visit"] not in self.visits:
138 return False
139 if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
140 return False
141 for dimension, ranges in self.skypix.items():
142 if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
143 return False
144 return True

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).

    May be `None`, in which case no filtering is performed at all (see
    `isRelated`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized.  Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.
    collections : `list` of `str`
        Gen3 collections with which all converted datasets should be
        associated.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `findDatasets`, `insertDimensionData`, `expandDataIds`, and
    `ingest`).  These delegate to several abstract methods that subclasses
    must implement.  In some cases, subclasses may reimplement the public
    methods as well, but are expected to delegate to ``super()`` either at the
    beginning or end of their own implementation.
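
    Examples
    --------
    A sketch of the expected call sequence, following the ordering guarantees
    documented on each method (construction of a concrete ``converter`` is
    assumed, not shown)::

        converter.prep()
        converter.findDatasets()
        converter.insertDimensionData()
        converter.expandDataIds()
        converter.ingest()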
201 """
203 def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
204 subset: Optional[ConversionSubset] = None):
205 self.task = task
206 self.root = root
207 self.subset = subset
208 self._collections = list(collections)
209 self._repoWalker = None # Created in prep
210 self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion for this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            Directory paths to skip, relative to the repository root.
        """
        return []

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.  More often,
        subclasses will specialize the behavior of `prep` by overriding other
        methods to which the base class implementation delegates.  These
        include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
315 self.task.log.info(f"Preparing other dataset types from root {self.root}.")
316 walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
317 for datasetTypeName, mapping in self.iterMappings():
318 try:
319 template = mapping.template
320 except RuntimeError:
321 # No template for this dataset in this mapper, so there's no
322 # way there should be instances of this dataset in this repo.
323 continue
324 skip = False
325 message = None
326 storageClass = None
327 if (not self.task.isDatasetTypeIncluded(datasetTypeName)
328 or self.isDatasetTypeSpecial(datasetTypeName)):
329 # User indicated not to include this data, but we still want
330 # to recognize files of that type to avoid warning about them.
331 skip = True
332 else:
333 storageClass = self._guessStorageClass(datasetTypeName, mapping)
334 if storageClass is None:
335 # This may be a problem, but only if we actually encounter any
336 # files corresponding to this dataset. Of course, we need
337 # to be able to parse those files in order to recognize that
338 # situation.
339 message = f"no storage class found for {datasetTypeName}"
340 skip = True
341 if skip:
342 walkerInput = RepoWalker.Skip(
343 template=template,
344 keys=mapping.keys(),
345 message=message,
346 )
347 else:
348 assert message is None
349 walkerInput = self.makeRepoWalkerTarget(
350 datasetTypeName=datasetTypeName,
351 template=template,
352 keys=mapping.keys(),
353 storageClass=storageClass,
354 )
355 walkerInputs.append(walkerInput)
356 for dirPath in self.getSpecialDirectories():
357 walkerInputs.append(
358 RepoWalker.Skip(
359 template=dirPath, # not really a template, but that's fine; it's relative to root.
360 keys={},
361 message=None,
362 isForFiles=True,
363 )
364 )
365 fileIgnoreRegExTerms = []
366 for pattern in self.task.config.fileIgnorePatterns:
367 fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
368 if fileIgnoreRegExTerms:
369 fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
370 else:
371 fileIgnoreRegEx = None
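        # Each term above comes from ``fnmatch.translate``, which converts a
        # shell-style pattern such as "*.log" into an equivalent regular
        # expression; joining the terms with "|" yields one regex that
        # matches any ignored file.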
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
        """
        yield from ()

    def findDatasets(self):
        """Walk the repository to find datasets, populating the internal
        per-dataset-type lists that `expandDataIds` and `ingest` operate on.

        `prep` must be called before this method.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                log=self.task.log,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def handleDataIdExpansionFailure(self, dataset: FileDataset, err: LookupError) -> bool:
        """Handle a failure to expand the data ID of a dataset.

        The base class implementation logs a warning and drops the dataset;
        subclasses that expect certain expansion failures may override this
        method to tolerate them (see `expandDataIds`).

        Parameters
        ----------
        dataset : `lsst.daf.butler.FileDataset`
            Dataset whose data ID could not be expanded.
        err : `LookupError`
            Exception raised by the failed expansion.

        Returns
        -------
        keep : `bool`
            `True` if the dataset should be ingested anyway.
        """
        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
        return False

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.  If they wish to handle expected
        failures in data ID expansion, they should override
        `handleDataIdExpansionFailure` instead.

        This involves queries to the registry, but not writes.  It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                keep = True
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        if not self.handleDataIdExpansionFailure(dataset, err):
                            keep = False
                            break
                # Append the dataset once, even when it has multiple refs, so
                # the expanded list contains no duplicates.
                if keep:
                    expanded.append(dataset)
            datasetsForType[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            self.task.log.info("Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
            try:
                collections = self.getCollections(datasetType.name)
            except LookupError as err:
                self.task.log.warn(str(err))
                continue
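            # The first collection returned by getCollections is the run the
            # datasets are ingested into; any remaining collections receive
            # the new datasets by association.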
            try:
                self.task.registry.registerRun(collections[0])
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer,
                                         run=collections[0])
            except LookupError as err:
                raise LookupError(f"Error ingesting datasets of type {datasetType.name}.") from err
            for collection in collections[1:]:
                self.task.registry.associate(collection,
                                             [ref for dataset in datasetsForType for ref in dataset.refs])

    def getCollections(self, datasetTypeName: str) -> List[str]:
        """Return the set of collections a particular dataset type should be
        associated with.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        collections : `list` of `str`
            Collections the dataset should be associated with.  The first
            item in the list is the run the dataset should be added to
            initially.
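
        Examples
        --------
        A sketch of the resolution order (the configuration values here are
        hypothetical): with ``config.collections = {"raw": "raw-run"}`` and
        converter-level collections ``["shared"]``, ``getCollections("raw")``
        returns ``["raw-run", "shared"]``, while ``getCollections("calexp")``
        returns ``["shared"]``.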
495 """
496 if datasetTypeName in self.task.config.collections:
497 return [self.task.config.collections[datasetTypeName]] + self._collections
498 elif self._collections:
499 return self._collections
500 else:
501 raise LookupError("No collection configured for dataset type {datasetTypeName}.")

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                # Recompute rather than relying on ``unqualified`` having been
                # bound by the configuration lookups above.
                unqualified = mapping.python.split(".")[-1]
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """