Coverage for python/lsst/obs/base/gen2to3/repoConverter.py: 17%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import itertools
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}
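
    # A minimal construction sketch (instrument name and visit IDs are
    # hypothetical; any Gen3 instrument/visit values would do):
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
    #     subset.tracts   # -> {}; populated later by addSkyMap()
    #     subset.skypix   # -> {}; populated later by addSkyPix()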

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name,
                                                           "instrument": self.instrument,
                                                           "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose overlapping pixel IDs should be included.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
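
    # Sketch of how the two "add" methods are used together, assuming
    # `registry` is an `lsst.daf.butler.Registry` that already has visit
    # records for the subset's instrument ("hsc_rings_v1" and "htm7" are
    # illustrative names):
    #
    #     subset.addSkyMap(registry, "hsc_rings_v1")               # fills subset.tracts
    #     subset.addSkyPix(registry, registry.dimensions["htm7"])  # fills subset.skypix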

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True
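
    # With the subset above, a data ID for visit 903334 passes isRelated(),
    # a data ID for any other visit fails it, and a data ID with no visit,
    # tract, or skypix dimensions (e.g. a camera-level calibration) always
    # passes.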

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized. Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.
    collections : `list` of `str`
        Gen3 collections with which all converted datasets should be
        associated.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses
    should use (`prep`, `findDatasets`, `insertDimensionData`,
    `expandDataIds`, and `ingest`). These delegate to several abstract
    methods that subclasses must implement. In some cases, subclasses may
    reimplement the public methods as well, but are expected to delegate to
    ``super()`` either at the beginning or end of their own implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, collections: List[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = root
        self.subset = subset
        self._collections = list(collections)
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset type is handled specially by this
        converter and hence should be ignored by generic base-class logic
        that searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()
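
    # A subclass override might look like this (the dataset type names are
    # purely illustrative):
    #
    #     def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
    #         return datasetTypeName in ("raw", "camera")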

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()
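
    # A subclass backed by a Gen2 `CameraMapper` might implement this as a
    # simple delegation (`self.mapper.mappings` is an illustrative
    # attribute, not part of this base class):
    #
    #     def iterMappings(self):
    #         yield from self.mapper.mappings.items()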

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much
            of it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose
        datasets are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            Directory paths to skip, relative to the repository root.
        """
        return []
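
    # For example, a subclass whose repository contains a nested calibration
    # repo might skip it like this ("CALIB" is an illustrative path):
    #
    #     def getSpecialDirectories(self) -> List[str]:
    #         return ["CALIB"]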

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates. These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about
                # them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually
                    # encounter any files corresponding to this dataset.
                    # Of course, we need to be able to parse those files in
                    # order to recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the Gen2
            # template is just ".fits".
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template + extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template + extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                    )
                    self.task.log.debug("Adding template to walker: %s", template)
                walkerInputs.append(walkerInput)
        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)
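
    # The ignore patterns above are shell-style globs; `fnmatch.translate`
    # turns each into a regular expression, and joining the results with
    # "|" matches any of them:
    #
    #     >>> import fnmatch, re
    #     >>> regex = re.compile("|".join(fnmatch.translate(p) for p in ("*.log", "README*")))
    #     >>> bool(regex.match("ingest.log")), bool(regex.match("data.fits"))
    #     (True, False)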

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled
        by the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested. Paths should
            be absolute.
        """
        yield from ()

    def findDatasets(self):
        """Walk the repository to find datasets that should be converted.

        Found datasets are recorded internally for use by later steps;
        `prep` must be called before this method.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                log=self.task.log,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this
        repository into the registry.

        Subclasses may override this method, but may not need to; the
        default implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2
        repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes. It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Remove skipped datasets from multi-extension
                        # FileDatasets; we strip off the `None`s after the
                        # loop.
                        dataset.refs[i] = None
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)
            datasetsForType[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            self.task.log.info("Ingesting %s %s datasets.", len(datasetsForType), datasetType.name)
            try:
                collections = self.getCollections(datasetType.name)
            except LookupError as err:
                self.task.log.warn(str(err))
                continue
            try:
                self.task.registry.registerRun(collections[0])
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer,
                                         run=collections[0])
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err
            for collection in collections[1:]:
                self.task.registry.associate(collection,
                                             [ref for dataset in datasetsForType for ref in dataset.refs])

    def getCollections(self, datasetTypeName: str) -> List[str]:
        """Return the set of collections a particular dataset type should be
        associated with.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        collections : `list` of `str`
            Collections the dataset should be associated with. The first
            item in the list is the run the dataset should be added to
            initially.
        """
        if datasetTypeName in self.task.config.collections:
            return [self.task.config.collections[datasetTypeName]] + self._collections
        elif self._collections:
            return self._collections
        else:
            raise LookupError(f"No collection configured for dataset type {datasetTypeName}.")
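
    # Worked example (hypothetical configuration): with
    # config.collections == {"raw": "HSC/raw"} and a converter constructed
    # with collections=["HSC/defaults"]:
    #
    #     self.getCollections("raw")   # -> ["HSC/raw", "HSC/defaults"]; run is "HSC/raw"
    #     self.getCollections("bias")  # -> ["HSC/defaults"]; run is "HSC/defaults"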

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            # Try the unqualified Python type name (e.g. "ExposureF" from
            # "lsst.afw.image.ExposureF").
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
            if storageClass is None:
                self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
            else:
                self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass
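
    # Worked example of the fallback chain (hypothetical values): for a
    # "calexp" mapping with python="lsst.afw.image.ExposureF" and
    # persistable="ExposureF", the config is consulted for, in order,
    # "calexp" (the dataset type name), "lsst.afw.image.ExposureF",
    # "ExposureF" (persistable), and "ExposureF" (unqualified python type);
    # if none is configured, the butler's storage-class registry is tried
    # directly with the persistable and unqualified names.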

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """