# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import itertools
import os.path
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.utils import doImport
from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker
if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDataIds(["tract"],
                                                dataId={"skymap": name,
                                                        "instrument": self.instrument,
                                                        "visit": visit}):
                tracts.add(dataId["tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose overlapping pixel ranges should be added.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True
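
    # A minimal usage sketch (illustrative only; assumes a configured Gen3
    # ``registry``, and the instrument name, skymap name, visit IDs, and
    # ``someDataId`` are invented for the example):
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
    #     subset.addSkyMap(registry, "hsc_rings_v1")
    #     if subset.isRelated(someDataId):
    #         ...  # include the dataset with this data ID in the conversion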

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized.  Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.  Will be converted to an
        absolute path, resolving symbolic links and ``~``, if necessary.
    run : `str`, optional
        Name of the Gen3 `~lsst.daf.butler.CollectionType.RUN` collection
        that converted datasets should be ingested into by default.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `insertDimensionData`, and `ingest`).  These delegate to
    several abstract methods that subclasses must implement.  In some cases,
    subclasses may reimplement the public methods as well, but are expected to
    delegate to ``super()`` either at the beginning or end of their own
    implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = os.path.realpath(os.path.expanduser(root))
        self.subset = subset
        self._run = run
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The full paths of directories to skip, relative to the repository
            root.
        """
        return []
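
    # A hypothetical override in a subclass (sketch only; the directory names
    # are examples, not part of this API):
    #
    #     def getSpecialDirectories(self) -> List[str]:
    #         # Skip a nested calibration repo handled by another converter.
    #         return super().getSpecialDirectories() + ["CALIB", "rerun"]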

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.
        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates.  These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset.  Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the Gen2 template
            # is just ".fits".
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template + extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template + extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s + %s, for %s", template, extension,
                                        walkerInput.datasetType)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx,
                                      log=self.task.log.getChild("repoWalker"))

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
        """
        yield from ()
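
    # A hypothetical override in a subclass (sketch only; the path, dataset
    # type, and data ID are invented, and `DatasetRef` would need to be
    # imported from `lsst.daf.butler`):
    #
    #     def iterDatasets(self) -> Iterator[FileDataset]:
    #         ref = DatasetRef(someDatasetType, dataId={"instrument": "HSC"})
    #         yield FileDataset(path=os.path.join(self.root, "some/file.fits"),
    #                           refs=[ref])
    #         yield from super().iterDatasets()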

    def findDatasets(self):
        """Scan the repository for datasets to be converted, recording them
        internally by dataset type.

        `prep` must be called before this method.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes.  It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Remove skipped datasets from multi-extension
                        # FileDatasets; we strip off the `None`s after the
                        # loop.
                        dataset.refs[i] = None
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)
            datasetsForType[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            try:
                run = self.getRun(datasetType.name)
            except LookupError:
                self.task.log.warn(f"No run configured for dataset type {datasetType.name}.")
                continue
            self.task.log.info("Ingesting %s %s datasets into run %s.", len(datasetsForType),
                               datasetType.name, run)
            try:
                self.task.registry.registerRun(run)
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer, run=run)
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err

    def getRun(self, datasetTypeName: str) -> str:
        """Return the name of the run into which instances of the given
        dataset type should be inserted.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        return self._run
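
    # A hypothetical override in a subclass (sketch only; the dataset type
    # and run names are examples, not part of this API):
    #
    #     def getRun(self, datasetTypeName: str) -> str:
    #         if datasetTypeName == "raw":
    #             return "HSC/raw/all"
    #         return super().getRun(datasetTypeName)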

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
            if storageClass is None:
                self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
            else:
                self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass
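
    # The guessing above can be short-circuited from configuration, since the
    # dataset type name is checked first.  A sketch, assuming the task's
    # ``storageClasses`` config field supports item assignment (the dataset
    # type name here is an example):
    #
    #     config.storageClasses["calexp"] = "ExposureF"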

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """