# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import itertools
import re
from typing import (
    Dict,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.utils import doImport
from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those that
        overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDimensions(["tract"], expand=False,
                                                   dataId={"skymap": name,
                                                           "instrument": self.instrument,
                                                           "visit": visit}):
                tracts.add(dataId["tract"])
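
    # A minimal usage sketch (the skymap name and visit IDs are illustrative;
    # the registry must already contain the corresponding visit and tract
    # records):
    #
    #     subset = ConversionSubset("HSC", visits={903334, 903336})
    #     subset.addSkyMap(registry, "hsc_rings_v1")
    #     subset.tracts["hsc_rings_v1"]  # -> set of overlapping tract IDs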

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization is used to compute the
            included ID ranges.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
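
    # A minimal usage sketch, assuming the registry's dimension universe
    # provides the usual "htm7" skypix dimension:
    #
    #     dimension = registry.dimensions["htm7"]
    #     subset.addSkyPix(registry, dimension)
    #     subset.skypix[dimension]  # -> RangeSet of pixel indices covering
    #                               #    the regions of all included visits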

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True
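
    # How callers are expected to use the filter (``refs`` is a hypothetical
    # iterable of objects carrying expanded Gen3 data IDs):
    #
    #     for ref in refs:
    #         if subset.isRelated(ref.dataId):
    #             ...  # include this dataset in the conversion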

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized.  Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.
    run : `str`, optional
        Name of the Gen3 `~lsst.daf.butler.CollectionType.RUN` collection
        that converted datasets should be ingested into.  May be `None` only
        if `getRun` is overridden to provide a run for each dataset type.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `findDatasets`, `insertDimensionData`, `expandDataIds`, and
    `ingest`).  These delegate to several abstract methods that subclasses
    must implement.  In some cases, subclasses may reimplement the public
    methods as well, but are expected to delegate to ``super()`` either at the
    beginning or end of their own implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = root
        self.subset = subset
        self._run = run
        self._repoWalker = None  # Created in prep
        self._fileDatasets: MutableMapping[DatasetType, List[FileDataset]] = defaultdict(list)

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset type is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()
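
    # A sketch of a typical subclass implementation (the dataset type name
    # is illustrative, not prescribed by this base class):
    #
    #     def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
    #         # "raw" is ingested via a dedicated code path, so the generic
    #         # walker logic should not try to convert it.
    #         return datasetTypeName == "raw"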

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion by this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()
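
    # A sketch of a subclass implementation, assuming the subclass holds a
    # Gen2 ``CameraMapper`` instance whose ``mappings`` attribute maps
    # dataset type name to its `Mapping` object:
    #
    #     def iterMappings(self):
    #         yield from self.mapper.mappings.items()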

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()
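
    # A sketch of a subclass implementation, assuming `RepoWalker.Target` is
    # a simple struct that stores these fields plus the dimension universe
    # (check its actual constructor; the keyword names here are assumptions):
    #
    #     def makeRepoWalkerTarget(self, datasetTypeName, template, keys,
    #                              storageClass, formatter=None,
    #                              targetHandler=None):
    #         return RepoWalker.Target(
    #             datasetTypeName=datasetTypeName,
    #             storageClass=storageClass,
    #             template=template,
    #             keys=keys,
    #             universe=self.task.registry.dimensions,
    #             formatter=formatter,
    #             targetHandler=targetHandler,
    #         )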

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The paths of directories to skip, relative to the repository
            root.
        """
        return []
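
    # A sketch of a typical override (the directory names are illustrative):
    #
    #     def getSpecialDirectories(self) -> List[str]:
    #         # CALIB holds a nested calibration repo; rerun holds child
    #         # repositories that are converted separately.
    #         return ["CALIB", "rerun"]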

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.  More often,
        subclasses will specialize the behavior of `prep` by overriding other
        methods to which the base class implementation delegates.  These
        include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `insertDimensionData`.
        """
        self.task.log.info("Preparing other dataset types from root %s.", self.root)
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset.  Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the gen2 template
            # is just `.fits`.
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template+extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template+extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s", template)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx)
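
    # How the ignore-pattern handling above behaves, as a standalone sketch
    # (pure stdlib; the patterns are illustrative):
    #
    #     import fnmatch
    #     import re
    #     patterns = ["*.log", "README*"]
    #     regex = re.compile("|".join(fnmatch.translate(p) for p in patterns))
    #     assert regex.match("pipeline.log")
    #     assert not regex.match("image.fits")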

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested
        into the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
        """
        yield from ()
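
    # A sketch of a subclass override; ``ref`` stands in for a `DatasetRef`
    # built elsewhere for a file that falls outside the normal template
    # scheme:
    #
    #     def iterDatasets(self):
    #         yield from super().iterDatasets()
    #         yield FileDataset(path="/abs/path/to/file.fits", refs=[ref])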

    def findDatasets(self):
        """Walk the repository and populate the internal mapping from dataset
        type to the `FileDataset` instances that should be ingested.

        Must be called after `prep`.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            self._fileDatasets[dataset.refs[0].datasetType].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        self._fileDatasets.update(
            self._repoWalker.walk(
                self.root,
                log=self.task.log,
                predicate=(self.subset.isRelated if self.subset is not None else None)
            )
        )

    def insertDimensionData(self):
        """Insert any dimension records uniquely derived from this repository
        into the registry.

        Subclasses may override this method, but may not need to; the default
        implementation does nothing.

        SkyMap and SkyPix dimensions should instead be handled by calling
        `ConvertRepoTask.useSkyMap` or `ConvertRepoTask.useSkyPix`, because
        these dimensions are in general shared by multiple Gen2 repositories.

        This method is guaranteed to be called between `prep` and
        `expandDataIds`.
        """
        pass

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes.  It is
        guaranteed to be called between `insertDimensionData` and `ingest`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.log.info("Expanding data IDs for %s %s datasets.", len(datasetsForType),
                               datasetType.name)
            expanded = []
            for dataset in datasetsForType:
                for i, ref in enumerate(dataset.refs):
                    try:
                        dataId = self.task.registry.expandDataId(ref.dataId)
                        dataset.refs[i] = ref.expanded(dataId)
                    except LookupError as err:
                        self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                        # Remove skipped datasets from multi-extension
                        # FileDatasets; we strip off the `None`s after the
                        # loop.
                        dataset.refs[i] = None
                dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                if dataset.refs:
                    expanded.append(dataset)

            datasetsForType[:] = expanded
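
    # The `None`-stripping idiom used above, in isolation:
    #
    #     import itertools
    #     refs = ["a", None, "b", None]
    #     refs[:] = itertools.filterfalse(lambda x: x is None, refs)
    #     # refs == ["a", "b"]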

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsForType in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            try:
                run = self.getRun(datasetType.name)
            except LookupError:
                self.task.log.warn("No run configured for dataset type %s.", datasetType.name)
                continue
            self.task.log.info("Ingesting %s %s datasets into run %s.", len(datasetsForType),
                               datasetType.name, run)
            try:
                self.task.registry.registerRun(run)
                self.task.butler3.ingest(*datasetsForType, transfer=self.task.config.transfer, run=run)
            except LookupError as err:
                raise LookupError(f"Error expanding data ID for dataset type {datasetType.name}.") from err

    def getRun(self, datasetTypeName: str) -> str:
        """Return the name of the run into which instances of the given
        dataset type should be inserted.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        return self._run
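
    # A sketch of an override for converters that route different dataset
    # types to different runs (``self._runForType`` is a hypothetical dict
    # populated by the subclass):
    #
    #     def getRun(self, datasetTypeName: str) -> str:
    #         run = self._runForType.get(datasetTypeName, self._run)
    #         if run is None:
    #             raise LookupError(f"No run for {datasetTypeName}.")
    #         return run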

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset type from a combination
        of configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            # Fall back to the unqualified Gen2 python type name.
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    # `unqualified` is always bound here: storageClassName is
                    # still `None`, so the branch above that defines it ran.
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
        if storageClass is None:
            self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
        else:
            self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """