# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import os.path
import re
from typing import (
    Dict,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.utils import doImport
from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from collections.abc.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter
    from .._instrument import Instrument


@dataclass
class ConversionSubset:
    """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
    lists of related data ID values that should be included in the conversion.

    Parameters
    ----------
    instrument : `str`
        Instrument name used in Gen3 data IDs.
    visits : `set` of `int`
        Visit IDs that define the filter.
    """

    def __init__(self, instrument: str, visits: Set[int]):
        self.instrument = instrument
        self.visits = visits
        self.regions = None
        self.tracts = {}
        self.skypix = {}

    def addSkyMap(self, registry: Registry, name: str):
        """Populate the included tract IDs for the given skymap from those that
        overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit/tract overlaps.
        name : `str`
            SkyMap name used in Gen3 data IDs.
        """
        tracts = set()
        self.tracts[name] = tracts
        for visit in self.visits:
            for dataId in registry.queryDataIds(["tract"],
                                                dataId={"skymap": name,
                                                        "instrument": self.instrument,
                                                        "visit": visit}):
                tracts.add(dataId["tract"])
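
    # A minimal usage sketch (illustrative only): the ``registry`` instance
    # and the instrument/visit/skymap values below are assumptions, not part
    # of this module.
    #
    #     subset = ConversionSubset(instrument="HSC", visits={903334, 903336})
    #     subset.addSkyMap(registry, "hsc_rings_v1")
    #     # subset.tracts["hsc_rings_v1"] now holds every tract that overlaps
    #     # one of the visits.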

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from those
        that overlap the visits the `ConversionSubset` was initialized with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose pixelization is used to compute the
            overlapping index ranges.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
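
    # Sketch of what this method records (illustrative only; the ``registry``
    # and the skypix-dimension lookup below are assumptions):
    #
    #     htm7 = registry.dimensions["htm7"]
    #     subset.addSkyPix(registry, htm7)
    #     # subset.skypix[htm7] is a RangeSet of pixel indices whose regions
    #     # overlap at least one visit region.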

    def isRelated(self, dataId: DataCoordinate) -> bool:
        """Test whether the given data ID is related to this subset and hence
        should be included in a repository conversion.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            Data ID to test.

        Returns
        -------
        related : `bool`
            `True` if this data ID should be included in a repository
            conversion.

        Notes
        -----
        More formally, this tests that the given data ID is not unrelated;
        if a data ID does not involve tracts, visits, or skypix dimensions,
        we always include it.
        """
        if self.visits is None:
            # We're not filtering at all.
            return True
        if "visit" in dataId.graph and dataId["visit"] not in self.visits:
            return False
        if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
            return False
        for dimension, ranges in self.skypix.items():
            if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
                return False
        return True
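
    # `isRelated` is designed to be used as a per-data-ID predicate, e.g. as
    # the ``predicate`` argument to `RepoWalker.walk` (see
    # `RepoConverter.findDatasets` below).  A standalone sketch, with the
    # names below purely hypothetical:
    #
    #     if subset.isRelated(ref.dataId):
    #         convert(ref)  # hypothetical helper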

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    instrument: str
    """The name of the instrument, as used in Gen3 data IDs (`str`).
    """

    visits: Set[int]
    """The set of visit IDs that should be included in the conversion (`set`
    of `int`).
    """

    regions: Optional[List[Region]]
    """Regions for all visits (`list` of `lsst.sphgeom.Region`).

    Set to `None` before it has been initialized.  Any code that attempts to
    use it when it is `None` has a logic bug.
    """

    tracts: Dict[str, Set[int]]
    """Tracts that should be included in the conversion, grouped by skymap
    name (`dict` mapping `str` to `set` of `int`).
    """

    skypix: Dict[SkyPixDimension, RangeSet]
    """SkyPix ranges that should be included in the conversion, grouped by
    dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
    """


class RepoConverter(ABC):
    """An abstract base class for objects that help `ConvertRepoTask` convert
    datasets from a single Gen2 repository.

    Parameters
    ----------
    task : `ConvertRepoTask`
        Task instance that is using this helper object.
    root : `str`
        Root of the Gen2 repo being converted.  Will be converted to an
        absolute path, resolving symbolic links and ``~``, if necessary.
    instrument : `Instrument`
        Gen3 instrument class to use for this conversion.
    run : `str`, optional
        Name of the Gen3 `~lsst.daf.butler.CollectionType.RUN` collection
        into which converted datasets should be ingested by default.  May be
        `None` only if subclasses override `getRun`.
    subset : `ConversionSubset`, optional
        Helper object that implements a filter that restricts the data IDs
        that are converted.

    Notes
    -----
    `RepoConverter` defines the only public API users of its subclasses should
    use (`prep`, `ingest`, and `finish`).  These delegate to several abstract
    methods that subclasses must implement.  In some cases, subclasses may
    reimplement the public methods as well, but are expected to delegate to
    ``super()`` either at the beginning or end of their own implementation.
    """

    def __init__(self, *, task: ConvertRepoTask, root: str, instrument: Instrument, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = os.path.realpath(os.path.expanduser(root))
        self.instrument = instrument
        self.subset = subset
        self._run = run
        self._repoWalker = None  # Created in prep
        self._fileDatasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]] \
            = defaultdict(lambda: defaultdict(list))

    @abstractmethod
    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        """Test whether the given dataset is handled specially by this
        converter and hence should be ignored by generic base-class logic that
        searches for dataset types to convert.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type to test.

        Returns
        -------
        special : `bool`
            `True` if the dataset type is special.
        """
        raise NotImplementedError()
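
    # A minimal subclass implementation sketch (the dataset type names below
    # are hypothetical examples, not required by this interface):
    #
    #     def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
    #         # These types are converted by dedicated logic elsewhere in the
    #         # subclass, so the generic template walker should ignore them.
    #         return datasetTypeName in ("raw", "camera")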

    @abstractmethod
    def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
        """Iterate over all `CameraMapper` `Mapping` objects that should be
        considered for conversion from this repository.

        This should include any datasets that may appear in the repository,
        including those that are special (see `isDatasetTypeSpecial`) and
        those that are being ignored (see
        `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
        to identify and hence skip these datasets quietly instead of warning
        about them as unrecognized.

        Yields
        ------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.
        """
        raise NotImplementedError()
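
    # A subclass implementation sketch.  The ``self._mapper.mappings`` dict
    # is an assumption about how a concrete converter might hold its Gen2
    # `CameraMapper` mappings; it is not part of this base class:
    #
    #     def iterMappings(self):
    #         yield from self._mapper.mappings.items()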

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much of
            it simply forwarded from the arguments).
        """
        raise NotImplementedError()

    def getSpecialDirectories(self) -> List[str]:
        """Return a list of directory paths that should not be searched for
        files.

        These may be directories that simply do not contain datasets (or
        contain datasets in another repository), or directories whose datasets
        are handled specially by a subclass.

        Returns
        -------
        directories : `list` [`str`]
            The full paths of directories to skip, relative to the repository
            root.
        """
        return []
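
    # An override sketch; the directory names are hypothetical examples of
    # Gen2 layouts a concrete converter might want to skip:
    #
    #     def getSpecialDirectories(self):
    #         return super().getSpecialDirectories() + ["CALIB", "rerun"]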

    def prep(self):
        """Perform preparatory work associated with the dataset types to be
        converted from this repository (but not the datasets themselves).

        Notes
        -----
        This should be a relatively fast operation that should not depend on
        the size of the repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        More often, subclasses will specialize the behavior of `prep` by
        overriding other methods to which the base class implementation
        delegates.  These include:

        - `iterMappings`
        - `isDatasetTypeSpecial`
        - `getSpecialDirectories`
        - `makeRepoWalkerTarget`

        This should not perform any write operations to the Gen3 repository.
        It is guaranteed to be called before `ingest`.
        """
        self.task.log.info(f"Preparing other dataset types from root {self.root}.")
        walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
        for datasetTypeName, mapping in self.iterMappings():
            try:
                template = mapping.template
            except RuntimeError:
                # No template for this dataset in this mapper, so there's no
                # way there should be instances of this dataset in this repo.
                continue
            extensions = [""]
            skip = False
            message = None
            storageClass = None
            if (not self.task.isDatasetTypeIncluded(datasetTypeName)
                    or self.isDatasetTypeSpecial(datasetTypeName)):
                # User indicated not to include this data, but we still want
                # to recognize files of that type to avoid warning about them.
                skip = True
            else:
                storageClass = self._guessStorageClass(datasetTypeName, mapping)
                if storageClass is None:
                    # This may be a problem, but only if we actually encounter
                    # any files corresponding to this dataset.  Of course, we
                    # need to be able to parse those files in order to
                    # recognize that situation.
                    message = f"no storage class found for {datasetTypeName}"
                    skip = True
            # Handle files that are compressed on disk, but the gen2 template
            # is just `.fits`
            if template.endswith(".fits"):
                extensions.extend((".gz", ".fz"))
            for extension in extensions:
                if skip:
                    walkerInput = RepoWalker.Skip(
                        template=template+extension,
                        keys=mapping.keys(),
                        message=message,
                    )
                    self.task.log.debug("Skipping template in walker: %s", template)
                else:
                    assert message is None
                    targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
                    if targetHandler is not None:
                        targetHandler = doImport(targetHandler)
                    walkerInput = self.makeRepoWalkerTarget(
                        datasetTypeName=datasetTypeName,
                        template=template+extension,
                        keys=mapping.keys(),
                        storageClass=storageClass,
                        formatter=self.task.config.formatterClasses.get(datasetTypeName),
                        targetHandler=targetHandler,
                    )
                    self.task.log.debug("Adding template to walker: %s + %s, for %s", template, extension,
                                        walkerInput.datasetType)
                walkerInputs.append(walkerInput)

        for dirPath in self.getSpecialDirectories():
            walkerInputs.append(
                RepoWalker.Skip(
                    template=dirPath,  # not really a template, but that's fine; it's relative to root.
                    keys={},
                    message=None,
                    isForFiles=True,
                )
            )
        fileIgnoreRegExTerms = []
        for pattern in self.task.config.fileIgnorePatterns:
            fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
        if fileIgnoreRegExTerms:
            fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
        else:
            fileIgnoreRegEx = None
        self._repoWalker = RepoWalker(walkerInputs, fileIgnoreRegEx=fileIgnoreRegEx,
                                      log=self.task.log.getChild("repoWalker"))

    def iterDatasets(self) -> Iterator[FileDataset]:
        """Iterate over datasets in the repository that should be ingested into
        the Gen3 repository.

        The base class implementation yields nothing; the datasets handled by
        the `RepoConverter` base class itself are read directly in
        `findDatasets`.

        Subclasses should override this method if they support additional
        datasets that are handled some other way.

        Yields
        ------
        dataset : `FileDataset`
            Structures representing datasets to be ingested.  Paths should be
            absolute.
        """
        yield from ()
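
    # An override sketch for a converter that has datasets which do not come
    # from walking the directory tree.  The path and ``ref`` below are
    # hypothetical; building the `DatasetRef` is elided:
    #
    #     def iterDatasets(self):
    #         yield from super().iterDatasets()
    #         yield FileDataset(path="/abs/path/to/defects.fits", refs=[ref])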

    def findDatasets(self):
        """Walk the repository to find datasets to be converted, populating
        the internal per-dataset-type collections used by later steps.

        This combines any datasets yielded by `iterDatasets` with those found
        by the `RepoWalker` constructed in `prep`, filtered by ``subset`` if
        one was provided.
        """
        assert self._repoWalker, "prep() must be called before findDatasets."
        self.task.log.info("Adding special datasets in repo %s.", self.root)
        for dataset in self.iterDatasets():
            assert len(dataset.refs) == 1
            # None index below is for calibDate, which is only relevant for
            # CalibRepoConverter.
            self._fileDatasets[dataset.refs[0].datasetType][None].append(dataset)
        self.task.log.info("Finding datasets from files in repo %s.", self.root)
        datasetsByTypeAndCalibDate = self._repoWalker.walk(
            self.root,
            predicate=(self.subset.isRelated if self.subset is not None else None)
        )
        for datasetType, datasetsByCalibDate in datasetsByTypeAndCalibDate.items():
            for calibDate, datasets in datasetsByCalibDate.items():
                self._fileDatasets[datasetType][calibDate].extend(datasets)

    def expandDataIds(self):
        """Expand the data IDs for all datasets to be inserted.

        Subclasses may override this method, but must delegate to the base
        class implementation if they do.

        This involves queries to the registry, but not writes.  It is
        guaranteed to be called between `findDatasets` and `ingest`.
        """
        import itertools
        for datasetType, datasetsByCalibDate in self._fileDatasets.items():
            for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                nDatasets = len(datasetsForCalibDate)
                suffix = "" if nDatasets == 1 else "s"
                if calibDate is not None:
                    self.task.log.info("Expanding data IDs for %s %s dataset%s at calibDate %s.",
                                       nDatasets,
                                       datasetType.name,
                                       suffix,
                                       calibDate)
                else:
                    self.task.log.info("Expanding data IDs for %s %s non-calibration dataset%s.",
                                       nDatasets,
                                       datasetType.name,
                                       suffix)
                expanded = []
                for dataset in datasetsForCalibDate:
                    for i, ref in enumerate(dataset.refs):
                        self.task.log.debug("Expanding data ID %s.", ref.dataId)
                        try:
                            dataId = self.task.registry.expandDataId(ref.dataId)
                            dataset.refs[i] = ref.expanded(dataId)
                        except LookupError as err:
                            self.task.log.warn("Skipping ingestion for '%s': %s", dataset.path, err)
                            # Remove skipped datasets from multi-extension
                            # FileDatasets.
                            dataset.refs[i] = None  # We will strip off the `None`s after the loop.
                    dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
                    if dataset.refs:
                        expanded.append(dataset)
                datasetsForCalibDate[:] = expanded

    def ingest(self):
        """Insert converted datasets into the Gen3 repository.

        Subclasses may override this method, but must delegate to the base
        class implementation at some point in their own logic.

        This method is guaranteed to be called after `expandDataIds`.
        """
        for datasetType, datasetsByCalibDate in self._fileDatasets.items():
            self.task.registry.registerDatasetType(datasetType)
            for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
                try:
                    run = self.getRun(datasetType.name, calibDate)
                except LookupError:
                    self.task.log.warn(f"No run configured for dataset type {datasetType.name}.")
                    continue
                nDatasets = len(datasetsForCalibDate)
                self.task.log.info("Ingesting %s %s dataset%s into run %s.", nDatasets,
                                   datasetType.name, "" if nDatasets == 1 else "s", run)
                try:
                    self.task.registry.registerRun(run)
                    self.task.butler3.ingest(*datasetsForCalibDate, transfer=self.task.config.transfer,
                                             run=run)
                except LookupError as err:
                    raise LookupError(
                        f"Error expanding data ID for dataset type {datasetType.name}."
                    ) from err

    def finish(self) -> None:
        """Finish the conversion by delegating to `_finish` with all of the
        datasets gathered by this converter.
        """
        self._finish(self._fileDatasets)

    def _finish(self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]]) -> None:
        """Subclass hook called by `finish`; the base class implementation
        does nothing.
        """
        pass

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        """Return the name of the run into which instances of the given
        dataset type should be inserted.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        calibDate : `str`, optional
            If not `None`, the "CALIBDATE" associated with this (calibration)
            dataset in the Gen2 data repository.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        assert calibDate is None, "Method must be overridden if calibDate is allowed to be not None"
        return self._run
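
    # An override sketch for a converter whose calibration datasets go into
    # per-CALIBDATE runs.  The run-name pattern is a hypothetical example:
    #
    #     def getRun(self, datasetTypeName, calibDate=None):
    #         if calibDate is None:
    #             return super().getRun(datasetTypeName)
    #         return f"{self._run}/{calibDate}"
    #
    # Raising `LookupError` here is how an override tells `ingest` that no
    # run is configured for a dataset type (see `ingest` above).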

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The storage class to use, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
            if storageClass is None:
                self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
            else:
                self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass
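
    # The lookup order above is: explicit config keyed by dataset type name,
    # then by the Gen2 python type, the Gen2 persistable name, and the
    # unqualified python type name, and finally the butler's own
    # storage-class registry.  A configuration sketch (the names below are
    # hypothetical examples, not defaults of this package):
    #
    #     config.storageClasses["calexp"] = "ExposureF"
    #     config.storageClasses["lsst.afw.table.SourceCatalog"] = "SourceCatalog"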

    # Class attributes that will be shadowed by public instance attributes;
    # defined here only for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """