Coverage for python/lsst/obs/base/gen2to3/repoConverter.py: 20%
220 statements
coverage.py v6.4.1, created at 2022-06-09 03:03 -0700
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["RepoConverter"]
25import fnmatch
26import os.path
27import re
28from abc import ABC, abstractmethod
29from collections import defaultdict
30from dataclasses import dataclass
31from typing import TYPE_CHECKING, Dict, Iterator, List, Mapping, Optional, Set, Tuple, Union
33from lsst.daf.butler import DataCoordinate, DatasetType, FileDataset, Progress
34from lsst.daf.butler.registry import DataIdError
35from lsst.sphgeom import RangeSet, Region
36from lsst.utils import doImportType
38from ..ingest import _log_msg_counter
39from .repoWalker import RepoWalker
41if TYPE_CHECKING:  # 41 ↛ 42: line 41 didn't jump to line 42 because the condition on line 41 was never true
42 from lsst.daf.butler import FormatterParameter, Registry, SkyPixDimension, StorageClass
44 from .._instrument import Instrument
45 from ..mapping import Mapping as CameraMapperMapping # disambiguate from collections.abc.Mapping
46 from .convertRepo import ConvertRepoTask
47 from .scanner import PathElementHandler
50@dataclass
51class ConversionSubset:
52 """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
53 lists of related data ID values that should be included in the conversion.
55 Parameters
56 ----------
57 instrument : `str`
58 Instrument name used in Gen3 data IDs.
59 visits : `set` of `int`
60 Visit IDs that define the filter.
61 """
63 def __init__(self, instrument: str, visits: Set[int]):
64 self.instrument = instrument
65 self.visits = visits
66 self.regions = None
67 self.tracts = {}
68 self.skypix = {}
70 def addSkyMap(self, registry: Registry, name: str):
71 """Populate the included tract IDs for the given skymap from those that
72 overlap the visits the `ConversionSubset` was initialized with.
74 Parameters
75 ----------
76 registry : `lsst.daf.butler.Registry`
77 Registry that can be queried for visit/tract overlaps.
78 name : `str`
79 SkyMap name used in Gen3 data IDs.
80 """
81 tracts = set()
82 self.tracts[name] = tracts
83 for visit in self.visits:
84 for dataId in registry.queryDataIds(
85 ["tract"], dataId={"skymap": name, "instrument": self.instrument, "visit": visit}
86 ):
87 tracts.add(dataId["tract"])
89 def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
90 """Populate the included skypix IDs for the given dimension from those
91 that overlap the visits the `ConversionSubset` was initialized with.
93 Parameters
94 ----------
95 registry : `lsst.daf.butler.Registry`
96 Registry that can be queried for visit regions.
 97 dimension : `lsst.daf.butler.SkyPixDimension`
 98 SkyPix dimension whose overlapping pixel ranges should be computed.
99 """
100 if self.regions is None:
101 self.regions = []
102 for visit in self.visits:
103 dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
104 self.regions.append(dataId.region)
105 ranges = RangeSet()
106 for region in self.regions:
107 ranges = ranges.union(dimension.pixelization.envelope(region))
108 self.skypix[dimension] = ranges
110 def isRelated(self, dataId: DataCoordinate) -> bool:
111 """Test whether the given data ID is related to this subset and hence
112 should be included in a repository conversion.
114 Parameters
115 ----------
116 dataId : `lsst.daf.butler.DataCoordinate`
117 Data ID to test.
119 Returns
120 -------
121 related : `bool`
122 `True` if this data ID should be included in a repository
123 conversion.
125 Notes
126 -----
127 More formally, this tests that the given data ID is not unrelated;
128 if a data ID does not involve tracts, visits, or skypix dimensions,
129 we always include it.
130 """
131 if self.visits is None:
132 # We're not filtering at all.
133 return True
134 if "visit" in dataId.graph and dataId["visit"] not in self.visits:
135 return False
136 if "tract" in dataId.graph and dataId["tract"] not in self.tracts[dataId["skymap"]]:
137 return False
138 for dimension, ranges in self.skypix.items():
139 if dimension in dataId.graph and not ranges.intersects(dataId[dimension]):
140 return False
141 return True
143 # Class attributes that will be shadowed by public instance attributes;
144 # defined here only for documentation purposes.
146 instrument: str
147 """The name of the instrument, as used in Gen3 data IDs (`str`).
148 """
150 visits: Set[int]
151 """The set of visit IDs that should be included in the conversion (`set`
152 of `int`).
153 """
155 regions: Optional[List[Region]]
156 """Regions for all visits (`list` of `lsst.sphgeom.Region`).
158 Set to `None` before it has been initialized. Any code that attempts to
159 use it when it is `None` has a logic bug.
160 """
162 tracts: Dict[str, Set[int]]
163 """Tracts that should be included in the conversion, grouped by skymap
164 name (`dict` mapping `str` to `set` of `int`).
165 """
167 skypix: Dict[SkyPixDimension, RangeSet]
168 """SkyPix ranges that should be included in the conversion, grouped by
169 dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
170 """
173class RepoConverter(ABC):
174 """An abstract base class for objects that help `ConvertRepoTask` convert
175 datasets from a single Gen2 repository.
177 Parameters
178 ----------
179 task : `ConvertRepoTask`
180 Task instance that is using this helper object.
181 root : `str`
182 Root of the Gen2 repo being converted. Will be converted to an
183 absolute path, resolving symbolic links and ``~``, if necessary.
184 instrument : `Instrument`
185 Gen3 instrument class to use for this conversion.
 186 run : `str` or `None`
 187 Name of the Gen3 `~lsst.daf.butler.CollectionType.RUN` collection that
 188 converted datasets are ingested into by default (see `getRun`).
 189 subset : `ConversionSubset`, optional
190 Helper object that implements a filter that restricts the data IDs that
191 are converted.
193 Notes
194 -----
195 `RepoConverter` defines the only public API users of its subclasses should
 195 use (`prep`, `ingest`, and `finish`). These delegate to several abstract
197 methods that subclasses must implement. In some cases, subclasses may
198 reimplement the public methods as well, but are expected to delegate to
199 ``super()`` either at the beginning or end of their own implementation.
200 """
202 def __init__(
203 self,
204 *,
205 task: ConvertRepoTask,
206 root: str,
207 instrument: Instrument,
208 run: Optional[str],
209 subset: Optional[ConversionSubset] = None,
210 ):
211 self.task = task
212 self.root = os.path.realpath(os.path.expanduser(root))
213 self.instrument = instrument
214 self.subset = subset
215 self.progress = Progress("obs.base.gen2to3")
216 self._run = run
217 self._repoWalker = None # Created in prep
218 self._fileDatasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]] = defaultdict(
219 lambda: defaultdict(list)
220 )
221 self._fileDatasetCount = 0
223 @abstractmethod
224 def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
225 """Test whether the given dataset is handled specially by this
226 converter and hence should be ignored by generic base-class logic that
227 searches for dataset types to convert.
229 Parameters
230 ----------
231 datasetTypeName : `str`
232 Name of the dataset type to test.
234 Returns
235 -------
236 special : `bool`
237 `True` if the dataset type is special.
238 """
239 raise NotImplementedError()
241 @abstractmethod
242 def iterMappings(self) -> Iterator[Tuple[str, CameraMapperMapping]]:
243 """Iterate over all `CameraMapper` `Mapping` objects that should be
 244 considered for conversion in this repository.
 246 This should include any datasets that may appear in the
247 repository, including those that are special (see
248 `isDatasetTypeSpecial`) and those that are being ignored (see
249 `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
250 to identify and hence skip these datasets quietly instead of warning
251 about them as unrecognized.
253 Yields
254 ------
 255 datasetTypeName : `str`
256 Name of the dataset type.
257 mapping : `lsst.obs.base.mapping.Mapping`
258 Mapping object used by the Gen2 `CameraMapper` to describe the
259 dataset type.
260 """
261 raise NotImplementedError()
263 @abstractmethod
264 def makeRepoWalkerTarget(
265 self,
266 datasetTypeName: str,
267 template: str,
268 keys: Dict[str, type],
269 storageClass: StorageClass,
270 formatter: FormatterParameter = None,
271 targetHandler: Optional[PathElementHandler] = None,
272 ) -> RepoWalker.Target:
273 """Make a struct that identifies a dataset type to be extracted by
274 walking the repo directory structure.
276 Parameters
277 ----------
278 datasetTypeName : `str`
279 Name of the dataset type (the same in both Gen2 and Gen3).
280 template : `str`
281 The full Gen2 filename template.
282 keys : `dict` [`str`, `type`]
283 A dictionary mapping Gen2 data ID key to the type of its value.
284 storageClass : `lsst.daf.butler.StorageClass`
285 Gen3 storage class for this dataset type.
286 formatter : `lsst.daf.butler.Formatter` or `str`, optional
287 A Gen 3 formatter class or fully-qualified name.
288 targetHandler : `PathElementHandler`, optional
289 Specialist target handler to use for this dataset type.
291 Returns
292 -------
293 target : `RepoWalker.Target`
294 A struct containing information about the target dataset (much of
295 it simplify forwarded from the arguments).
296 """
297 raise NotImplementedError()
299 def getSpecialDirectories(self) -> List[str]:
300 """Return a list of directory paths that should not be searched for
301 files.
303 These may be directories that simply do not contain datasets (or
304 contain datasets in another repository), or directories whose datasets
305 are handled specially by a subclass.
307 Returns
308 -------
309 directories : `list` [`str`]
310 The full paths of directories to skip, relative to the repository
311 root.
312 """
313 return []
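# Illustrative sketch (not part of the original module): a subclass that wants
# the repo walker to skip certain subdirectories could override
# getSpecialDirectories along these lines. The directory names are
# hypothetical, not defaults shipped with this package.
#
#     def getSpecialDirectories(self) -> List[str]:
#         # Hypothetical layout: these subdirectories hold data handled elsewhere.
#         return ["CALIB", "rerun"]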
315 def prep(self):
316 """Perform preparatory work associated with the dataset types to be
317 converted from this repository (but not the datasets themselves).
319 Notes
320 -----
321 This should be a relatively fast operation that should not depend on
322 the size of the repository.
324 Subclasses may override this method, but must delegate to the base
325 class implementation at some point in their own logic.
326 More often, subclasses will specialize the behavior of `prep` by
327 overriding other methods to which the base class implementation
328 delegates. These include:
329 - `iterMappings`
330 - `isDatasetTypeSpecial`
331 - `getSpecialDirectories`
332 - `makeRepoWalkerTarget`
334 This should not perform any write operations to the Gen3 repository.
335 It is guaranteed to be called before `ingest`.
336 """
337 self.task.log.info("Preparing other dataset types from root %s.", self.root)
338 walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
339 for datasetTypeName, mapping in self.iterMappings():
340 try:
341 template = self.task.config.datasetTemplateOverrides.get(datasetTypeName, mapping.template)
342 except RuntimeError:
343 # No template for this dataset in this mapper, so there's no
344 # way there should be instances of this dataset in this repo.
345 continue
346 extensions = [""]
347 skip = False
348 message = None
349 storageClass = None
350 if not self.task.isDatasetTypeIncluded(datasetTypeName) or self.isDatasetTypeSpecial(
351 datasetTypeName
352 ):
353 # User indicated not to include this data, but we still want
354 # to recognize files of that type to avoid warning about them.
355 skip = True
356 else:
357 storageClass = self._guessStorageClass(datasetTypeName, mapping)
358 if storageClass is None:
359 # This may be a problem, but only if we actually encounter
360 # any files corresponding to this dataset. Of course, we
361 # need to be able to parse those files in order to
362 # recognize that situation.
363 message = f"no storage class found for {datasetTypeName}"
364 skip = True
365 # Handle files that are compressed on disk, but the gen2 template
366 # is just `.fits`
367 if template.endswith(".fits"):
368 extensions.extend((".gz", ".fz"))
369 for extension in extensions:
370 if skip:
371 walkerInput = RepoWalker.Skip(
372 template=template + extension,
373 keys=mapping.keys(),
374 message=message,
375 )
376 self.task.log.debug("Skipping template in walker: %s", template)
377 else:
378 assert message is None
379 targetHandler = self.task.config.targetHandlerClasses.get(datasetTypeName)
380 if targetHandler is not None:
381 targetHandler = doImportType(targetHandler)
382 walkerInput = self.makeRepoWalkerTarget(
383 datasetTypeName=datasetTypeName,
384 template=template + extension,
385 keys=mapping.keys(),
386 storageClass=storageClass,
387 formatter=self.task.config.formatterClasses.get(datasetTypeName),
388 targetHandler=targetHandler,
389 )
390 self.task.log.debug(
391 "Adding template to walker: %s + %s, for %s",
392 template,
393 extension,
394 walkerInput.datasetType,
395 )
396 walkerInputs.append(walkerInput)
398 for dirPath in self.getSpecialDirectories():
399 walkerInputs.append(
400 RepoWalker.Skip(
401 template=dirPath, # not really a template, but that's fine; it's relative to root.
402 keys={},
403 message=None,
404 isForFiles=True,
405 )
406 )
407 fileIgnoreRegExTerms = []
408 for pattern in self.task.config.fileIgnorePatterns:
409 fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
410 if fileIgnoreRegExTerms:
411 fileIgnoreRegEx = re.compile("|".join(fileIgnoreRegExTerms))
412 else:
413 fileIgnoreRegEx = None
414 self._repoWalker = RepoWalker(
415 walkerInputs,
416 fileIgnoreRegEx=fileIgnoreRegEx,
417 log=self.task.log.getChild("repoWalker"),
418 progress=self.progress,
419 )
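# Illustrative sketch (not part of the original module): how the
# fileIgnorePatterns handling above combines shell-style globs into a single
# regular expression. The patterns shown are hypothetical config values.
#
#     import fnmatch, re
#     patterns = ["README*", "*.log"]
#     regex = re.compile("|".join(fnmatch.translate(p) for p in patterns))
#     regex.match("ingest.log") is not None   # True: this file would be ignored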
421 def iterDatasets(self) -> Iterator[FileDataset]:
422 """Iterate over datasets in the repository that should be ingested into
423 the Gen3 repository.
425 The base class implementation yields nothing; the datasets handled by
426 the `RepoConverter` base class itself are read directly in
427 `findDatasets`.
429 Subclasses should override this method if they support additional
430 datasets that are handled some other way.
432 Yields
433 ------
434 dataset : `FileDataset`
435 Structures representing datasets to be ingested. Paths should be
436 absolute.
437 """
438 yield from ()
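# Illustrative sketch (not part of the original module): a subclass that finds
# some datasets itself, rather than via the repo walker, could override
# iterDatasets along these lines. ``_findSpecialFiles`` is a hypothetical
# helper; only the FileDataset shape (absolute ``path`` plus ``refs``) is taken
# from the docstring above.
#
#     def iterDatasets(self) -> Iterator[FileDataset]:
#         for path, ref in self._findSpecialFiles():   # hypothetical helper
#             yield FileDataset(path=os.path.abspath(path), refs=[ref])
#         yield from super().iterDatasets()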
440 def findDatasets(self):
441 assert self._repoWalker, "prep() must be called before findDatasets."
442 self.task.log.info("Adding special datasets in repo %s.", self.root)
443 for dataset in self.iterDatasets():
444 assert len(dataset.refs) == 1
445 # None index below is for calibDate, which is only relevant for
446 # CalibRepoConverter.
447 self._fileDatasets[dataset.refs[0].datasetType][None].append(dataset)
448 self.task.log.info("Finding datasets from files in repo %s.", self.root)
449 datasetsByTypeAndCalibDate = self._repoWalker.walk(
450 self.root, predicate=(self.subset.isRelated if self.subset is not None else None)
451 )
452 for datasetType, datasetsByCalibDate in datasetsByTypeAndCalibDate.items():
453 for calibDate, datasets in datasetsByCalibDate.items():
454 self._fileDatasets[datasetType][calibDate].extend(datasets)
455 self._fileDatasetCount += len(datasets)
457 def expandDataIds(self):
458 """Expand the data IDs for all datasets to be inserted.
460 Subclasses may override this method, but must delegate to the base
461 class implementation if they do.
463 This involves queries to the registry, but not writes. It is
464 guaranteed to be called between `findDatasets` and `ingest`.
465 """
466 import itertools
468 with self.progress.bar(desc="Expanding data IDs", total=self._fileDatasetCount) as progressBar:
469 for datasetType, datasetsByCalibDate in self._fileDatasets.items():
470 for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
471 if calibDate is not None:
472 self.task.log.info(
473 "Expanding data IDs for %d dataset%s of type %s at calibDate %s.",
474 *_log_msg_counter(datasetsForCalibDate),
475 datasetType.name,
476 calibDate,
477 )
478 else:
479 self.task.log.info(
480 "Expanding data IDs for %d non-calibration dataset%s of type %s.",
481 *_log_msg_counter(datasetsForCalibDate),
482 datasetType.name,
483 )
484 expanded = []
485 for dataset in datasetsForCalibDate:
486 for i, ref in enumerate(dataset.refs):
487 self.task.log.debug("Expanding data ID %s.", ref.dataId)
488 try:
489 dataId = self.task.registry.expandDataId(ref.dataId)
490 dataset.refs[i] = ref.expanded(dataId)
491 except DataIdError as err:
492 self.task.log.warning("Skipping ingestion for '%s': %s", dataset.path, err)
493 # Remove skipped datasets from multi-extension
494 # FileDatasets
495 dataset.refs[i] = None # We will strip off the `None`s after the loop.
496 dataset.refs[:] = itertools.filterfalse(lambda x: x is None, dataset.refs)
497 if dataset.refs:
498 expanded.append(dataset)
499 progressBar.update()
500 datasetsForCalibDate[:] = expanded
502 def ingest(self):
503 """Insert converted datasets into the Gen3 repository.
505 Subclasses may override this method, but must delegate to the base
506 class implementation at some point in their own logic.
508 This method is guaranteed to be called after `expandDataIds`.
509 """
510 with self.progress.bar(
511 desc="Ingesting converted datasets", total=self._fileDatasetCount
512 ) as progressBar:
513 for datasetType, datasetsByCalibDate in self._fileDatasets.items():
514 self.task.registry.registerDatasetType(datasetType)
515 for calibDate, datasetsForCalibDate in datasetsByCalibDate.items():
516 try:
517 run = self.getRun(datasetType.name, calibDate)
518 except LookupError:
519 self.task.log.warning(f"No run configured for dataset type {datasetType.name}.")
520 continue
521 self.task.log.info(
522 "Ingesting %d dataset%s into run %s of type %s.",
523 *_log_msg_counter(datasetsForCalibDate),
524 run,
525 datasetType.name,
526 )
527 try:
528 self.task.registry.registerRun(run)
529 self.task.butler3.ingest(
530 *datasetsForCalibDate, transfer=self.task.config.transfer, run=run
531 )
532 progressBar.update(len(datasetsForCalibDate))
533 except LookupError as err:
534 raise LookupError(
535 f"Error expanding data ID for dataset type {datasetType.name}."
536 ) from err
538 def finish(self) -> None:
539 """Finish conversion of a repository.
541 This is run after ``ingest``, and delegates to `_finish`, which should
542 be overridden by derived classes instead of this method.
543 """
544 self._finish(self._fileDatasets, self._fileDatasetCount)
546 def _finish(
547 self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], count: int
548 ) -> None:
549 """Subclass implementation hook for `_finish`.
551 The default implementation does nothing. This is generally the best
552 place to define and populate non-``RUN`` collections that may contain
553 some of the datasets that have just been ingested.
555 Parameters
556 ----------
557 datasets : `Mapping`
558 Nested mapping containing all converted datasets. The outer
559 mapping keys are `DatasetType` instances. Values are mappings from
560 ``calibDate`` or `None` to a `list` of `FileDataset` instances.
561 count : `int`
562 Total number of `FileDataset` instances in ``datasets``.
563 """
564 pass
566 def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
567 """Return the name of the run to insert instances of the given dataset
568 type into in this collection.
570 Parameters
571 ----------
572 datasetTypeName : `str`
573 Name of the dataset type.
574 calibDate : `str`, optional
575 If not `None`, the "CALIBDATE" associated with this (calibration)
576 dataset in the Gen2 data repository.
578 Returns
579 -------
580 run : `str`
581 Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
582 """
583 assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
584 assert calibDate is None, "Method must be overridden if calibDate is allowed to be not None"
585 return self._run
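# Illustrative sketch (not part of the original module): a converter for
# calibration repositories would need to override getRun so that calibDate
# participates in the run name. The naming scheme below is purely
# hypothetical.
#
#     def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
#         if calibDate is None:
#             return super().getRun(datasetTypeName)
#         return f"{self._run}/{calibDate}"   # hypothetical run naming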
587 def _guessStorageClass(
588 self, datasetTypeName: str, mapping: CameraMapperMapping
589 ) -> Optional[StorageClass]:
590 """Infer the Gen3 `StorageClass` from a dataset from a combination of
591 configuration and Gen2 dataset type information.
 593 datasetTypeName : `str`
594 Name of the dataset type.
595 mapping : `lsst.obs.base.mapping.Mapping`
596 Mapping object used by the Gen2 `CameraMapper` to describe the
597 dataset type.
598 """
599 storageClassName = self.task.config.storageClasses.get(datasetTypeName)
600 if storageClassName is None and mapping.python is not None:
601 storageClassName = self.task.config.storageClasses.get(mapping.python, None)
602 if storageClassName is None and mapping.persistable is not None:
603 storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
604 if storageClassName is None and mapping.python is not None:
605 unqualified = mapping.python.split(".")[-1]
606 storageClassName = self.task.config.storageClasses.get(unqualified, None)
607 if storageClassName is not None:
608 storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
609 else:
610 try:
611 storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
612 except KeyError:
613 storageClass = None
614 if storageClass is None and mapping.python is not None:
615 try:
616 storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
617 except KeyError:
618 pass
619 if storageClass is None:
620 self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
621 else:
622 self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
623 return storageClass
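# Illustrative sketch (not part of the original module): the lookup order above
# means a storageClasses config entry can key on the dataset type name, the
# Gen2 python type (qualified or unqualified), or the persistable name. The
# mappings below are hypothetical examples, not shipped defaults.
#
#     config.storageClasses["calexp"] = "ExposureF"                    # dataset type name
#     config.storageClasses["lsst.afw.image.ExposureF"] = "ExposureF"  # mapping.python
#     config.storageClasses["ExposureF"] = "ExposureF"                 # persistable / unqualified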
625 # Class attributes that will be shadowed by public instance attributes;
626 # defined here only for documentation purposes.
628 task: ConvertRepoTask
629 """The parent task that constructed and uses this converter
630 (`ConvertRepoTask`).
631 """
633 root: str
634 """Root path to the Gen2 repository this converter manages (`str`).
636 This is a complete path, not relative to some other repository root.
637 """
639 subset: Optional[ConversionSubset]
640 """An object that represents a filter to be applied to the datasets that
641 are converted (`ConversionSubset` or `None`).
642 """