Coverage for python/lsst/obs/base/gen2to3/convertRepo.py : 31%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig", "Rerun"]
25import os
26import fnmatch
27from dataclasses import dataclass
28from typing import Iterable, Optional, List, Dict
30from lsst.daf.butler import (
31 Butler as Butler3,
32 CollectionType,
33 SkyPixDimension
34)
35from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
36from lsst.pipe.base import Task
37from lsst.skymap import skyMapRegistry, BaseSkyMap
39from ..ingest import RawIngestTask
40from ..defineVisits import DefineVisitsTask
41from .repoConverter import ConversionSubset
42from .rootRepoConverter import RootRepoConverter
43from .calibRepoConverter import CalibRepoConverter
44from .standardRepoConverter import StandardRepoConverter
45from .._instrument import Instrument
@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs (`str`).
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1` (`bytes`).
    """

    instance: BaseSkyMap
    """The skymap object itself (`lsst.skymap.BaseSkyMap`).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted (`bool`).
    """
@dataclass
class Rerun:
    """Description of one Gen2 processing-output ("rerun") repository that
    should be converted.
    """

    path: str
    """Location of the Gen2 repository (`str`).

    May be absolute; a relative path is interpreted relative to the root
    repository.
    """

    runName: str
    """Name of the `~lsst.daf.butler.CollectionType.RUN` collection that
    receives the converted datasets (`str`).
    """

    chainName: Optional[str]
    """Name of a `~lsst.daf.butler.CollectionType.CHAINED` collection that
    merges the datasets of this repository with those of its parents
    (`str`, optional).
    """

    parents: List[str]
    """Names of the collections associated with parent repositories; these
    become members of the chained collection (`list` [ `str` ]).

    Not used when `chainName` is `None`.  Runs used in the root repo do not
    need to be listed; they are appended automatically.
    """
class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` property is a convenience alias for
    ``self.raws.transfer``; `setDefaults` sets it to `None`.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "exposure dimension entries.",
        target=RawIngestTask,
    )
    defineVisits = ConfigurableField(
        "Configuration for the subtask responsible for defining visits from "
        "exposures.",
        target=DefineVisitsTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "crosstalk": "CrosstalkCalib",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default, which
    # contradicted ``default=False`` below; the text now matches the value.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )

    @property
    def transfer(self):
        """Transfer mode for raw ingest; an alias for ``self.raws.transfer``.
        """
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    def setDefaults(self):
        """Set the default transfer mode (``self.raws.transfer``) to `None`.
        """
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config: `ConvertRepoConfig`
        Configuration for this task.  If `None`, a default-constructed
        `ConvertRepoConfig` is used.
    butler3: `lsst.daf.butler.Butler`
        A writeable Gen3 Butler instance that represents the data repository
        that datasets will be ingested into.  If the 'raw' dataset is
        configured to be included in the conversion, ``butler3.run`` should be
        set to the name of the collection raws should be ingested into, and
        ``butler3.collections`` should include a calibration collection from
        which the ``camera`` dataset can be loaded, unless a calibration repo
        is converted and ``doWriteCuratedCalibrations`` is `True`.
    instrument : `Instrument`
        Instrument whose data is being converted; used to register dimension
        records, write curated calibrations and build data ID translators.
    **kwargs
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, instrument: Instrument, **kwargs):
        # ``config`` is optional in the signature, so build the default here
        # rather than failing with AttributeError on ``None.validate()``.
        if config is None:
            config = self.ConfigClass()
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.makeSubtask("defineVisits", butler=butler3)
        else:
            self.raws = None
            self.defineVisits = None
        self.instrument = instrument
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop name so the ``config`` argument is not shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()
        self.translatorFactory = self.instrument.makeDataIdTranslatorFactory()
        self.translatorFactory.log = self.log.getChild("translators")

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-SHA1 and by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            The skymap object; its ``getSha1()`` result is used as the key of
            the by-SHA1 table.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName: str
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        # Included when at least one include pattern matches and no ignore
        # pattern does; matching is case-sensitive.
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  Used as the Gen3 name when no
            already-known skymap has the same SHA1 hash.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Notes
        -----
        A skymap whose hash is not yet known is added to the task's lookup
        tables under ``skyMapName`` rather than being rejected, so this
        method does not raise for unconfigured skymaps.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            self._populateSkyMapDicts(skyMapName, skyMap)
        # The entry is guaranteed to exist at this point, so no lookup-failure
        # handling is needed.
        struct = self._configuredSkyMapsBySha1[sha1]
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all skypix dimensions that have been marked as used to the
        conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.  Otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the
            `~lsst.daf.butler.CollectionType.RUN` collection that converted
            datasets within it should be inserted into.  Relative paths are
            interpreted relative to the root repository.  `None` (default) is
            treated as an empty mapping.
        reruns : `list` of `Rerun`
            Specifications for rerun (processing output) collections to
            convert.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # Make converters for all Gen2 repos.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, subset=subset)
        converters.append(rootConverter)
        for calibRoot, run in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, run=run,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converters.append(converter)
        for spec in reruns:
            runRoot = spec.path
            if not os.path.isabs(runRoot):
                runRoot = os.path.join(rootConverter.root, runRoot)
            converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName,
                                              subset=rootConverter.subset)
            converters.append(converter)

        # Register the instrument if we're configured to do so.
        if self.config.doRegisterInstrument:
            # Allow registration to fail on the assumption that this means
            # we are reusing a butler, but leave a trace in the log so that a
            # genuine failure is not completely invisible.
            try:
                self.instrument.register(self.registry)
            except Exception as err:
                self.log.info("Instrument registration failed (%s); assuming the instrument "
                              "is already registered.", err)

        # Run raw ingest (does nothing if we weren't configured to convert the
        # 'raw' dataset type).
        rootConverter.runRawIngest()

        # Write curated calibrations to all calibration repositories, using a
        # Butler derived from ours that writes to each calibration run.
        # NOTE(review): the original comment said this also adds the new
        # collections to those passed to DefineVisitsTask, to deal with the
        # (likely) case the only 'camera' dataset in the repo will be one
        # we're adding here; that is not visible in this code - confirm.
        if self.config.doWriteCuratedCalibrations:
            for run in calibs.values():
                butler3 = Butler3(butler=self.butler3, run=run)
                self.instrument.writeCuratedCalibrations(butler3)

        # Define visits (also does nothing if we weren't configured to convert
        # the 'raw' dataset type).
        rootConverter.runDefineVisits()

        # Walk Gen2 repos to find datasets to convert.
        for converter in converters:
            converter.prep()

        # Insert dimensions needed by any converters.  In practice this is just
        # calibration_labels right now, because exposures and visits (and
        # things related to them) are handled by RawIngestTask and
        # DefineVisitsTask earlier and skymaps are handled later.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()

        # Add chained collections for reruns.
        for spec in reruns:
            if spec.chainName is not None:
                self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED)
                # Chain order: this rerun's own run, then its declared
                # parents, then the collections from the root repository.
                chain = [spec.runName]
                chain.extend(spec.parents)
                chain.extend(rootConverter.getCollectionChain())
                self.log.info("Defining %s from chain %s.", spec.chainName, chain)
                self.butler3.registry.setCollectionChain(spec.chainName, chain)