Coverage for python/lsst/obs/base/gen2to3/convertRepo.py : 32%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig", "Rerun"]
25import os
26import fnmatch
27from dataclasses import dataclass
28from typing import Iterable, Optional, List, Dict
30from lsst.utils import doImport
31from lsst.daf.butler import (
32 Butler as Butler3,
33 CollectionType,
34 SkyPixDimension
35)
36from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
37from lsst.pipe.base import Task
38from lsst.skymap import skyMapRegistry, BaseSkyMap
40from ..ingest import RawIngestTask
41from ..defineVisits import DefineVisitsTask
42from .repoConverter import ConversionSubset
43from .rootRepoConverter import RootRepoConverter
44from .calibRepoConverter import CalibRepoConverter
45from .standardRepoConverter import StandardRepoConverter
@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself (`lsst.skymap.BaseSkyMap`).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """
@dataclass
class Rerun:
    """Specification for a Gen2 processing-output repository to convert.
    """

    path: str
    """Absolute or relative (to the root repository) path to the Gen2
    repository (`str`).
    """

    runName: str
    """Name of the `~lsst.daf.butler.CollectionType.RUN` collection datasets
    will be inserted into (`str`).
    """

    chainName: Optional[str]
    """Name of a `~lsst.daf.butler.CollectionType.CHAINED` collection that will
    combine this repository's datasets with those of its parent repositories
    (`str` or `None`).

    No chained collection is defined when this is `None`.
    """

    parents: List[str]
    """Collection names associated with parent repositories, used to define the
    chained collection (`list` [ `str` ]).

    Ignored if `chainName` is `None`. Runs used in the root repo are
    automatically included.
    """
class ConvertRepoSkyMapConfig(Config):
    """Sub-config used to hold the parameters of a SkyMap.

    Notes
    -----
    This config only needs to exist because we can't put a
    `~lsst.pex.config.RegistryField` directly inside a
    `~lsst.pex.config.ConfigDictField`.

    It needs to have its only field named "skyMap" for compatibility with the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
    use one config file in an obs package to configure both.

    This name leads to unfortunate repetition with the field named
    "skyMaps" that holds it - "skyMaps[name].skyMap" - but that seems
    unavoidable.
    """
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    ``instrument`` has no usable default (``optional=False`` with a `None`
    default), so it must be set before the config is validated.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "exposure dimension entries.",
        target=RawIngestTask,
    )
    defineVisits = ConfigurableField(
        "Configuration for the subtask responsible for defining visits from "
        "exposures.",
        target=DefineVisitsTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The documentation previously claimed True was the default, which
    # contradicted ``default=False`` below; the behavior is unchanged.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )
    curatedCalibrations = ListField(
        "Dataset types that are handled by `Instrument.writeCuratedCalibrations()` "
        "and thus should not be converted using the standard calibration "
        "conversion system.",
        dtype=str,
        default=["camera",
                 "transmission_sensor",
                 "transmission_filter",
                 "transmission_optics",
                 "transmission_atmosphere",
                 "bfKernel"]
    )
    instrument = Field(
        doc=("Fully-qualified Python name of the `Instrument` subclass for "
             "all converted datasets."),
        dtype=str,
        optional=False,
        default=None,
    )

    @property
    def transfer(self):
        """Transfer mode, stored on (and read from) ``self.raws.transfer``.
        """
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    def setDefaults(self):
        """Reset the transfer mode of the ``raws`` subtask config to `None`.
        """
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        A writeable Gen3 Butler instance that represents the data repository
        that datasets will be ingested into.  If the 'raw' dataset is
        configured to be included in the conversion, ``butler3.run`` should be
        set to the name of the collection raws should be ingested into, and
        ``butler3.collections`` should include a calibration collection from
        which the ``camera`` dataset can be loaded, unless a calibration repo
        is converted and ``doWriteCuratedCalibrations`` is `True`.
    **kwargs
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwargs):
        if config is None:
            # Fall back to a default-constructed config so that a missing
            # config is reported by validation rather than as an
            # AttributeError on `None`.
            config = self.ConfigClass()
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here.
        config.validate()
        super().__init__(config, **kwargs)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.makeSubtask("defineVisits", butler=butler3)
        else:
            self.raws = None
            self.defineVisits = None
        self.instrument = doImport(self.config.instrument)()
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Use a distinct loop name so the ``config`` argument is not shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()
        self.translatorFactory = self.instrument.makeDataIdTranslatorFactory()

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-hash and by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            The skymap itself; its ``getSha1()`` result is used as the hash
            key.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion.
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap.  It is used as the skymap's name
            when no already-known skymap has the same hash.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Notes
        -----
        A skymap whose hash is not yet known is recorded on the fly under
        ``skyMapName``, so the lookup that follows always succeeds; the
        former ``LookupError`` branch could never be reached and was removed.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            self._populateSkyMapDicts(skyMapName, skyMap)
        struct = self._configuredSkyMapsBySha1[sha1]
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]) -> None:
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension) -> None:
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]) -> None:
        """Add all SkyPix dimensions that have been marked as used to the
        conversion subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.  Otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the
            `~lsst.daf.butler.CollectionType.RUN` collection that converted
            datasets within it should be inserted into.  Relative paths are
            interpreted relative to the root repository.  `None` (default)
            is treated as an empty mapping.
        reruns : `list` of `Rerun`
            Specifications for rerun (processing output) collections to
            convert.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        """
        if calibs is None:
            calibs = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # Make converters for all Gen2 repos.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, subset=subset)
        converters.append(rootConverter)
        for calibRoot, run in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, run=run,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converters.append(converter)
        for spec in reruns:
            runRoot = spec.path
            if not os.path.isabs(runRoot):
                runRoot = os.path.join(rootConverter.root, runRoot)
            converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName,
                                              subset=rootConverter.subset)
            converters.append(converter)

        # Register the instrument if we're configured to do so.
        if self.config.doRegisterInstrument:
            # Allow registration to fail on the assumption that this means
            # we are reusing a butler, but leave a trace in the log so that
            # genuine failures are not completely invisible.
            try:
                self.instrument.register(self.registry)
            except Exception as err:
                self.log.info("Instrument registration failed (%s); assuming it is already "
                              "registered.", err)

        # Run raw ingest (does nothing if we weren't configured to convert the
        # 'raw' dataset type).
        rootConverter.runRawIngest()

        # Write curated calibrations to all calibration repositories.
        # Add new collections to the list of collections the butler was
        # initialized to pass to DefineVisitsTask, to deal with the (likely)
        # case the only 'camera' dataset in the repo will be one we're adding
        # here.
        if self.config.doWriteCuratedCalibrations:
            for run in calibs.values():
                butler3 = Butler3(butler=self.butler3, run=run)
                self.instrument.writeCuratedCalibrations(butler3)

        # Define visits (also does nothing if we weren't configured to convert
        # the 'raw' dataset type).
        rootConverter.runDefineVisits()

        # Walk Gen2 repos to find datasets to convert.
        for converter in converters:
            converter.prep()

        # Insert dimensions needed by any converters.  In practice this is just
        # calibration_labels right now, because exposures and visits (and
        # things related to them) are handled by RawIngestTask and
        # DefineVisitsTask earlier and skymaps are handled later.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()

        # Add chained collections for reruns.
        for spec in reruns:
            if spec.chainName is not None:
                self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED)
                # Order: this rerun's own run, then its declared parents,
                # then the collections used by the root repository.
                chain = [spec.runName]
                chain.extend(spec.parents)
                chain.extend(rootConverter.getCollectionChain())
                self.log.info("Defining %s from chain %s.", spec.chainName, chain)
                self.butler3.registry.setCollectionChain(spec.chainName, chain)