Coverage for python/lsst/obs/base/gen2to3/convertRepo.py : 29%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig", "Rerun"]
25import os
26import fnmatch
27from dataclasses import dataclass
28from multiprocessing import Pool
29from typing import Iterable, Optional, List, Dict
31from lsst.daf.butler import (
32 Butler as Butler3,
33 CollectionType,
34 SkyPixDimension
35)
36from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
37from lsst.pipe.base import Task
38from lsst.skymap import skyMapRegistry, BaseSkyMap
40from ..ingest import RawIngestTask
41from ..defineVisits import DefineVisitsTask
42from .repoConverter import ConversionSubset
43from .rootRepoConverter import RootRepoConverter
44from .calibRepoConverter import CalibRepoConverter
45from .standardRepoConverter import StandardRepoConverter
46from .._instrument import Instrument
@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap object itself, i.e. the `BaseSkyMap` that ``sha1`` was
    computed from.
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """
@dataclass
class Rerun:
    """Description of one Gen2 rerun (processing-output repository) that
    should be converted.
    """

    path: str
    """Location of the Gen2 repository (`str`).

    May be absolute, or relative to the root repository.
    """

    runName: str
    """`~lsst.daf.butler.CollectionType.RUN` collection that receives the
    converted datasets (`str`).
    """

    chainName: Optional[str]
    """`~lsst.daf.butler.CollectionType.CHAINED` collection that joins the
    datasets of this repository with those of its parents (`str`, optional).
    """

    parents: List[str]
    """Names of the collections belonging to parent repositories; they make up
    the chained collection (`list` [ `str` ]).

    Not used when `chainName` is `None`. Runs used in the root repo are
    added automatically.
    """
class ConvertRepoSkyMapConfig(Config):
    """Thin wrapper config holding the parameters of a single SkyMap.

    Notes
    -----
    A `~lsst.pex.config.RegistryField` cannot be placed directly inside a
    `~lsst.pex.config.ConfigDictField`; this class exists only to work around
    that limitation.

    Its single field must be called "skyMap" so that it stays compatible with
    the configuration of `lsst.pipe.tasks.MakeSkyMapTask`, allowing one config
    file in an obs package to configure both.

    The price is some repetition with the name of the field that holds these
    configs - "skyMaps[name].skyMap" - which seems unavoidable.
    """

    skyMap = skyMapRegistry.makeField(
        default="dodeca",
        doc="Type and parameters for the SkyMap itself.",
    )
class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    The ``transfer`` property is a shortcut for ``raws.transfer``; it is reset
    to `None` by `setDefaults`.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "exposure dimension entries.",
        target=RawIngestTask,
    )
    defineVisits = ConfigurableField(
        "Configuration for the subtask responsible for defining visits from "
        "exposures.",
        target=DefineVisitsTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    rootSkyMapName = Field(
        "Name of a Gen3 skymap (an entry in ``self.skyMaps``) to assume for "
        "datasets in the root repository when no SkyMap is found there. ",
        dtype=str,
        optional=True,
        default=None,
    )
    runs = DictField(
        "A mapping from dataset type name to the RUN collection they should "
        "be inserted into. This must include all datasets that can be found "
        "in the root repository; other repositories will use per-repository "
        "runs.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "bias": "ExposureF",
            "dark": "ExposureF",
            "flat": "ExposureF",
            "defects": "Defects",
            "crosstalk": "CrosstalkCalib",
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
            "ParquetTable": "DataFrame",
            "SkyWcs": "Wcs",
        }
    )
    formatterClasses = DictField(
        "Mapping from dataset type name to formatter class. "
        "By default these are derived from the formatters listed in the"
        " Gen3 datastore configuration.",
        keytype=str,
        itemtype=str,
        default={}
    )
    targetHandlerClasses = DictField(
        "Mapping from dataset type name to target handler class.",
        keytype=str,
        itemtype=str,
        default={}
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3",
                 "registry.sqlite3", "calibRegistry.sqlite3", "_mapper",
                 "_parent", "repositoryCfg.yaml"]
    )
    rawDatasetType = Field(
        "Gen2 dataset type to use for raw data.",
        dtype=str,
        default="raw",
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The help text used to claim "If True (default)" although the default is
    # False; the text was corrected rather than the default so that existing
    # behavior is unchanged.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits. Ignored unless a list of visits is passed to "
        "run().",
        dtype=bool,
        default=False,
    )

    @property
    def transfer(self):
        """Transfer mode of the raw-ingest subtask (alias for
        ``self.raws.transfer``).
        """
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    def setDefaults(self):
        """Reset ``transfer`` (i.e. ``raws.transfer``) to `None`."""
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.
class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        A writeable Gen3 Butler instance that represents the data repository
        that datasets will be ingested into. If the 'raw' dataset is
        configured to be included in the conversion, ``butler3.run`` should be
        set to the name of the collection raws should be ingested into, and
        ``butler3.collections`` should include a calibration collection from
        which the ``camera`` dataset can be loaded, unless a calibration repo
        is converted and ``doWriteCuratedCalibrations`` is `True`.
    instrument : `lsst.obs.base.Instrument`
        The Gen3 instrument that should be used for this conversion.
    **kwargs
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository. Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, instrument: Instrument, **kwargs):
        super().__init__(config, **kwargs)
        # Not a CmdlineTask nor PipelineTask, so have to validate the config
        # here. Validate ``self.config`` (set by the base-class constructor)
        # instead of the ``config`` argument, which is allowed to be `None`.
        self.config.validate()
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            self.makeSubtask("raws", butler=butler3)
            self.makeSubtask("defineVisits", butler=butler3)
        else:
            self.raws = None
            self.defineVisits = None
        self.instrument = instrument
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        # Distinct loop-variable name so the ``config`` argument is not
        # shadowed.
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            self._populateSkyMapDicts(name, instance)
        self._usedSkyPix = set()
        self.translatorFactory = self.instrument.makeDataIdTranslatorFactory()
        self.translatorFactory.log = self.log.getChild("translators")

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), butler3=self.butler3, instrument=self.instrument)

    def _populateSkyMapDicts(self, name, instance):
        """Record a skymap in both the by-hash and the by-name lookup tables.

        Parameters
        ----------
        name : `str`
            Name to associate with the skymap.
        instance : `lsst.skymap.BaseSkyMap`
            SkyMap instance; its ``getSha1()`` result is used as the hash key.
        """
        struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
        self._configuredSkyMapsBySha1[struct.sha1] = struct
        self._configuredSkyMapsByName[struct.name] = struct

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion: it must
            match at least one of ``config.datasetIncludePatterns`` and none
            of ``config.datasetIgnorePatterns`` (case-sensitive globs).
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns)
            and not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                        for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap, skyMapName: str) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.
        skyMapName : `str`
            The name of the gen2 skymap. Used for error reporting, and as the
            registered name when the skymap's hash is not already known.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if the specified skymap cannot be found. Because unknown
            skymaps are added to the lookup tables first, this is a defensive
            check that is not expected to trigger in practice.
        """
        sha1 = skyMap.getSha1()
        if sha1 not in self._configuredSkyMapsBySha1:
            # Not one of the configured skymaps: remember it under its Gen2
            # name.
            self._populateSkyMapDicts(skyMapName, skyMap)
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            msg = f"SkyMap '{skyMapName}' with sha1={sha1} not included in configuration."
            raise LookupError(msg) from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        given subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, *,
            calibs: Optional[Dict[str, str]] = None,
            reruns: List[Rerun],
            visits: Optional[Iterable[int]] = None,
            pool: Optional[Pool] = None,
            processes: int = 1):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository. This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the
            `~lsst.daf.butler.CollectionType.CALIBRATION` collection that
            converted datasets within it should be certified into. `None`
            (default) is treated as an empty dictionary.
        reruns : `list` of `Rerun`
            Specifications for rerun (processing output) collections to
            convert.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert. If not provided, all visits
            in the Gen2 root repository will be converted.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations. A pool passed in by the caller is never closed here.
        processes : `int`, optional
            The number of processes to use for conversion. Only used to
            create a pool when ``pool`` is `None` and this is greater than 1;
            that pool is closed and joined before this method returns.
        """
        ownedPool = None
        if pool is None and processes > 1:
            pool = ownedPool = Pool(processes)
        if calibs is None:
            calibs = {}
        try:
            self._convert(root, calibs=calibs, reruns=reruns, visits=visits, pool=pool)
        finally:
            # Only clean up a pool created here; a caller-supplied pool
            # remains the caller's responsibility.
            if ownedPool is not None:
                ownedPool.close()
                ownedPool.join()

    def _convert(self, root, *, calibs, reruns, visits, pool):
        """Do the actual conversion work for `run`, with ``calibs`` and
        ``pool`` already resolved.
        """
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        # Make converters for all Gen2 repos.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, subset=subset, instrument=self.instrument)
        converters.append(rootConverter)
        for calibRoot, collection in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collection=collection,
                                           instrument=self.instrument,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converters.append(converter)
        rerunConverters = {}
        for spec in reruns:
            runRoot = spec.path
            if not os.path.isabs(runRoot):
                runRoot = os.path.join(rootConverter.root, runRoot)
            converter = StandardRepoConverter(task=self, root=runRoot, run=spec.runName,
                                              instrument=self.instrument, subset=rootConverter.subset)
            converters.append(converter)
            rerunConverters[spec.runName] = converter

        # Register the instrument if we're configured to do so.
        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Run raw ingest (does nothing if we weren't configured to convert the
        # 'raw' dataset type).
        rootConverter.runRawIngest(pool=pool)

        # Write curated calibrations to all calibration runs and
        # also in the default collection.
        # Add new collections to the list of collections the butler was
        # initialized to pass to DefineVisitsTask, to deal with the (likely)
        # case the only 'camera' dataset in the repo will be one we're adding
        # here.
        if self.config.doWriteCuratedCalibrations:
            butler3 = Butler3(butler=self.butler3)
            # Write curated calibrations to any new calibration collections we
            # created by converting a Gen2 calibration repo.
            calibCollections = set()
            for collection in calibs.values():
                self.instrument.writeCuratedCalibrations(butler3, collection=collection)
                calibCollections.add(collection)
            # Ensure that we have the curated calibrations even if there
            # is no calibration conversion. It's possible that the default
            # calib collection will have been specified (in fact the
            # butler convert script enforces that behavior for now) so
            # we check for the default situation
            # Assume we know the default rather than letting
            # writeCuratedCalibrations default itself
            defaultCalibCollection = self.instrument.makeCollectionName("calib")
            if defaultCalibCollection not in calibCollections:
                self.instrument.writeCuratedCalibrations(butler3, collection=defaultCalibCollection)

        # Define visits (also does nothing if we weren't configured to convert
        # the 'raw' dataset type).
        rootConverter.runDefineVisits(pool=pool)

        # Walk Gen2 repos to find datasets to convert.
        for converter in converters:
            converter.prep()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) converter.subsets object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Look for datasets, generally by scanning the filesystem.
        # This requires dimensions to have already been inserted so we can use
        # dimension information to identify related datasets.
        for converter in converters:
            converter.findDatasets()

        # Expand data IDs.
        for converter in converters:
            converter.expandDataIds()

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()

        # Perform any post-ingest processing.
        for converter in converters:
            converter.finish()

        # Add chained collections for reruns.
        for spec in reruns:
            if spec.chainName is not None:
                self.butler3.registry.registerCollection(spec.chainName, type=CollectionType.CHAINED)
                chain = [spec.runName]
                chain.extend(rerunConverters[spec.runName].getCollectionChain())
                for parent in spec.parents:
                    # Append the parent collection being iterated over;
                    # `Rerun` has no ``parent`` attribute.
                    chain.append(parent)
                    parentConverter = rerunConverters.get(parent)
                    if parentConverter is not None:
                        chain.extend(parentConverter.getCollectionChain())
                chain.extend(rootConverter.getCollectionChain())
                self.log.info("Defining %s from chain %s.", spec.chainName, chain)
                self.butler3.registry.setCollectionChain(spec.chainName, chain)