Coverage for python/lsst/obs/base/gen2to3/rootRepoConverter.py: 16%
117 statements
coverage.py v6.4.1, created at 2022-06-09 03:03 -0700
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RootRepoConverter"]

import itertools
import os
import re
from typing import TYPE_CHECKING, Dict, Iterator, List, Mapping, Optional, Tuple

from lsst.daf.butler import CollectionType, DatasetRef, DatasetType, DimensionGraph, FileDataset
from lsst.skymap import BaseSkyMap

from .standardRepoConverter import StandardRepoConverter

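# Mapping from coadd type to the dataset type name of the skymap it uses,
# e.g. "deep" -> "deepCoadd_skyMap".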
SKYMAP_DATASET_TYPES = {coaddName: f"{coaddName}Coadd_skyMap" for coaddName in ("deep", "goodSeeing", "dcr")}

if TYPE_CHECKING:
    from lsst.daf.butler import SkyPixDimension


def getDataPaths(dataRefs):
    """Strip HDU identifiers from paths and return a unique set of paths.

    Parameters
    ----------
    dataRefs : iterable of `lsst.daf.persistence.ButlerDataRef`
        The gen2 datarefs to strip "[HDU]" values from.

    Returns
    -------
    paths : `set` [`str`]
        The unique file paths without appended "[HDU]".
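
    Notes
    -----
    Purely illustrative (hypothetical path): a gen2 URI such as
    ``"raw/c4d_0001.fits.fz[1]"`` is stripped to ``"raw/c4d_0001.fits.fz"``,
    so references to different HDUs of the same file collapse to one path.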
    """
    paths = set()
    for dataRef in dataRefs:
        path = dataRef.getUri()
        # Handle FITS files with multiple HDUs (e.g. DECam raws).
        paths.add(path.split("[")[0])
    return paths


class RootRepoConverter(StandardRepoConverter):
    """A specialization of `RepoConverter` for root data repositories.

    `RootRepoConverter` adds support for raw images (mostly delegated to the
    parent task's `RawIngestTask` subtask) and reference catalogs.

    Parameters
    ----------
    kwds
        Keyword arguments are forwarded to (and required by) `RepoConverter`.
    """

    def __init__(self, **kwds):
        super().__init__(run=None, **kwds)
        self._refCats: Dict[str, SkyPixDimension] = {}
        if self.task.config.rootSkyMapName is not None:
            self._rootSkyMap = self.task.config.skyMaps[self.task.config.rootSkyMapName].skyMap.apply()
        else:
            self._rootSkyMap = None  # All access to _rootSkyMap is guarded
        self._rawRefs = []

    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        # Docstring inherited from RepoConverter.
        return (
            super().isDatasetTypeSpecial(datasetTypeName)
            or datasetTypeName in ("raw", "ref_cat", "ref_cat_config")
            # in Gen2, some of these are in the root repo, not a calib repo
            or datasetTypeName in self.instrument.getCuratedCalibrationNames()
        )

    def getSpecialDirectories(self) -> List[str]:
        # Docstring inherited from RepoConverter.
        return super().getSpecialDirectories() + ["CALIB", "ref_cats", "rerun"]

    def findMatchingSkyMap(self, datasetTypeName: str) -> Tuple[Optional[BaseSkyMap], Optional[str]]:
        # Docstring inherited from StandardRepoConverter.findMatchingSkyMap.
        skyMap, name = super().findMatchingSkyMap(datasetTypeName)
        if skyMap is None and self.task.config.rootSkyMapName is not None:
            self.task.log.debug(
                "Assuming configured root skymap with name '%s' for dataset %s.",
                self.task.config.rootSkyMapName,
                datasetTypeName,
            )
            skyMap = self._rootSkyMap
            name = self.task.config.rootSkyMapName
        return skyMap, name

    def runRawIngest(self, pool=None):
        if self.task.raws is None:
            self.task.log.info("Skipping raw ingest for %s.", self.root)
            return
        self.task.log.info("Finding raws in root %s.", self.root)
        if self.subset is not None:
            dataRefs = itertools.chain.from_iterable(
                self.butler2.subset(self.task.config.rawDatasetType, visit=visit)
                for visit in self.subset.visits
            )
        else:
            dataRefs = self.butler2.subset(self.task.config.rawDatasetType)
        dataPaths = getDataPaths(dataRefs)
        if not self.task.dry_run:
            self.task.log.info(
                "Ingesting raws from root %s into run %s.", self.root, self.task.raws.butler.run
            )
            self._rawRefs.extend(self.task.raws.run(dataPaths, pool=pool))
        else:
            self.task.log.info(
                "[dry run] Skipping ingest of raws from root %s into run %s.",
                self.root,
                self.task.raws.butler.run,
            )
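        # Seed the list of collections that will contain everything found in
        # the root repo with the run the raws were (or, in a dry run, would
        # have been) ingested into.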
        self._chain = [self.task.raws.butler.run]

    def runDefineVisits(self):
        if self.task.defineVisits is None:
            self.task.log.info("Skipping visit definition for %s.", self.root)
            return
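        # Reduce each raw ref's data ID to its exposure-level dimensions so
        # that each exposure appears exactly once, regardless of how many
        # detector-level raws reference it.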
        dimensions = DimensionGraph(self.task.universe, names=["exposure"])
        exposureDataIds = set(ref.dataId.subset(dimensions) for ref in self._rawRefs)
        if not self.task.dry_run:
            self.task.log.info("Defining visits from exposures.")
            self.task.defineVisits.run(exposureDataIds)
        else:
            self.task.log.info("[dry run] Skipping defining visits from exposures.")

    def prep(self):
        # Docstring inherited from RepoConverter.
        # Gather information about reference catalogs.
        if self.task.isDatasetTypeIncluded("ref_cat") and len(self.task.config.refCats) != 0:
            from lsst.meas.algorithms import DatasetConfig as RefCatDatasetConfig

            for refCat in os.listdir(os.path.join(self.root, "ref_cats")):
                path = os.path.join(self.root, "ref_cats", refCat)
                configFile = os.path.join(path, "config.py")
                if not os.path.exists(configFile):
                    continue
                if refCat not in self.task.config.refCats:
                    continue
                self.task.log.info("Preparing ref_cat %s from root %s.", refCat, self.root)
                onDiskConfig = RefCatDatasetConfig()
                onDiskConfig.load(configFile)
                if onDiskConfig.indexer.name != "HTM":
                    raise ValueError(
                        f"Reference catalog '{refCat}' uses unsupported "
                        f"pixelization '{onDiskConfig.indexer.name}'."
                    )
                level = onDiskConfig.indexer["HTM"].depth
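                # An HTM indexer of depth N corresponds to the registry's
                # "htmN" skypix dimension (e.g. depth 7 -> "htm7").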
                try:
                    dimension = self.task.universe[f"htm{level}"]
                except KeyError as err:
                    raise ValueError(
                        f"Reference catalog {refCat} uses HTM level {level}, but no htm{level} "
                        f"skypix dimension is configured for this registry."
                    ) from err
                self.task.useSkyPix(dimension)
                self._refCats[refCat] = dimension
        if self.task.isDatasetTypeIncluded("brightObjectMask") and self.task.config.rootSkyMapName:
            self.task.useSkyMap(self._rootSkyMap, self.task.config.rootSkyMapName)
        super().prep()

    def iterDatasets(self) -> Iterator[FileDataset]:
        # Docstring inherited from RepoConverter.
        # Iterate over reference catalog files.
        for refCat, dimension in self._refCats.items():
            datasetType = DatasetType(
                refCat, dimensions=[dimension], universe=self.task.universe, storageClass="SimpleCatalog"
            )
            if self.subset is None:
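                # Reference catalog shards are files named after their HTM
                # pixel index, e.g. "131072.fits" (a hypothetical shard); the
                # captured digits become the skypix data ID below.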
                regex = re.compile(r"(\d+)\.fits")
                for fileName in self.progress.wrap(
                    os.listdir(os.path.join(self.root, "ref_cats", refCat)),
                    desc=f"Processing refcat {refCat}",
                ):
                    m = regex.match(fileName)
                    if m is not None:
                        htmId = int(m.group(1))
                        dataId = self.task.registry.expandDataId({dimension: htmId})
                        yield FileDataset(
                            path=os.path.join(self.root, "ref_cats", refCat, fileName),
                            refs=DatasetRef(datasetType, dataId),
                        )
            else:
                for begin, end in self.progress.wrap(
                    self.subset.skypix[dimension], desc=f"Processing ranges for refcat {refCat}"
                ):
                    for htmId in range(begin, end):
                        dataId = self.task.registry.expandDataId({dimension: htmId})
                        yield FileDataset(
                            path=os.path.join(self.root, "ref_cats", refCat, f"{htmId}.fits"),
                            refs=DatasetRef(datasetType, dataId),
                        )
        yield from super().iterDatasets()

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        # Docstring inherited from RepoConverter.
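        # Reference catalogs land in their own RUN collection (e.g.
        # "refcats/gen2" with the default collection names) rather than the
        # root repo's general run.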
        if datasetTypeName in self._refCats:
            return self.instrument.makeRefCatCollectionName("gen2")
        return super().getRun(datasetTypeName, calibDate)

    def _finish(
        self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], count: int
    ) -> None:
        # Docstring inherited from RepoConverter.
        super()._finish(datasets, count)
        if self._refCats:
            # Set up a CHAINED collection named something like "refcats" to
            # also point to "refcats/gen2". It's conceivable (but unlikely)
            # that "refcats/gen2" might not exist, if the scanner saw reference
            # catalog datasets on disk but none overlapped the area of
            # interest, so we register that here, too (multiple registrations
            # of collections are fine).
            chained = self.instrument.makeRefCatCollectionName()
            child = self.instrument.makeRefCatCollectionName("gen2")
            self.task.registry.registerCollection(chained, CollectionType.CHAINED)
            self.task.registry.registerCollection(child, CollectionType.RUN)
            children = list(self.task.registry.getCollectionChain(chained))
            children.append(child)
            self.task.registry.setCollectionChain(chained, children)
            # Also add "refcats" to the list of collections that contains
            # everything found in the root repo. Normally this is done in
            # getRun, but here we want to add the (possibly new) CHAINED
            # collection instead of the RUN collection.
            self._chain.append(chained)