Coverage for python/lsst/obs/base/gen2to3/rootRepoConverter.py: 16%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RootRepoConverter"]

import os
import re
import itertools
from typing import TYPE_CHECKING, Dict, Iterator, Mapping, Optional, Tuple, List

from lsst.skymap import BaseSkyMap
from lsst.daf.butler import CollectionType, DatasetType, DatasetRef, DimensionGraph, FileDataset
from .standardRepoConverter import StandardRepoConverter

SKYMAP_DATASET_TYPES = {
    coaddName: f"{coaddName}Coadd_skyMap" for coaddName in ("deep", "goodSeeing", "dcr")
}
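# Expands to {"deep": "deepCoadd_skyMap", "goodSeeing": "goodSeeingCoadd_skyMap",
# "dcr": "dcrCoadd_skyMap"}.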

if TYPE_CHECKING:
    from lsst.daf.butler import SkyPixDimension


def getDataPaths(dataRefs):
    """Strip HDU identifiers from paths and return a unique set of paths.

    Parameters
    ----------
    dataRefs : iterable of `lsst.daf.persistence.ButlerDataRef`
        The Gen2 datarefs to strip "[HDU]" values from.

    Returns
    -------
    paths : `set` [`str`]
        The unique file paths without appended "[HDU]".
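
    Examples
    --------
    A minimal sketch with a stand-in object in place of a real Gen2 dataref;
    ``FakeRef`` below is purely illustrative and not part of any Butler API:

    >>> class FakeRef:
    ...     def __init__(self, uri):
    ...         self._uri = uri
    ...     def getUri(self):
    ...         return self._uri
    >>> sorted(getDataPaths([FakeRef("raw.fits[1]"), FakeRef("raw.fits[2]")]))
    ['raw.fits']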

    """
    paths = set()
    for dataRef in dataRefs:
        path = dataRef.getUri()
        # Handle FITS files with multiple HDUs (e.g. DECam raws), whose Gen2
        # URIs carry a trailing "[HDU]" suffix.
        paths.add(path.split('[')[0])
    return paths


class RootRepoConverter(StandardRepoConverter):
    """A specialization of `RepoConverter` for root data repositories.

    `RootRepoConverter` adds support for raw images (mostly delegated to the
    parent task's `RawIngestTask` subtask) and reference catalogs.

    Parameters
    ----------
    kwds
        Keyword arguments are forwarded to (and required by) `RepoConverter`.
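
    Examples
    --------
    A hypothetical driver sequence; the constructor arguments shown are
    illustrative stand-ins for whatever the parent conversion task passes
    through to `RepoConverter` (a sketch, not a tested recipe)::

        converter = RootRepoConverter(task=convertTask, root="/path/to/gen2")
        converter.prep()
        converter.runRawIngest()
        converter.runDefineVisits()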

    """

    def __init__(self, **kwds):
        super().__init__(run=None, **kwds)
        self._refCats: Dict[str, SkyPixDimension] = {}
        if self.task.config.rootSkyMapName is not None:
            self._rootSkyMap = self.task.config.skyMaps[self.task.config.rootSkyMapName].skyMap.apply()
        else:
            self._rootSkyMap = None  # All access to _rootSkyMap is guarded
        self._rawRefs = []

    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        # Docstring inherited from RepoConverter.
        return (
            super().isDatasetTypeSpecial(datasetTypeName)
            or datasetTypeName in ("raw", "ref_cat", "ref_cat_config")
            # in Gen2, some of these are in the root repo, not a calib repo
            or datasetTypeName in self.instrument.getCuratedCalibrationNames()
        )

    def getSpecialDirectories(self) -> List[str]:
        # Docstring inherited from RepoConverter.
        return super().getSpecialDirectories() + ["CALIB", "ref_cats", "rerun"]

    def findMatchingSkyMap(self, datasetTypeName: str) -> Tuple[Optional[BaseSkyMap], Optional[str]]:
        # Docstring inherited from StandardRepoConverter.findMatchingSkyMap.
        skyMap, name = super().findMatchingSkyMap(datasetTypeName)
        if skyMap is None and self.task.config.rootSkyMapName is not None:
            self.task.log.debug(
                "Assuming configured root skymap with name '%s' for dataset %s.",
                self.task.config.rootSkyMapName, datasetTypeName
            )
            skyMap = self._rootSkyMap
            name = self.task.config.rootSkyMapName
        return skyMap, name

    def runRawIngest(self, pool=None):
        """Ingest raw images from the root repo, delegating to the parent
        task's `RawIngestTask` subtask (a no-op if that subtask is disabled).
        """
        if self.task.raws is None:
            self.task.log.info("Skipping raw ingest for %s.", self.root)
            return
        self.task.log.info("Finding raws in root %s.", self.root)
        if self.subset is not None:
            # Query the Gen2 butler once per visit in the subset and chain
            # the per-visit datarefs together.
            dataRefs = itertools.chain.from_iterable(
                self.butler2.subset(self.task.config.rawDatasetType,
                                    visit=visit) for visit in self.subset.visits
            )
        else:
            dataRefs = self.butler2.subset(self.task.config.rawDatasetType)
        dataPaths = getDataPaths(dataRefs)
        if not self.task.dry_run:
            self.task.log.info("Ingesting raws from root %s into run %s.",
                               self.root, self.task.raws.butler.run)
            self._rawRefs.extend(self.task.raws.run(dataPaths, pool=pool))
        else:
            self.task.log.info("[dry run] Skipping ingesting raws from root %s into run %s.",
                               self.root, self.task.raws.butler.run)
        self._chain = [self.task.raws.butler.run]

    def runDefineVisits(self, pool=None):
        """Define visits from the exposures ingested by `runRawIngest`,
        delegating to the parent task's visit-definition subtask.
        """
        if self.task.defineVisits is None:
            self.task.log.info("Skipping visit definition for %s.", self.root)
            return
        # Collapse each raw ref's data ID to its exposure dimensions so that
        # each exposure is considered exactly once.
        dimensions = DimensionGraph(self.task.universe, names=["exposure"])
        exposureDataIds = set(ref.dataId.subset(dimensions) for ref in self._rawRefs)
        if not self.task.dry_run:
            self.task.log.info("Defining visits from exposures.")
            self.task.defineVisits.run(exposureDataIds, pool=pool)
        else:
            self.task.log.info("[dry run] Skipping defining visits from exposures.")

    def prep(self):
        # Docstring inherited from RepoConverter.
        # Gather information about reference catalogs.
        if self.task.isDatasetTypeIncluded("ref_cat") and len(self.task.config.refCats) != 0:
            from lsst.meas.algorithms import DatasetConfig as RefCatDatasetConfig
            for refCat in os.listdir(os.path.join(self.root, "ref_cats")):
                path = os.path.join(self.root, "ref_cats", refCat)
                configFile = os.path.join(path, "config.py")
                if not os.path.exists(configFile):
                    continue
                if refCat not in self.task.config.refCats:
                    continue
                self.task.log.info("Preparing ref_cat %s from root %s.", refCat, self.root)
                onDiskConfig = RefCatDatasetConfig()
                onDiskConfig.load(configFile)
                if onDiskConfig.indexer.name != "HTM":
                    raise ValueError(f"Reference catalog '{refCat}' uses unsupported "
                                     f"pixelization '{onDiskConfig.indexer.name}'.")
                # The catalog's HTM depth must correspond to an htm<level>
                # skypix dimension known to the Gen3 registry.
                level = onDiskConfig.indexer["HTM"].depth
                try:
                    dimension = self.task.universe[f"htm{level}"]
                except KeyError as err:
                    raise ValueError(f"Reference catalog {refCat} uses HTM level {level}, "
                                     f"but no htm{level} skypix dimension is configured "
                                     f"for this registry.") from err
                self.task.useSkyPix(dimension)
                self._refCats[refCat] = dimension
        if self.task.isDatasetTypeIncluded("brightObjectMask") and self.task.config.rootSkyMapName:
            self.task.useSkyMap(self._rootSkyMap, self.task.config.rootSkyMapName)
        super().prep()

    def iterDatasets(self) -> Iterator[FileDataset]:
        # Docstring inherited from RepoConverter.
        # Iterate over reference catalog files.
        for refCat, dimension in self._refCats.items():
            datasetType = DatasetType(refCat, dimensions=[dimension], universe=self.task.universe,
                                      storageClass="SimpleCatalog")
            if self.subset is None:
                # Gen2 refcat shards are files named "<pixelId>.fits", where
                # the pixel ID is an HTM index at the catalog's depth.
                regex = re.compile(r"(\d+)\.fits")
                for fileName in self.progress.wrap(os.listdir(os.path.join(self.root, "ref_cats", refCat)),
                                                   desc=f"Processing refcat {refCat}"):
                    m = regex.match(fileName)
                    if m is not None:
                        htmId = int(m.group(1))
                        dataId = self.task.registry.expandDataId({dimension: htmId})
                        yield FileDataset(path=os.path.join(self.root, "ref_cats", refCat, fileName),
                                          refs=DatasetRef(datasetType, dataId))
            else:
                # With a spatial subset, yield only the shards whose pixel
                # indices fall in the subset's skypix ranges.
                for begin, end in self.progress.wrap(self.subset.skypix[dimension],
                                                     desc=f"Processing ranges for refcat {refCat}"):
                    for htmId in range(begin, end):
                        dataId = self.task.registry.expandDataId({dimension: htmId})
                        yield FileDataset(path=os.path.join(self.root, "ref_cats", refCat,
                                                            f"{htmId}.fits"),
                                          refs=DatasetRef(datasetType, dataId))
        yield from super().iterDatasets()

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        # Docstring inherited from RepoConverter.
        if datasetTypeName in self._refCats:
            return self.instrument.makeRefCatCollectionName("gen2")
        return super().getRun(datasetTypeName, calibDate)

    def _finish(self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]],
                count: int) -> None:
        # Docstring inherited from RepoConverter.
        super()._finish(datasets, count)
        if self._refCats:
            # Set up a CHAINED collection named something like "refcats" to
            # also point to "refcats/gen2".  It's conceivable (but unlikely)
            # that "refcats/gen2" might not exist, if the scanner saw
            # reference catalog datasets on disk but none overlapped the area
            # of interest, so we register that here, too (multiple
            # registrations of collections are fine).
            chained = self.instrument.makeRefCatCollectionName()
            child = self.instrument.makeRefCatCollectionName("gen2")
            self.task.registry.registerCollection(chained, CollectionType.CHAINED)
            self.task.registry.registerCollection(child, CollectionType.RUN)
            children = list(self.task.registry.getCollectionChain(chained))
            children.append(child)
            self.task.registry.setCollectionChain(chained, children)
            # Also add "refcats" to the list of collections that contains
            # everything found in the root repo.  Normally this is done in
            # getRun, but here we want to add the (possibly new) CHAINED
            # collection instead of the RUN collection.
            self._chain.append(chained)
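

# Illustrative sketch of the net effect of RootRepoConverter._finish on the
# registry; the concrete collection names come from
# Instrument.makeRefCatCollectionName, and "refcats"/"refcats/gen2" are
# assumed defaults here:
#
#     registry.registerCollection("refcats", CollectionType.CHAINED)
#     registry.registerCollection("refcats/gen2", CollectionType.RUN)
#     children = list(registry.getCollectionChain("refcats")) + ["refcats/gen2"]
#     registry.setCollectionChain("refcats", children)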