# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["RootRepoConverter"]

import itertools
import os
import re
from typing import TYPE_CHECKING, Dict, Iterator, List, Mapping, Optional, Tuple

from lsst.daf.butler import CollectionType, DatasetRef, DatasetType, DimensionGraph, FileDataset
from lsst.skymap import BaseSkyMap

from .standardRepoConverter import StandardRepoConverter

SKYMAP_DATASET_TYPES = {coaddName: f"{coaddName}Coadd_skyMap" for coaddName in ("deep", "goodSeeing", "dcr")}
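# For reference, the comprehension above expands to:
#     {"deep": "deepCoadd_skyMap",
#      "goodSeeing": "goodSeeingCoadd_skyMap",
#      "dcr": "dcrCoadd_skyMap"}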

if TYPE_CHECKING:
    from lsst.daf.butler import SkyPixDimension


def getDataPaths(dataRefs):
    """Strip HDU identifiers from paths and return a unique set of paths.

    Parameters
    ----------
    dataRefs : iterable of `lsst.daf.persistence.ButlerDataRef`
        The Gen2 data references to strip "[HDU]" values from.

    Returns
    -------
    paths : `set` [`str`]
        The unique file paths without appended "[HDU]".
    """
    paths = set()
    for dataRef in dataRefs:
        path = dataRef.getUri()
        # Handle FITS files with multiple HDUs (e.g. DECam raws), whose Gen2
        # URIs carry an "[HDU]" suffix.
        paths.add(path.split("[")[0])
    return paths
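
# Example (illustrative path, not from the original source): a Gen2 DECam raw
# data reference may report a URI like "/repo/raw/instcal_0123.fits.fz[1]".
# getDataPaths() strips the "[1]" HDU suffix, so per-HDU duplicates collapse
# into a single "/repo/raw/instcal_0123.fits.fz" entry in the returned set.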


class RootRepoConverter(StandardRepoConverter):
    """A specialization of `RepoConverter` for root data repositories.

    `RootRepoConverter` adds support for raw images (mostly delegated to the
    parent task's `RawIngestTask` subtask) and reference catalogs.

    Parameters
    ----------
    kwds
        Keyword arguments are forwarded to (and required by) `RepoConverter`.
    """

    def __init__(self, **kwds):
        super().__init__(run=None, **kwds)
        self._refCats: Dict[str, SkyPixDimension] = {}
        if self.task.config.rootSkyMapName is not None:
            self._rootSkyMap = self.task.config.skyMaps[self.task.config.rootSkyMapName].skyMap.apply()
        else:
            self._rootSkyMap = None  # All access to _rootSkyMap is guarded
        self._rawRefs = []

    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        # Docstring inherited from RepoConverter.
        return (
            super().isDatasetTypeSpecial(datasetTypeName)
            or datasetTypeName in ("raw", "ref_cat", "ref_cat_config")
            # In Gen2, some of these are in the root repo, not a calib repo.
            or datasetTypeName in self.instrument.getCuratedCalibrationNames()
        )

    def getSpecialDirectories(self) -> List[str]:
        # Docstring inherited from RepoConverter.
        return super().getSpecialDirectories() + ["CALIB", "ref_cats", "rerun"]

    def findMatchingSkyMap(self, datasetTypeName: str) -> Tuple[Optional[BaseSkyMap], Optional[str]]:
        # Docstring inherited from StandardRepoConverter.findMatchingSkyMap.
        skyMap, name = super().findMatchingSkyMap(datasetTypeName)
        if skyMap is None and self.task.config.rootSkyMapName is not None:
            self.task.log.debug(
                "Assuming configured root skymap with name '%s' for dataset %s.",
                self.task.config.rootSkyMapName,
                datasetTypeName,
            )
            skyMap = self._rootSkyMap
            name = self.task.config.rootSkyMapName
        return skyMap, name
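
    # Added commentary (not in the original): when no skymap was found in the
    # repo itself, the skymap configured via config.rootSkyMapName is assumed
    # for the dataset, as the debug message above logs.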

    def runRawIngest(self, pool=None):
        if self.task.raws is None:
            self.task.log.info("Skipping raw ingest for %s.", self.root)
            return
        self.task.log.info("Finding raws in root %s.", self.root)
        if self.subset is not None:
            dataRefs = itertools.chain.from_iterable(
                self.butler2.subset(self.task.config.rawDatasetType, visit=visit)
                for visit in self.subset.visits
            )
        else:
            dataRefs = self.butler2.subset(self.task.config.rawDatasetType)
        dataPaths = getDataPaths(dataRefs)
        if not self.task.dry_run:
            self.task.log.info(
                "Ingesting raws from root %s into run %s.", self.root, self.task.raws.butler.run
            )
            self._rawRefs.extend(self.task.raws.run(dataPaths, pool=pool))
        else:
            self.task.log.info(
                "[dry run] Skipping ingesting raws from root %s into run %s.",
                self.root,
                self.task.raws.butler.run,
            )
        self._chain = [self.task.raws.butler.run]

    def runDefineVisits(self):
        if self.task.defineVisits is None:
            self.task.log.info("Skipping visit definition for %s.", self.root)
            return
        dimensions = DimensionGraph(self.task.universe, names=["exposure"])
        exposureDataIds = set(ref.dataId.subset(dimensions) for ref in self._rawRefs)
        if not self.task.dry_run:
            self.task.log.info("Defining visits from exposures.")
            self.task.defineVisits.run(exposureDataIds)
        else:
            self.task.log.info("[dry run] Skipping defining visits from exposures.")
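
    # Added commentary (not in the original): the DimensionGraph subsetting
    # above reduces each raw ref's data ID (instrument+exposure+detector) to
    # just its instrument+exposure part, so the set() deduplicates the
    # per-detector refs and each exposure is defined only once.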

    def prep(self):
        # Docstring inherited from RepoConverter.
        # Gather information about reference catalogs.
        if self.task.isDatasetTypeIncluded("ref_cat") and len(self.task.config.refCats) != 0:
            from lsst.meas.algorithms import DatasetConfig as RefCatDatasetConfig

            for refCat in os.listdir(os.path.join(self.root, "ref_cats")):
                path = os.path.join(self.root, "ref_cats", refCat)
                configFile = os.path.join(path, "config.py")
                if not os.path.exists(configFile):
                    continue
                if refCat not in self.task.config.refCats:
                    continue
                self.task.log.info("Preparing ref_cat %s from root %s.", refCat, self.root)
                onDiskConfig = RefCatDatasetConfig()
                onDiskConfig.load(configFile)
                if onDiskConfig.indexer.name != "HTM":
                    raise ValueError(
                        f"Reference catalog '{refCat}' uses unsupported "
                        f"pixelization '{onDiskConfig.indexer.name}'."
                    )
                level = onDiskConfig.indexer["HTM"].depth
                try:
                    dimension = self.task.universe[f"htm{level}"]
                except KeyError as err:
                    raise ValueError(
                        f"Reference catalog {refCat} uses HTM level {level}, but no htm{level} "
                        f"skypix dimension is configured for this registry."
                    ) from err
                self.task.useSkyPix(dimension)
                self._refCats[refCat] = dimension
        if self.task.isDatasetTypeIncluded("brightObjectMask") and self.task.config.rootSkyMapName:
            self.task.useSkyMap(self._rootSkyMap, self.task.config.rootSkyMapName)
        super().prep()
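
    # Illustrative Gen2 layout consumed by prep() (catalog name hypothetical):
    #     <root>/ref_cats/ps1_pv3/config.py    -> indexer "HTM", depth 7
    #     <root>/ref_cats/ps1_pv3/189584.fits  -> one HTM-indexed shard
    # A depth-7 HTM indexer maps to the registry's "htm7" skypix dimension.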

    def iterDatasets(self) -> Iterator[FileDataset]:
        # Docstring inherited from RepoConverter.
        # Iterate over reference catalog files.
        for refCat, dimension in self._refCats.items():
            datasetType = DatasetType(
                refCat, dimensions=[dimension], universe=self.task.universe, storageClass="SimpleCatalog"
            )
            if self.subset is None:
                regex = re.compile(r"(\d+)\.fits")
                for fileName in self.progress.wrap(
                    os.listdir(os.path.join(self.root, "ref_cats", refCat)),
                    desc=f"Processing refcat {refCat}",
                ):
                    m = regex.match(fileName)
                    if m is not None:
                        htmId = int(m.group(1))
                        dataId = self.task.registry.expandDataId({dimension: htmId})
                        yield FileDataset(
                            path=os.path.join(self.root, "ref_cats", refCat, fileName),
                            refs=DatasetRef(datasetType, dataId),
                        )
            else:
                for begin, end in self.progress.wrap(
                    self.subset.skypix[dimension], desc=f"Processing ranges for refcat {refCat}"
                ):
                    for htmId in range(begin, end):
                        dataId = self.task.registry.expandDataId({dimension: htmId})
                        yield FileDataset(
                            path=os.path.join(self.root, "ref_cats", refCat, f"{htmId}.fits"),
                            refs=DatasetRef(datasetType, dataId),
                        )
        yield from super().iterDatasets()
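
    # Illustrative match for the shard regex above: "189584.fits" yields
    # htmId == 189584, which becomes the skypix data ID for that FileDataset;
    # non-shard files such as "config.py" do not match and are skipped.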

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        # Docstring inherited from RepoConverter.
        if datasetTypeName in self._refCats:
            return self.instrument.makeRefCatCollectionName("gen2")
        return super().getRun(datasetTypeName, calibDate)
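
    # Added commentary (not in the original): per the comments in _finish
    # below, makeRefCatCollectionName("gen2") yields a RUN collection named
    # something like "refcats/gen2", so converted reference catalogs land
    # there instead of the default run.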

    def _finish(
        self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], count: int
    ) -> None:
        # Docstring inherited from RepoConverter.
        super()._finish(datasets, count)
        if self._refCats:
            # Set up a CHAINED collection named something like "refcats" to
            # also point to "refcats/gen2". It's conceivable (but unlikely)
            # that "refcats/gen2" might not exist, if the scanner saw reference
            # catalog datasets on disk but none overlapped the area of
            # interest, so we register that here, too (multiple registrations
            # of collections are fine).
            chained = self.instrument.makeRefCatCollectionName()
            child = self.instrument.makeRefCatCollectionName("gen2")
            self.task.registry.registerCollection(chained, CollectionType.CHAINED)
            self.task.registry.registerCollection(child, CollectionType.RUN)
            children = list(self.task.registry.getCollectionChain(chained))
            children.append(child)
            self.task.registry.setCollectionChain(chained, children)
            # Also add "refcats" to the list of collections that contains
            # everything found in the root repo. Normally this is done in
            # getRun, but here we want to add the (possibly new) CHAINED
            # collection instead of the RUN collection.
            self._chain.append(chained)
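
    # Illustrative end state (collection names as in the comments above):
    #     "refcats" (CHAINED) -> [..., "refcats/gen2" (RUN)]
    # with "refcats" appended to self._chain so the root-repo umbrella
    # collection also includes the converted reference catalogs.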