Coverage for python/lsst/ap/verify/ingestion.py: 30%

93 statements  

coverage.py v6.4.4, created at 2022-08-19 20:41 +0000

#
# This file is part of ap_verify.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""Data ingestion for ap_verify.

This module handles ingestion of an ap_verify dataset into an appropriate
repository, so that pipeline code need not be aware of the dataset framework.
"""

__all__ = ["Gen3DatasetIngestConfig", "ingestDatasetGen3"]

import fnmatch
import os
import re
import shutil
from glob import glob
import logging

import lsst.utils
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase

import lsst.daf.butler
import lsst.obs.base

_LOG = logging.getLogger(__name__)


class Gen3DatasetIngestConfig(pexConfig.Config):
    """Settings and defaults for `Gen3DatasetIngestTask`.

    The correct target for `ingester` can be found in the documentation of
    the appropriate ``obs`` package.
    """

    ingester = pexConfig.ConfigurableField(
        target=lsst.obs.base.RawIngestTask,
        doc="Task used to perform raw data ingestion.",
    )
    visitDefiner = pexConfig.ConfigurableField(
        target=lsst.obs.base.DefineVisitsTask,
        doc="Task used to organize raw exposures into visits.",
    )
    # Normally file patterns would be user input, but they are kept in a
    # config so that each ap_verify dataset can configure them.
    dataFiles = pexConfig.ListField(
        dtype=str,
        default=["*.fits", "*.fz", "*.fits.gz"],
        doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.",
    )
    dataBadFiles = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Names of raw science files (no path; wildcards allowed) not to ingest; "
            "supersedes ``dataFiles``.",
    )
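

# Illustrative only: an ap_verify dataset would typically set these fields in
# its config override file for the "datasetIngest-gen3" task. The patterns
# below are hypothetical, not shipped defaults:
#
#     config.dataFiles = ["raw_*.fits.fz"]
#     config.dataBadFiles = ["raw_corrupted_*.fits.fz"]
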

class Gen3DatasetIngestTask(pipeBase.Task):
    """Task for automating ingestion of an ap_verify dataset.

    Each dataset configures this task as appropriate for the files it provides
    and the target instrument. Therefore, this task takes no input besides the
    ap_verify dataset to load and the repositories to ingest to.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ``ap_verify`` dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location for all ``ap_verify`` outputs, including
        a Gen 3 repository.
    """

    ConfigClass = Gen3DatasetIngestConfig
    # The suffix is a de facto convention for distinguishing Gen 2 and Gen 3
    # config overrides.
    _DefaultName = "datasetIngest-gen3"

    def __init__(self, dataset, workspace, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workspace = workspace
        self.dataset = dataset
        # workspace.workButler is undefined until the repository is created
        self.dataset.makeCompatibleRepoGen3(self.workspace.repo)
        self.makeSubtask("ingester", butler=self.workspace.workButler)
        self.makeSubtask("visitDefiner", butler=self.workspace.workButler)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle
        return dict(**super()._reduce_kwargs(), dataset=self.dataset, workspace=self.workspace)

    def run(self, processes=1):
        """Ingest the contents of a dataset into a Butler repository.

        Parameters
        ----------
        processes : `int`
            The number of processes to use for ingestion.
        """
        self._ensureRaws(processes=processes)
        self._defineVisits(processes=processes)
        self._copyConfigs()
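    # Note: each step of ``run`` detects and skips work that was already done,
    # so the task can safely be rerun on a previously ingested workspace.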

    def _ensureRaws(self, processes):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Parameters
        ----------
        processes : `int`
            The number of processes to use for ingestion, if ingestion must
            be run.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is a workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        rawData = list(self.workspace.workButler.registry.queryDatasets(
            'raw',
            collections=rawCollections,
            dataId={"instrument": self.dataset.instrument.getName()})) \
            if rawCollections else []
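        # ``rawData`` is non-empty exactly when raws for this instrument were
        # already ingested into the default raw run collection.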

        if rawData:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles, processes=processes)
                self.log.info("Images are now ingested in %s", self.workspace.repo)
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles, processes):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        processes : `int`
            The number of processes to use for ingestion.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            # run=None because we expect the ingester to name a new collection.
            # HACK: update_exposure_records=True to modernize exposure records
            # from old ap_verify datasets. Since the exposure records are
            # generated from the same files, the only changes should be
            # schema-related.
            self.ingester.run(dataFiles, run=None, processes=processes, update_exposure_records=True)
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self, processes):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to define visits.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
        """
        exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"]))
        if not exposures:
            raise RuntimeError(f"No exposures defined in {self.workspace.repo}.")

        exposureKeys = list(exposures)[0].graph
        exposuresWithVisits = {x.subset(exposureKeys) for x in
                               self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])}
        exposuresNoVisits = exposures - exposuresWithVisits
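        # The exposure+visit query returns data IDs with extra dimensions, so
        # ``subset`` projects them down to the exposure dimensions before the
        # set difference above is taken.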

        if exposuresNoVisits:
            self.log.info("Defining visits...")
            self.visitDefiner.run(exposuresNoVisits)
        else:
            self.log.info("Visits were previously defined, skipping...")

    def _copyConfigs(self):
        """Give a workspace a copy of all configs associated with the
        ingested data.

        After this method returns, the config directory in the workspace
        contains all config files from the ap_verify dataset, and the
        pipelines directory in the workspace contains all pipeline files
        from the dataset.
        """
        if os.listdir(self.workspace.pipelineDir):
            self.log.info("Configs already copied, skipping...")
        else:
            self.log.info("Storing data-specific configs...")
            for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']):
                shutil.copy2(configFile, self.workspace.configDir)
            self.log.info("Configs are now stored in %s.", self.workspace.configDir)
            for pipelineFile in _findMatchingFiles(self.dataset.pipelineLocation, ['*.yaml']):
                shutil.copy2(pipelineFile, self.workspace.pipelineDir)
            self.log.info("Pipelines are now stored in %s.", self.workspace.pipelineDir)


def ingestDatasetGen3(dataset, workspace, processes=1):
    """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository.

    The original data directory is not modified.

    Parameters
    ----------
    dataset : `lsst.ap.verify.dataset.Dataset`
        The ap_verify dataset to be ingested.
    workspace : `lsst.ap.verify.workspace.WorkspaceGen3`
        The abstract location where the repository will be created, if it
        does not already exist.
    processes : `int`
        The number of processes to use for ingestion.
    """
    log = _LOG.getChild("ingestDataset")

    ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset))
    ingester.run(processes=processes)
    log.info("Data ingested")
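

# Illustrative usage (the ``dataset`` and ``workspace`` objects are assumed to
# be constructed elsewhere, e.g. by ap_verify's command-line driver):
#
#     ingestDatasetGen3(dataset, workspace, processes=4)
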

def _getConfig(task, dataset):
    """Return the ingestion config associated with a specific dataset.

    Parameters
    ----------
    task : `lsst.pipe.base.Task`-type
        The task whose config is needed.
    dataset : `lsst.ap.verify.dataset.Dataset`
        The dataset whose ingestion config is desired.

    Returns
    -------
    config : ``task.ConfigClass``
        The config for running ``task`` on ``dataset``.
    """
    config = task.ConfigClass()
    dataset.instrument.applyConfigOverrides(task._DefaultName, config)
    return config
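

# For example, _getConfig(Gen3DatasetIngestTask, dataset) returns a
# Gen3DatasetIngestConfig with any "datasetIngest-gen3" overrides defined by
# the dataset's instrument already applied.
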

def _findMatchingFiles(basePath, include, exclude=None):
    """Recursively identify files matching one set of patterns and not matching another.

    Parameters
    ----------
    basePath : `str`
        The path on disk where the files in ``include`` are located.
    include : iterable of `str`
        A collection of files (with wildcards) to include. Must not
        contain paths.
    exclude : iterable of `str`, optional
        A collection of filenames (with wildcards) to exclude. Must not
        contain paths. If omitted, all files matching ``include`` are returned.

    Returns
    -------
    files : `set` of `str`
        The files in ``basePath`` or any subdirectory that match ``include``
        but not ``exclude``.
    """
    _exclude = exclude if exclude is not None else []

    allFiles = set()
    for pattern in include:
        allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True))

    for pattern in _exclude:
        excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)]
        allFiles.difference_update(excludedFiles)
    return allFiles
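

# Illustrative only (hypothetical paths): _findMatchingFiles("/data/raws",
# ["*.fits", "*.fz"], exclude=["bad_*"]) returns every matching FITS file at
# any depth under /data/raws whose basename does not match "bad_*".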