#
# This file is part of ap_verify.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""Data ingestion for ap_verify.

This module handles ingestion of an ap_verify dataset into an appropriate
repository, so that pipeline code need not be aware of the dataset framework.
"""

__all__ = ["Gen3DatasetIngestConfig", "ingestDatasetGen3"]

import fnmatch
import os
import re
import shutil
from glob import glob
import logging

import lsst.utils
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase

import lsst.daf.butler
import lsst.obs.base

_LOG = logging.getLogger(__name__)


class Gen3DatasetIngestConfig(pexConfig.Config):
50 """Settings and defaults for `Gen3DatasetIngestTask`. 

51 

52 The correct target for `ingester` can be found in the documentation of 

53 the appropriate ``obs`` package. 
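
    Examples
    --------
    A dataset's config override file can adjust the file patterns; a minimal
    sketch, assuming ``config`` is the `Gen3DatasetIngestConfig` instance
    being overridden (the patterns shown are hypothetical)::

        config.dataFiles = ["*.fits.fz"]
        config.dataBadFiles = ["*_bias*.fits.fz"]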

54 """ 

55 

56 ingester = pexConfig.ConfigurableField( 

57 target=lsst.obs.base.RawIngestTask, 

58 doc="Task used to perform raw data ingestion.", 

59 ) 

60 visitDefiner = pexConfig.ConfigurableField( 

61 target=lsst.obs.base.DefineVisitsTask, 

62 doc="Task used to organize raw exposures into visits.", 

63 ) 

64 # Normally file patterns should be user input, but put them in a config so 

65 # the ap_verify dataset can configure them 

66 dataFiles = pexConfig.ListField( 

67 dtype=str, 

68 default=["*.fits", "*.fz", "*.fits.gz"], 

69 doc="Names of raw science files (no path; wildcards allowed) to ingest from the ap_verify dataset.", 

70 ) 

71 dataBadFiles = pexConfig.ListField( 

72 dtype=str, 

73 default=[], 

74 doc="Names of raw science files (no path; wildcards allowed) to not ingest, " 

75 "supersedes ``dataFiles``.", 

76 ) 

77 

78 

79class Gen3DatasetIngestTask(pipeBase.Task): 

80 """Task for automating ingestion of a ap_verify dataset. 

81 

82 Each dataset configures this task as appropriate for the files it provides 

83 and the target instrument. Therefore, this task takes no input besides the 

84 ap_verify dataset to load and the repositories to ingest to. 

85 

86 Parameters 

87 ---------- 

88 dataset : `lsst.ap.verify.dataset.Dataset` 

89 The ``ap_verify`` dataset to be ingested. 

90 workspace : `lsst.ap.verify.workspace.WorkspaceGen3` 

91 The abstract location for all ``ap_verify`` outputs, including 

92 a Gen 3 repository. 
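
    Examples
    --------
    A minimal usage sketch, assuming ``dataset`` and ``workspace`` objects
    have already been constructed as described above (a real run would apply
    the dataset's config overrides, as `ingestDatasetGen3` does)::

        task = Gen3DatasetIngestTask(dataset, workspace,
                                     config=Gen3DatasetIngestConfig())
        task.run(processes=1)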

93 """ 

94 

95 ConfigClass = Gen3DatasetIngestConfig 

96 # Suffix is de-facto convention for distinguishing Gen 2 and Gen 3 config overrides 

97 _DefaultName = "datasetIngest-gen3" 

98 

99 def __init__(self, dataset, workspace, *args, **kwargs): 

100 super().__init__(*args, **kwargs) 

101 self.workspace = workspace 

102 self.dataset = dataset 

103 # workspace.workButler is undefined until the repository is created 

104 self.dataset.makeCompatibleRepoGen3(self.workspace.repo) 

105 self.makeSubtask("ingester", butler=self.workspace.workButler) 

106 self.makeSubtask("visitDefiner", butler=self.workspace.workButler) 

107 

108 def _reduce_kwargs(self): 

109 # Add extra parameters to pickle 

110 return dict(**super()._reduce_kwargs(), dataset=self.dataset, workspace=self.workspace) 

111 

112 def run(self, processes=1): 

113 """Ingest the contents of a dataset into a Butler repository. 

114 

115 Parameters 

116 ---------- 

117 processes : `int` 

118 The number processes to use to ingest. 

119 """ 

        self._ensureRaws(processes=processes)
        self._defineVisits(processes=processes)
        self._copyConfigs()

    def _ensureRaws(self, processes):
        """Ensure that the repository in ``workspace`` has raws ingested.

        After this method returns, this task's repository contains all science
        data from this task's ap_verify dataset. Butler operations on the
        repository are not able to modify ``dataset`` in any way.

        Parameters
        ----------
        processes : `int`
            The number of processes to use for ingestion, if ingestion must
            be run.

        Raises
        ------
        RuntimeError
            Raised if there are no files to ingest.
        """
        # TODO: regex is a workaround for DM-25945
        rawCollectionFilter = re.compile(self.dataset.instrument.makeDefaultRawIngestRunName())
        rawCollections = list(self.workspace.workButler.registry.queryCollections(rawCollectionFilter))
        rawData = list(self.workspace.workButler.registry.queryDatasets(
            'raw',
            collections=rawCollections,
            dataId={"instrument": self.dataset.instrument.getName()})) \
            if rawCollections else []

        if rawData:
            self.log.info("Raw images for %s were previously ingested, skipping...",
                          self.dataset.instrument.getName())
        else:
            self.log.info("Ingesting raw images...")
            dataFiles = _findMatchingFiles(self.dataset.rawLocation, self.config.dataFiles,
                                           exclude=self.config.dataBadFiles)
            if dataFiles:
                self._ingestRaws(dataFiles, processes=processes)
                self.log.info("Images are now ingested in %s", self.workspace.repo)
            else:
                raise RuntimeError("No raw files found at %s." % self.dataset.rawLocation)

    def _ingestRaws(self, dataFiles, processes):
        """Ingest raw images into a repository.

        This task's repository is populated with *links* to ``dataFiles``.

        Parameters
        ----------
        dataFiles : `list` of `str`
            A list of filenames to ingest. May contain wildcards.
        processes : `int`
            The number of processes to use for ingestion.

        Raises
        ------
        RuntimeError
            Raised if ``dataFiles`` is empty or any file has already been ingested.
        """
        if not dataFiles:
            raise RuntimeError("No raw files to ingest (expected list of filenames, got %r)." % dataFiles)

        try:
            # run=None because we expect the ingester to name a new collection
            self.ingester.run(dataFiles, run=None, processes=processes)
        except lsst.daf.butler.registry.ConflictingDefinitionError as detail:
            raise RuntimeError("Not all raw files are unique") from detail

    def _defineVisits(self, processes):
        """Map visits to the ingested exposures.

        This step is necessary to be able to run most pipelines on raw datasets.

        Parameters
        ----------
        processes : `int`
            The number of processes to use to define visits.

        Raises
        ------
        RuntimeError
            Raised if there are no exposures in the repository.
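
        Notes
        -----
        This method is idempotent: it compares the ingested exposures against
        those that already have visits defined (on the exposure dimensions
        only) and runs the visit definer solely on the difference.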

203 """ 

204 exposures = set(self.workspace.workButler.registry.queryDataIds(["exposure"])) 

205 if not exposures: 

206 raise RuntimeError(f"No exposures defined in {self.workspace.repo}.") 

207 

208 exposureKeys = list(exposures)[0].graph 

209 exposuresWithVisits = {x.subset(exposureKeys) for x in 

210 self.workspace.workButler.registry.queryDataIds(["exposure", "visit"])} 

211 exposuresNoVisits = exposures - exposuresWithVisits 

212 if exposuresNoVisits: 

213 self.log.info("Defining visits...") 

214 self.visitDefiner.run(exposuresNoVisits) 

215 else: 

216 self.log.info("Visits were previously defined, skipping...") 

217 

218 def _copyConfigs(self): 

219 """Give a workspace a copy of all configs associated with the 

220 ingested data. 

221 

222 After this method returns, the config directory in the workspace 

223 contains all config files from the ap_verify dataset, and the 

224 pipelines directory in the workspace contains all pipeline files 

225 from the dataset. 
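
        Notes
        -----
        A non-empty pipelines directory in the workspace is taken as the
        marker that both configs and pipelines were previously copied.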

226 """ 

227 if os.listdir(self.workspace.pipelineDir): 

228 self.log.info("Configs already copied, skipping...") 

229 else: 

230 self.log.info("Storing data-specific configs...") 

231 for configFile in _findMatchingFiles(self.dataset.configLocation, ['*.py']): 

232 shutil.copy2(configFile, self.workspace.configDir) 

233 self.log.info("Configs are now stored in %s.", self.workspace.configDir) 

234 for pipelineFile in _findMatchingFiles(self.dataset.pipelineLocation, ['*.yaml']): 

235 shutil.copy2(pipelineFile, self.workspace.pipelineDir) 

236 self.log.info("Configs are now stored in %s.", self.workspace.pipelineDir) 

237 

238 

239def ingestDatasetGen3(dataset, workspace, processes=1): 

240 """Ingest the contents of an ap_verify dataset into a Gen 3 Butler repository. 

241 

242 The original data directory is not modified. 

243 

244 Parameters 

245 ---------- 

246 dataset : `lsst.ap.verify.dataset.Dataset` 

247 The ap_verify dataset to be ingested. 

248 workspace : `lsst.ap.verify.workspace.WorkspaceGen3` 

249 The abstract location where the epository is be created, if it does 

250 not already exist. 

251 processes : `int` 

252 The number processes to use to ingest. 
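
    Examples
    --------
    A minimal sketch, assuming ``dataset`` and ``workspace`` have already
    been constructed from an ap_verify dataset and an output directory::

        ingestDatasetGen3(dataset, workspace, processes=4)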

253 """ 

254 log = _LOG.getChild("ingestDataset") 

255 

256 ingester = Gen3DatasetIngestTask(dataset, workspace, config=_getConfig(Gen3DatasetIngestTask, dataset)) 

257 ingester.run(processes=processes) 

258 log.info("Data ingested") 

259 

260 

261def _getConfig(task, dataset): 

262 """Return the ingestion config associated with a specific dataset. 

263 

264 Parameters 

265 ---------- 

266 task : `lsst.pipe.base.Task`-type 

267 The task whose config is needed 

268 dataset : `lsst.ap.verify.dataset.Dataset` 

269 The dataset whose ingestion config is desired. 

270 

271 Returns 

272 ------- 

273 config : ``task.ConfigClass`` 

274 The config for running ``task`` on ``dataset``. 

275 """ 

276 config = task.ConfigClass() 

277 dataset.instrument.applyConfigOverrides(task._DefaultName, config) 

278 return config 

279 

280 

281def _findMatchingFiles(basePath, include, exclude=None): 

282 """Recursively identify files matching one set of patterns and not matching another. 

283 

284 Parameters 

285 ---------- 

286 basePath : `str` 

287 The path on disk where the files in ``include`` are located. 

288 include : iterable of `str` 

289 A collection of files (with wildcards) to include. Must not 

290 contain paths. 

291 exclude : iterable of `str`, optional 

292 A collection of filenames (with wildcards) to exclude. Must not 

293 contain paths. If omitted, all files matching ``include`` are returned. 

294 

295 Returns 

296 ------- 

297 files : `set` of `str` 

298 The files in ``basePath`` or any subdirectory that match ``include`` 

299 but not ``exclude``. 
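
    Examples
    --------
    Given a hypothetical tree containing ``raw/a.fits``, ``raw/sub/b.fits``,
    and ``raw/notes.txt``, the call::

        _findMatchingFiles("raw", ["*.fits"], exclude=["b.*"])

    would return ``{"raw/a.fits"}``: ``raw/sub/b.fits`` is excluded because
    its basename matches an exclude pattern, and ``raw/notes.txt`` never
    matches ``include``.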

300 """ 

301 _exclude = exclude if exclude is not None else [] 

302 

303 allFiles = set() 

304 for pattern in include: 

305 allFiles.update(glob(os.path.join(basePath, '**', pattern), recursive=True)) 

306 

307 for pattern in _exclude: 

308 excludedFiles = [f for f in allFiles if fnmatch.fnmatch(os.path.basename(f), pattern)] 

309 allFiles.difference_update(excludedFiles) 

310 return allFiles