Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 10%

101 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-05 18:04 -0800

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

21 

# Public API of this module.
__all__ = ["PreExecInit"]

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27import logging 

28import itertools 

29 

30# ----------------------------- 

31# Imports for other modules -- 

32# ----------------------------- 

33from lsst.base import Packages 

34from lsst.daf.butler import DatasetType 

35from lsst.pipe.base import PipelineDatasetTypes 

36 

# Module logger. partition(".")[2] keeps everything after the first dot,
# i.e. it strips the leading top-level package component from __name__
# (presumably "lsst." — NOTE(review): for a dot-free __name__ this yields
# "" and hence the root logger; confirm this is intended).
_LOG = logging.getLogger(__name__.partition(".")[2])

38 

39 

class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    """
    def __init__(self, butler, taskFactory, extendRun=False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun
        # extendRun compares new data against datasets already stored in
        # butler.run, so that collection must be defined up front.
        if self.extendRun and self.butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        pipeline = graph.taskGraph
        # Collect every dataset type the pipeline can produce, including the
        # per-task config datasets and the packages dataset.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=self.butler.registry,
                                                         include_configs=True, include_packages=True)
        for datasetType in itertools.chain(datasetTypes.initIntermediates, datasetTypes.initOutputs,
                                           datasetTypes.intermediates, datasetTypes.outputs):
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                self.butler.registry.registerDatasetType(datasetType)
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types is forgotten.
                    raise KeyError(f"Dataset type with name '{datasetType.name}' not found. Dataset types "
                                   "have to be registered with either `butler register-dataset-type` or "
                                   "passing `--register-dataset-types` option to `pipetask run`.") from None
                if datasetType.isComponent() \
                        and datasetType.parentStorageClass == DatasetType.PlaceholderParentStorageClass:
                    # Force the parent storage classes to match since we
                    # are using a placeholder
                    datasetType.finalizeParentStorageClass(expected.parentStorageClass)
                if expected != datasetType:
                    raise ValueError(f"DatasetType configuration does not match Registry: "
                                     f"{datasetType} != {expected}")

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but type of existing object in
            butler is different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already
            exists. Content of a butler collection may be changed if
            exception is raised.

        Notes
        -----
        If ``extendRun`` is `True` then existing datasets are not
        overwritten, instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of object is, of course, non-trivial. Current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that object data is identical too but presently
        there is no generic way to compare objects. In the future we can
        potentially introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            # Instantiating the task produces its init-output objects as
            # attributes named after the connections.
            task = self.taskFactory.makeTask(taskDef.taskClass,
                                             taskDef.label,
                                             taskDef.config,
                                             None,
                                             self.butler)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.extendRun:
                    # check if it is there already
                    _LOG.debug("Retrieving InitOutputs for task=%s key=%s dsTypeName=%s",
                               task, name, attribute.name)
                    try:
                        objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run])
                        # Types are supposed to be identical.
                        # TODO: Check that object contents is identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(f"Stored initOutput object type {type(objFromStore)} "
                                            f"is different from task-generated type "
                                            f"{type(initOutputVar)} for task {taskDef}")
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if objFromStore is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    self.butler.put(initOutputVar, attribute.name, {})

    def saveConfigs(self, graph):
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but existing object in butler is
            different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exists.
            Content of a butler collection should not be changed if exception
            is raised.
        """
        def logConfigMismatch(msg):
            """Log messages about configuration mismatch.
            """
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            for taskDef in graph.taskGraph:
                configName = taskDef.configDatasetName

                oldConfig = None
                if self.extendRun:
                    try:
                        oldConfig = self.butler.get(configName, {}, collections=[self.butler.run])
                        if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                            raise TypeError(
                                f"Config does not match existing task config {configName!r} in butler; "
                                "tasks configurations must be consistent within the same run collection")
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if oldConfig is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
                    self.butler.put(taskDef.config, configName, {})

    def savePackageVersions(self, graph):
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but existing object in butler is
            different from new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)
        datasetType = PipelineDatasetTypes.packagesDatasetName
        dataId = {}
        oldPackages = None
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            if self.extendRun:
                try:
                    oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run])
                    _LOG.debug("old packages: %s", oldPackages)
                except (LookupError, FileNotFoundError):
                    # FileNotFoundError likely means execution butler where
                    # refs do exist but datastore artifacts do not.
                    pass
            if oldPackages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                diff = packages.difference(oldPackages)
                if diff:
                    versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
                    raise TypeError(f"Package versions mismatch: ({versions_str})")
                else:
                    _LOG.debug("new packages are consistent with old")
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(oldPackages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    oldPackages.update(packages)
                    # have to remove existing dataset first, butler has no
                    # replace option.
                    ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run])
                    self.butler.pruneDatasets([ref], unstore=True, purge=True)
                    self.butler.put(oldPackages, datasetType, dataId)
            else:
                self.butler.put(packages, datasetType, dataId)