# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__all__ = ['PreExecInit']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import logging
import itertools

# -----------------------------
#  Imports for other modules --
# -----------------------------
from lsst.base import Packages
from lsst.daf.butler import DatasetType
from lsst.pipe.base import PipelineDatasetTypes

_LOG = logging.getLogger(__name__.partition(".")[2])


class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all operations that have to be performed on the
    butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    skipExisting : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible.
        If `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
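
    Examples
    --------
    A minimal usage sketch; ``butler``, ``taskFactory`` and ``qgraph`` are
    assumed to have been created elsewhere and the names are illustrative::

        preExecInit = PreExecInit(butler, taskFactory, skipExisting=True)
        preExecInit.initialize(qgraph, registerDatasetTypes=True)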

    """

    def __init__(self, butler, taskFactory, skipExisting=False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.skipExisting = skipExisting
        if self.skipExisting and self.butler.run is None:
            raise RuntimeError(
                "Cannot perform skipExisting logic unless butler is initialized "
                "with a default output RUN collection."
            )

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options, it is also possible
        to call the methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If `True` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If `True` then register dataset types in registry, otherwise
            they must already be registered.
        saveVersions : `bool`, optional
            If `False` then do not save package versions even if
            ``saveInitOutputs`` is set to `True`.
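
        Examples
        --------
        Calling the individual steps directly is roughly equivalent to
        ``initialize(qgraph, registerDatasetTypes=True)``; ``qgraph`` is an
        illustrative name for a `~lsst.pipe.base.QuantumGraph` instance::

            preExecInit.initializeDatasetTypes(qgraph, registerDatasetTypes=True)
            preExecInit.saveInitOutputs(qgraph)
            preExecInit.saveConfigs(qgraph)
            preExecInit.savePackageVersions(qgraph)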

        """

        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If `True` then register dataset types in registry, otherwise
            they must already be registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType is different from the
            DatasetType defined in the graph.
        KeyError
            Raised if ``registerDatasetTypes`` is `False` and a DatasetType
            does not exist in registry.
        """
        pipeline = graph.taskGraph
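
        # Config and package version datasets (created below) are defined
        # with an empty dimension set, so a run collection can hold at most
        # one dataset of each such type.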

        # Make dataset types for configurations
        configDatasetTypes = [DatasetType(taskDef.configDatasetName, {},
                                          storageClass="Config",
                                          universe=self.butler.registry.dimensions)
                              for taskDef in pipeline]

        # And one dataset type for package versions
        packagesDatasetType = DatasetType("packages", {},
                                          storageClass="Packages",
                                          universe=self.butler.registry.dimensions)

        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=self.butler.registry)
        for datasetType in itertools.chain(datasetTypes.initIntermediates, datasetTypes.initOutputs,
                                           datasetTypes.intermediates, datasetTypes.outputs,
                                           configDatasetTypes, [packagesDatasetType]):
            # Only composites are registered, not components, and by this
            # point the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # This is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                self.butler.registry.registerDatasetType(datasetType)
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                expected = self.butler.registry.getDatasetType(datasetType.name)
                if datasetType.isComponent() \
                        and datasetType.parentStorageClass == DatasetType.PlaceholderParentStorageClass:
                    # Force the parent storage classes to match since we
                    # are using a placeholder.
                    datasetType.finalizeParentStorageClass(expected.parentStorageClass)
                if expected != datasetType:
                    raise ValueError(f"DatasetType configuration does not match Registry: "
                                     f"{datasetType} != {expected}")

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        Exception
            Raised if ``skipExisting`` is `False` and datasets already
            exist. The content of a butler collection may be changed if
            an exception is raised.

        Notes
        -----
        If ``skipExisting`` is `True` then existing datasets are not
        overwritten; instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary objects is, of course, non-trivial. The current
        implementation only checks for the existence of the datasets and
        compares their types against the types of the objects produced by
        the tasks. Ideally we would like to check that the object data is
        identical too, but presently there is no generic way to compare
        objects. In the future we can potentially introduce some extensible
        mechanism for that.
        """

        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            task = self.taskFactory.makeTask(taskDef.taskClass,
                                             taskDef.label,
                                             taskDef.config,
                                             None,
                                             self.butler)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.skipExisting:
                    # check if it is there already
                    _LOG.debug("Retrieving InitOutputs for task=%s key=%s dsTypeName=%s",
                               task, name, attribute.name)
                    try:
                        objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run])
                        # Types are supposed to be identical.
                        # TODO: Check that object contents are identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(f"Stored initOutput object type {type(objFromStore)} "
                                            f"is different from task-generated type "
                                            f"{type(initOutputVar)} for task {taskDef}")
                    except LookupError:
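                        # No stored dataset was found in the output run; the
                        # init output will be written below.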
                        pass
                if objFromStore is None:
                    # butler will raise an exception if the dataset is
                    # already there
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", task, name)
                    self.butler.put(initOutputVar, attribute.name, {})

    def saveConfigs(self, graph):
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        Exception
            Raised if ``skipExisting`` is `False` and datasets already
            exist. The content of a butler collection should not change
            if an exception is raised.
        """

        def logConfigMismatch(msg):
            """Log messages about configuration mismatch.
            """
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            for taskDef in graph.taskGraph:
                configName = taskDef.configDatasetName

                oldConfig = None
                if self.skipExisting:
                    try:
                        oldConfig = self.butler.get(configName, {}, collections=[self.butler.run])
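                        # shortcut=False presumably so that compare() reports
                        # all mismatches through logConfigMismatch rather
                        # than stopping at the first difference.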

                        if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                            raise TypeError(
                                f"Config does not match existing task config {configName!r} in butler; "
                                "task configurations must be consistent within the same run collection")
                    except LookupError:
                        pass
                if oldConfig is None:
                    # butler will raise an exception if the dataset is
                    # already there
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
                    self.butler.put(taskDef.config, configName, {})

    def savePackageVersions(self, graph):
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        Exception
            Raised if ``skipExisting`` is `True` but stored package versions
            are not compatible with the current ones.
        """
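        # `Packages.fromSystem` below can only detect Python modules that
        # have already been imported in this process, so the snapshot may be
        # more or less complete than a previously stored one (see the
        # consistency note further down).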
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)
        datasetType = "packages"
        dataId = {}
        oldPackages = None
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            if self.skipExisting:
                try:
                    oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run])
                    _LOG.debug("old packages: %s", oldPackages)
                except LookupError:
                    pass
            if oldPackages is not None:
                # Note that because we can only detect Python modules that
                # have been imported, the stored list of products may be
                # more or less complete than what we have now.  What is
                # important is that the products in common have the same
                # version.
                diff = packages.difference(oldPackages)
                if diff:
                    versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
                    raise TypeError(f"Package versions mismatch: ({versions_str})")
                else:
                    _LOG.debug("new packages are consistent with old")
                # Update the old set of packages in case we have more
                # packages that haven't been persisted.
                extra = packages.extra(oldPackages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    oldPackages.update(packages)
                    # Have to remove the existing dataset first; butler has
                    # no replace option.
                    ref = self.butler.registry.findDataset(datasetType, dataId,
                                                           collections=[self.butler.run])
                    self.butler.pruneDatasets([ref], unstore=True, purge=True)
                    self.butler.put(oldPackages, datasetType, dataId)
            else:
                self.butler.put(packages, datasetType, dataId)

298 _LOG.debug("extra packages: %s", extra) 

299 oldPackages.update(packages) 

300 # have to remove existing dataset first, butler nas no replace option 

301 ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run]) 

302 self.butler.pruneDatasets([ref], unstore=True, purge=True) 

303 self.butler.put(oldPackages, datasetType, dataId) 

304 else: 

305 self.butler.put(packages, datasetType, dataId)