Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

# Names exported by ``from <this module> import *``.
__all__ = ['PreExecInit']

23 

24# ------------------------------- 

25# Imports of standard modules -- 

26# ------------------------------- 

27import logging 

28import itertools 

29 

30# ----------------------------- 

31# Imports for other modules -- 

32# ----------------------------- 

33from lsst.base import Packages 

34from lsst.daf.butler import DatasetType 

35from lsst.pipe.base import PipelineDatasetTypes 

36 

# Module-level logger.  The logger name is ``__name__`` with its first dotted
# component removed (e.g. "pkg.sub.mod" -> "sub.mod").  If ``__name__``
# contains no dot the result is "" and `logging.getLogger` returns the root
# logger.
_LOG = logging.getLogger(__name__.partition(".")[2])

38 

39 

class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    skipExisting : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in the butler. If `False` then any existing conflicting dataset will
        cause butler exception.
    """

    def __init__(self, butler, taskFactory, skipExisting=False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.skipExisting = skipExisting

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # Register dataset types or check consistency; this always runs.
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            # Package versions are only written together with init outputs.
            if saveVersions:
                self.savePackageVersions(graph)

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.
        In addition to the dataset types of the pipeline itself this covers
        one "Config" dataset type per task and the single "packages" dataset
        type.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        registry = self.butler.registry
        pipeline = [nodes.taskDef for nodes in graph]

        # Make dataset types for configurations
        configDatasetTypes = [DatasetType(taskDef.configDatasetName, {},
                                          storageClass="Config",
                                          universe=registry.dimensions)
                              for taskDef in pipeline]

        # And one dataset type for package versions
        packagesDatasetType = DatasetType("packages", {},
                                          storageClass="Packages",
                                          universe=registry.dimensions)

        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        for datasetType in itertools.chain(datasetTypes.initIntermediates, datasetTypes.initOutputs,
                                           datasetTypes.intermediates, datasetTypes.outputs,
                                           configDatasetTypes, [packagesDatasetType]):
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                registry.registerDatasetType(datasetType)
            else:
                # Components, and everything when registration is disabled,
                # are only compared with what the registry already has.
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                expected = registry.getDatasetType(datasetType.name)
                if datasetType.isComponent() \
                        and datasetType.parentStorageClass == DatasetType.PlaceholderParentStorageClass:
                    # Force the parent storage classes to match since we
                    # are using a placeholder
                    datasetType.finalizeParentStorageClass(expected.parentStorageClass)
                if expected != datasetType:
                    raise ValueError("DatasetType configuration does not match Registry: "
                                     f"{datasetType} != {expected}")

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``skipExisting`` is `True` and the type of an already
            stored object differs from the type of the object produced by
            the task.
        Exception
            Raised if ``skipExisting`` is `False` and datasets already
            exists. Content of a butler collection may be changed if
            exception is raised.

        Notes
        -----
        If ``skipExisting`` is `True` then existing datasets are not
        overwritten, instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of object is, of course, non-trivial. Current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that object data is identical too but presently
        there is no generic way to compare objects. In the future we can
        potentially introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskNodes in graph:
            taskDef = taskNodes.taskDef
            # The task is instantiated only to obtain its init-output
            # attributes.
            task = self.taskFactory.makeTask(taskDef.taskClass, taskDef.config, None, self.butler)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.skipExisting:
                    # check if it is there already
                    _LOG.debug("Retrieving InitOutputs for task=%s key=%s dsTypeName=%s",
                               task, name, attribute.name)
                    # NOTE(review): unlike savePackageVersions() this lookup
                    # is not restricted to ``self.butler.run`` -- confirm
                    # that this is intended.
                    try:
                        objFromStore = self.butler.get(attribute.name, {})
                    except LookupError:
                        # Nothing stored yet, the object is written below.
                        pass
                    else:
                        # Types are supposed to be identical.
                        # TODO: Check that object contents is identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(f"Stored initOutput object type {type(objFromStore)} "
                                            f"is different from task-generated type "
                                            f"{type(initOutputVar)} for task {taskDef}")
                if objFromStore is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", task, name)
                    self.butler.put(initOutputVar, attribute.name, {})

    def saveConfigs(self, graph):
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``skipExisting`` is `True` and a stored configuration
            does not compare equal to the configuration of the task.
        Exception
            Raised if ``skipExisting`` is `False` and datasets already exists.
            Content of a butler collection should not be changed if exception
            is raised.
        """
        def logConfigMismatch(msg):
            """Log messages about configuration mismatch.
            """
            _LOG.critical("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            for taskNodes in graph:
                taskDef = taskNodes.taskDef
                configName = taskDef.configDatasetName

                oldConfig = None
                if self.skipExisting:
                    # NOTE(review): unlike savePackageVersions() this lookup
                    # is not restricted to ``self.butler.run`` -- confirm
                    # that this is intended.
                    try:
                        oldConfig = self.butler.get(configName, {})
                    except LookupError:
                        # Nothing stored yet, the config is written below.
                        pass
                    else:
                        # Compared outside of the ``try`` so that a lookup
                        # error raised by the comparison itself is not
                        # mistaken for a missing dataset.
                        if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                            raise TypeError(
                                f"Config does not match existing task config {configName!r} in butler; "
                                "tasks configurations must be consistent within the same run collection")
                if oldConfig is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
                    self.butler.put(taskDef.config, configName, {})

    def savePackageVersions(self, graph):
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph. Not used by this method.

        Raises
        ------
        TypeError
            Raised if ``skipExisting`` is `True` and versions of packages
            already stored in the run collection are not compatible with
            the current ones.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)
        datasetType = "packages"
        dataId = {}
        oldPackages = None
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            if self.skipExisting:
                try:
                    oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run])
                except LookupError:
                    # Nothing stored yet, current versions are written below.
                    pass
                else:
                    _LOG.debug("old packages: %s", oldPackages)
            if oldPackages is not None:
                # Note that because we can only detect python modules that have been imported, the stored
                # list of products may be more or less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                diff = packages.difference(oldPackages)
                if diff:
                    versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
                    raise TypeError(f"Package versions mismatch: ({versions_str})")
                _LOG.debug("new packages are consistent with old")
                # Update the old set of packages in case we have more packages that haven't been persisted.
                extra = packages.extra(oldPackages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    oldPackages.update(packages)
                    # have to remove existing dataset first, butler has no replace option
                    ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run])
                    self.butler.pruneDatasets([ref], unstore=True, purge=True)
                    self.butler.put(oldPackages, datasetType, dataId)
            else:
                self.butler.put(packages, datasetType, dataId)