Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


__all__ = ['PreExecInit']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import logging
import itertools

# -----------------------------
#  Imports for other modules --
# -----------------------------
from lsst.pipe.base import PipelineDatasetTypes

# The logger is named after everything that follows the first "." in the
# module's dotted name, i.e. the leading package component is dropped.
# When the module name contains no dot, ``partition`` yields an empty string
# and ``logging.getLogger("")`` returns the root logger.
_LOG = logging.getLogger(__name__.partition(".")[2])



class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    skipExisting : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in the butler. If `False` then any existing conflicting dataset will
        cause butler exception.
    clobberOutput : `bool`, optional
        If `True` then override all existing output datasets in an output
        collection.
    """
    def __init__(self, butler, taskFactory, skipExisting=False, clobberOutput=False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.skipExisting = skipExisting
        self.clobberOutput = clobberOutput

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False):
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save task "init outputs" to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # associate all existing datasets with output collection.
        self.updateOutputCollection(graph)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save
        if saveInitOutputs:
            self.saveInitOutputs(graph)

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        registry = self.butler.registry
        pipeline = [nodes.taskDef for nodes in graph]
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)

        # Only dataset types that the pipeline itself produces are handled
        # here (init-intermediates, init-outputs, intermediates, outputs).
        for datasetType in itertools.chain(datasetTypes.initIntermediates, datasetTypes.initOutputs,
                                           datasetTypes.intermediates, datasetTypes.outputs):
            if registerDatasetTypes:
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                registry.registerDatasetType(datasetType)
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                expected = registry.getDatasetType(datasetType.name)
                if expected != datasetType:
                    raise ValueError("DatasetType configuration does not match Registry: "
                                     f"{datasetType} != {expected}")

    def updateOutputCollection(self, graph):
        """Associate all existing datasets with output collection.

        For every Quantum in a graph make sure that its existing inputs are
        added to the Butler's output collection.

        For each quantum there are input and output DatasetRefs. With the
        current implementation of preflight output refs should not exist but
        input refs may belong to a different collection. We want all refs to
        appear in output collection, so we have to "copy" those refs.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        """
        def _refComponents(refs):
            """Return all resolved dataset components recursively."""
            for ref in refs:
                # A ref without an id is unresolved; it is skipped together
                # with its components.
                if ref.id is not None:
                    yield ref
                    yield from _refComponents(ref.components.values())

        collection = self.butler.run
        registry = self.butler.registry

        # Main issue here is that the same DatasetRef can appear as input for
        # many quanta, to keep them unique we first collect them into one
        # dict indexed by dataset id.
        id2ref = {}
        for _, quantum in graph.quanta():
            for refs in quantum.predictedInputs.values():
                for ref in _refComponents(refs):
                    id2ref[ref.id] = ref
        for initInput in graph.initInputs.values():
            id2ref[initInput.id] = initInput

        _LOG.debug("Associating %d datasets with output collection %s", len(id2ref), collection)

        refsToAdd = []
        refsToRemove = []
        if not self.skipExisting and not self.clobberOutput:
            # optimization - save all at once, butler will raise an exception
            # if any dataset is already there
            refsToAdd = list(id2ref.values())
        else:
            # skip or override existing ones
            for ref in id2ref.values():
                if registry.find(collection, ref.datasetType, ref.dataId) is None:
                    refsToAdd.append(ref)
                elif self.clobberOutput:
                    # replace this dataset
                    refsToRemove.append(ref)
                    refsToAdd.append(ref)
                # otherwise the dataset is already in the collection and
                # only ``skipExisting`` is set: leave it alone.

        # Removal has to happen before re-adding the replaced datasets.
        if refsToRemove:
            registry.disassociate(collection, refsToRemove)
        if refsToAdd:
            registry.associate(collection, refsToAdd)

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``skipExisting`` is `True` (and ``clobberOutput`` is
            `False`) and the type of an already stored object differs from
            the type of the object produced by the task.
        Exception
            Raised if ``skipExisting`` is `False` and datasets already
            exist. Content of a butler collection may be changed if
            exception is raised.

        Notes
        -----
        If ``skipExisting`` is `True` then existing datasets are not
        overwritten, instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of object is, of course, non-trivial. Current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that object data is identical too but presently
        there is no generic way to compare objects. In the future we can
        potentially introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskNodes in graph:
            taskDef = taskNodes.taskDef
            task = self.taskFactory.makeTask(taskDef.taskClass, taskDef.config, None, self.butler)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                # ``clobberOutput`` takes precedence over ``skipExisting``.
                if self.clobberOutput:
                    # Remove if it already exists.
                    collection = self.butler.run
                    registry = self.butler.registry
                    ref = registry.find(collection, attribute.name, {})
                    if ref is not None:
                        # It is not enough to remove dataset from collection,
                        # it has to be removed from butler too.
                        self.butler.remove(ref)
                elif self.skipExisting:
                    # check if it is there already
                    _LOG.debug("Retrieving InitOutputs for task=%s key=%s dsTypeName=%s",
                               task, name, attribute.name)
                    objFromStore = self.butler.get(attribute.name, {})
                    if objFromStore is not None:
                        # Types are supposed to be identical.
                        # TODO: Check that object contents is identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(f"Stored initOutput object type {type(objFromStore)} "
                                            "is different from task-generated type "
                                            f"{type(initOutputVar)} for task {taskDef}")
                if objFromStore is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", task, name)
                    self.butler.put(initOutputVar, attribute.name, {})