Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 8%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
__all__ = ['PreExecInit']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import itertools
import logging

# -----------------------------
#  Imports for other modules --
# -----------------------------
from lsst.pipe.base import PipelineDatasetTypes

# Module logger; drop the leading package component (e.g. "lsst.") from
# the logger name so log output uses the shorter dotted path.
_LOG = logging.getLogger(__name__.partition(".")[2])
class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    skipExisting : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in the butler. If `False` then any existing conflicting dataset will
        cause butler exception.
    clobberOutput : `bool`, optional
        If `True` then override all existing output datasets in an output
        collection.
    """
    def __init__(self, butler, taskFactory, skipExisting=False, clobberOutput=False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.skipExisting = skipExisting
        self.clobberOutput = clobberOutput

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False):
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save task "init outputs" to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # associate all existing datasets with output collection.
        self.updateOutputCollection(graph)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save
        if saveInitOutputs:
            self.saveInitOutputs(graph)

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        pipeline = [nodes.taskDef for nodes in graph]
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=self.butler.registry)
        for datasetType in itertools.chain(datasetTypes.initIntermediates, datasetTypes.initOutputs,
                                           datasetTypes.intermediates, datasetTypes.outputs):
            if registerDatasetTypes:
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                self.butler.registry.registerDatasetType(datasetType)
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                expected = self.butler.registry.getDatasetType(datasetType.name)
                if expected != datasetType:
                    raise ValueError(f"DatasetType configuration does not match Registry: "
                                     f"{datasetType} != {expected}")

    def updateOutputCollection(self, graph):
        """Associate all existing datasets with output collection.

        For every Quantum in a graph make sure that its existing inputs are
        added to the Butler's output collection.

        For each quantum there are input and output DatasetRefs. With the
        current implementation of preflight output refs should not exist but
        input refs may belong to a different collection. We want all refs to
        appear in output collection, so we have to "copy" those refs.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        """
        def _refComponents(refs):
            """Generate all resolved dataset refs, recursing into each
            ref's components."""
            for ref in refs:
                # Only resolved refs (those with a dataset id) are yielded.
                if ref.id is not None:
                    yield ref
                yield from _refComponents(ref.components.values())

        collection = self.butler.run
        registry = self.butler.registry

        # Main issue here is that the same DatasetRef can appear as input for
        # many quanta, to keep them unique we first collect them into one
        # dict indexed by dataset id.
        id2ref = {}
        for _, quantum in graph.quanta():
            for refs in quantum.predictedInputs.values():
                for ref in _refComponents(refs):
                    id2ref[ref.id] = ref
        for initInput in graph.initInputs.values():
            id2ref[initInput.id] = initInput

        _LOG.debug("Associating %d datasets with output collection %s", len(id2ref), collection)

        refsToAdd = []
        refsToRemove = []
        if not self.skipExisting and not self.clobberOutput:
            # optimization - save all at once, butler will raise an exception
            # if any dataset is already there
            refsToAdd = list(id2ref.values())
        else:
            # skip or override existing ones
            for ref in id2ref.values():
                if registry.find(collection, ref.datasetType, ref.dataId) is None:
                    refsToAdd.append(ref)
                elif self.clobberOutput:
                    # replace this dataset
                    refsToRemove.append(ref)
                    refsToAdd.append(ref)
        if refsToRemove:
            registry.disassociate(collection, refsToRemove)
        if refsToAdd:
            registry.associate(collection, refsToAdd)

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        Exception
            Raised if ``skipExisting`` is `False` and datasets already
            exists. Content of a butler collection may be changed if
            exception is raised.

        Notes
        -----
        If ``skipExisting`` is `True` then existing datasets are not
        overwritten, instead we should check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of object is, of course, non-trivial. Current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that object data is identical too but presently
        there is no generic way to compare objects. In the future we can
        potentially introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskNodes in graph:
            taskDef = taskNodes.taskDef
            task = self.taskFactory.makeTask(taskDef.taskClass, taskDef.config, None, self.butler)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.clobberOutput:
                    # Remove if it already exists.
                    collection = self.butler.run
                    registry = self.butler.registry
                    ref = registry.find(collection, attribute.name, {})
                    if ref is not None:
                        # It is not enough to remove dataset from collection,
                        # it has to be removed from butler too.
                        self.butler.remove(ref)
                elif self.skipExisting:
                    # check if it is there already
                    _LOG.debug("Retrieving InitOutputs for task=%s key=%s dsTypeName=%s",
                               task, name, attribute.name)
                    objFromStore = self.butler.get(attribute.name, {})
                    if objFromStore is not None:
                        # Types are supposed to be identical.
                        # TODO: Check that object contents is identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(f"Stored initOutput object type {type(objFromStore)} "
                                            f"is different from task-generated type "
                                            f"{type(initOutputVar)} for task {taskDef}")
                if objFromStore is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", task, name)
                    self.butler.put(initOutputVar, attribute.name, {})