# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["PreExecInit"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.base import Packages
from lsst.pipe.base import PipelineDatasetTypes

_LOG = logging.getLogger(__name__)
class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all operations that have to be performed on the
    butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible.
        If `False`, then any existing conflicting dataset will cause a
        butler exception to be raised.
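
    Examples
    --------
    A minimal usage sketch; the repository path and run name are purely
    illustrative, and ``graph`` must be a pre-built
    `~lsst.pipe.base.QuantumGraph`::

        from lsst.ctrl.mpexec import TaskFactory
        from lsst.daf.butler import Butler

        butler = Butler("/path/to/repo", run="demo_run")
        preExecInit = PreExecInit(butler, TaskFactory())
        preExecInit.initialize(graph, registerDatasetTypes=True)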
57 """
59 def __init__(self, butler, taskFactory, extendRun=False):
60 self.butler = butler
61 self.taskFactory = taskFactory
62 self.extendRun = extendRun
63 if self.extendRun and self.butler.run is None:
64 raise RuntimeError(
65 "Cannot perform extendRun logic unless butler is initialized "
66 "with a default output RUN collection."
67 )
    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
        """Perform all initialization steps.

        Convenience method that executes all initialization steps; instead
        of calling this method with all options, the individual steps can
        also be called one at a time (see the example below).

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If `True` (default) then save "init outputs", configurations,
            and package versions to the butler.
        registerDatasetTypes : `bool`, optional
            If `True` then register dataset types in the registry, otherwise
            they must already be registered.
        saveVersions : `bool`, optional
            If `False` then do not save package versions even if
            ``saveInitOutputs`` is set to `True`.
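
        Examples
        --------
        With the default options, calling this method is equivalent to
        running the individual steps; a sketch, assuming ``init`` is a
        `PreExecInit` instance and ``graph`` a
        `~lsst.pipe.base.QuantumGraph`::

            init.initializeDatasetTypes(graph, registerDatasetTypes=False)
            init.saveInitOutputs(graph)
            init.saveConfigs(graph)
            init.savePackageVersions(graph)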
89 """
90 # register dataset types or check consistency
91 self.initializeDatasetTypes(graph, registerDatasetTypes)
93 # Save task initialization data or check that saved data
94 # is consistent with what tasks would save
95 if saveInitOutputs:
96 self.saveInitOutputs(graph)
97 self.saveConfigs(graph)
98 if saveVersions:
99 self.savePackageVersions(graph)
    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to the registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If `True` then register dataset types in the registry, otherwise
            they must already be registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType differs from a DatasetType
            in the graph.
        KeyError
            Raised if ``registerDatasetTypes`` is `False` and a DatasetType
            does not exist in the registry.
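
        Examples
        --------
        A sketch; ``butler``, ``taskFactory`` and ``graph`` are assumed to
        have been created by the caller::

            init = PreExecInit(butler, taskFactory)
            # Register every dataset type produced by the graph (a no-op
            # for types that already exist and are consistent).
            init.initializeDatasetTypes(graph, registerDatasetTypes=True)
            # Or only verify; raises KeyError if any type is missing.
            init.initializeDatasetTypes(graph)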
123 """
124 pipeline = graph.taskGraph
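        # Include the per-task config dataset types and the packages dataset
        # type so they are registered/checked alongside the task outputs.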
        datasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.butler.registry, include_configs=True, include_packages=True
        )
        for datasetType in itertools.chain(
            datasetTypes.initIntermediates,
            datasetTypes.initOutputs,
            datasetTypes.intermediates,
            datasetTypes.outputs,
        ):
            # Only composites are registered, no components, and by this
            # point the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # This is a no-op if the dataset type already exists and is
                # consistent, and it raises if it is inconsistent.
                self.butler.registry.registerDatasetType(datasetType)
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types was
                    # forgotten.
                    raise KeyError(
                        f"Dataset type with name '{datasetType.name}' not found. Dataset types "
                        "have to be registered with either `butler register-dataset-type` or "
                        "passing `--register-dataset-types` option to `pipetask run`."
                    ) from None
                if expected != datasetType:
                    raise ValueError(
                        f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                    )

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but the type of an existing
            object in the butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            The content of a butler collection may be changed if an
            exception is raised.

        Notes
        -----
        If ``extendRun`` is `True` then existing datasets are not
        overwritten; instead we check that their stored object is exactly
        the same as what we would save at this time. Comparing arbitrary
        types of objects is, of course, non-trivial. The current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that the object data is identical too, but
        presently there is no generic way to compare objects. In the
        future we may introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            task = self.taskFactory.makeTask(
                taskDef.taskClass, taskDef.label, taskDef.config, None, self.butler
            )
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.extendRun:
                    # Check whether the init-output is already there.
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    try:
                        objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run])
                        # Types are supposed to be identical.
                        # TODO: Check that object contents are identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(
                                f"Stored initOutput object type {type(objFromStore)} "
                                f"is different from task-generated type "
                                f"{type(initOutputVar)} for task {taskDef}"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means an execution butler,
                        # where refs do exist but datastore artifacts do not.
                        pass
                if objFromStore is None:
                    # Butler will raise an exception if the dataset is
                    # already there.
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
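                    # Init-output datasets carry no dimensions, hence the
                    # empty data ID in the put call below.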
                    self.butler.put(initOutputVar, attribute.name, {})

    def saveConfigs(self, graph):
        """Write configurations for pipeline tasks to the butler, or check
        that existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in the
            butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            The content of a butler collection should not be changed if an
            exception is raised.
        """

        def logConfigMismatch(msg):
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # Start a transaction to roll back any changes on exceptions.
        with self.butler.transaction():
            for taskDef in graph.taskGraph:
                configName = taskDef.configDatasetName

                oldConfig = None
                if self.extendRun:
                    try:
                        oldConfig = self.butler.get(configName, {}, collections=[self.butler.run])
                        if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                            raise TypeError(
                                f"Config does not match existing task config {configName!r} in butler; "
                                "task configurations must be consistent within the same run collection"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means an execution butler,
                        # where refs do exist but datastore artifacts do not.
                        pass
                if oldConfig is None:
                    # Butler will raise an exception if the dataset is
                    # already there.
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
                    self.butler.put(taskDef.config, configName, {})

    def savePackageVersions(self, graph):
        """Write versions of software packages to the butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in the
            butler is different from the new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)
        datasetType = PipelineDatasetTypes.packagesDatasetName
        dataId = {}
        oldPackages = None
        # Start a transaction to roll back any changes on exceptions.
        with self.butler.transaction():
            if self.extendRun:
                try:
                    oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run])
                    _LOG.debug("old packages: %s", oldPackages)
                except (LookupError, FileNotFoundError):
                    # FileNotFoundError likely means an execution butler,
                    # where refs do exist but datastore artifacts do not.
                    pass
            if oldPackages is not None:
                # Note that because we can only detect python modules that
                # have been imported, the stored list of products may be more
                # or less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                diff = packages.difference(oldPackages)
                if diff:
                    versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
                    raise TypeError(f"Package versions mismatch: ({versions_str})")
                else:
                    _LOG.debug("new packages are consistent with old")
                # Update the old set of packages in case we have more
                # packages that haven't been persisted.
                extra = packages.extra(oldPackages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    oldPackages.update(packages)
                    # Have to remove the existing dataset first; butler has
                    # no replace option.
                    ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run])
                    self.butler.pruneDatasets([ref], unstore=True, purge=True)
                    self.butler.put(oldPackages, datasetType, dataId)
            else:
                self.butler.put(packages, datasetType, dataId)