Coverage report for python/lsst/ctrl/mpexec/preExecInit.py : 10% line coverage

Hot-keys on this page:
  r m x p   toggle line displays
  j k       next/prev highlighted chunk
  0 (zero)  top of page
  1 (one)   first highlighted chunk
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22__all__ = ['PreExecInit']
24# -------------------------------
25# Imports of standard modules --
26# -------------------------------
27import logging
28import itertools
30# -----------------------------
31# Imports for other modules --
32# -----------------------------
33from lsst.base import Packages
34from lsst.daf.butler import DatasetType
35from lsst.pipe.base import PipelineDatasetTypes
37_LOG = logging.getLogger(__name__.partition(".")[2])
40class PreExecInit:
41 """Initialization of registry for QuantumGraph execution.
43 This class encapsulates all necessary operations that have to be performed
44 on butler and registry to prepare them for QuantumGraph execution.
46 Parameters
47 ----------
48 butler : `~lsst.daf.butler.Butler`
49 Data butler instance.
50 taskFactory : `~lsst.pipe.base.TaskFactory`
51 Task factory.
52 skipExisting : `bool`, optional
53 If `True` then do not try to overwrite any datasets that might exist
54 in ``butler.run``; instead compare them when appropriate/possible. If
55 `False`, then any existing conflicting dataset will cause a butler
56 exception to be raised.
57 """
58 def __init__(self, butler, taskFactory, skipExisting=False):
59 self.butler = butler
60 self.taskFactory = taskFactory
61 self.skipExisting = skipExisting
62 if self.skipExisting and self.butler.run is None:
63 raise RuntimeError(
64 "Cannot perform skipExisting logic unless butler is initialized "
65 "with a default output RUN collection."
66 )
68 def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
69 """Perform all initialization steps.
71 Convenience method to execute all initialization steps. Instead of
72 calling this method and providing all options it is also possible to
73 call methods individually.
75 Parameters
76 ----------
77 graph : `~lsst.pipe.base.QuantumGraph`
78 Execution graph.
79 saveInitOutputs : `bool`, optional
80 If ``True`` (default) then save "init outputs", configurations,
81 and package versions to butler.
82 registerDatasetTypes : `bool`, optional
83 If ``True`` then register dataset types in registry, otherwise
84 they must be already registered.
85 saveVersions : `bool`, optional
86 If ``False`` then do not save package versions even if
87 ``saveInitOutputs`` is set to ``True``.
88 """
89 # register dataset types or check consistency
90 self.initializeDatasetTypes(graph, registerDatasetTypes)
92 # Save task initialization data or check that saved data
93 # is consistent with what tasks would save
94 if saveInitOutputs:
95 self.saveInitOutputs(graph)
96 self.saveConfigs(graph)
97 if saveVersions:
98 self.savePackageVersions(graph)
100 def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
101 """Save or check DatasetTypes output by the tasks in a graph.
103 Iterates over all DatasetTypes for all tasks in a graph and either
104 tries to add them to registry or compares them to exising ones.
106 Parameters
107 ----------
108 graph : `~lsst.pipe.base.QuantumGraph`
109 Execution graph.
110 registerDatasetTypes : `bool`, optional
111 If ``True`` then register dataset types in registry, otherwise
112 they must be already registered.
114 Raises
115 ------
116 ValueError
117 Raised if existing DatasetType is different from DatasetType
118 in a graph.
119 KeyError
120 Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
121 does not exist in registry.
122 """
123 pipeline = graph.taskGraph
125 # Make dataset types for configurations
126 configDatasetTypes = [DatasetType(taskDef.configDatasetName, {},
127 storageClass="Config",
128 universe=self.butler.registry.dimensions)
129 for taskDef in pipeline]
131 # And one dataset type for package versions
132 packagesDatasetType = DatasetType("packages", {},
133 storageClass="Packages",
134 universe=self.butler.registry.dimensions)
136 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=self.butler.registry)
137 for datasetType in itertools.chain(datasetTypes.initIntermediates, datasetTypes.initOutputs,
138 datasetTypes.intermediates, datasetTypes.outputs,
139 configDatasetTypes, [packagesDatasetType]):
140 # Only composites are registered, no components, and by this point
141 # the composite should already exist.
142 if registerDatasetTypes and not datasetType.isComponent():
143 _LOG.debug("Registering DatasetType %s with registry", datasetType)
144 # this is a no-op if it already exists and is consistent,
145 # and it raises if it is inconsistent.
146 self.butler.registry.registerDatasetType(datasetType)
147 else:
148 _LOG.debug("Checking DatasetType %s against registry", datasetType)
149 expected = self.butler.registry.getDatasetType(datasetType.name)
150 if datasetType.isComponent() \
151 and datasetType.parentStorageClass == DatasetType.PlaceholderParentStorageClass:
152 # Force the parent storage classes to match since we
153 # are using a placeholder
154 datasetType.finalizeParentStorageClass(expected.parentStorageClass)
155 if expected != datasetType:
156 raise ValueError(f"DatasetType configuration does not match Registry: "
157 f"{datasetType} != {expected}")
159 def saveInitOutputs(self, graph):
160 """Write any datasets produced by initializing tasks in a graph.
162 Parameters
163 ----------
164 graph : `~lsst.pipe.base.QuantumGraph`
165 Execution graph.
167 Raises
168 ------
169 Exception
170 Raised if ``skipExisting`` is `False` and datasets already
171 exists. Content of a butler collection may be changed if
172 exception is raised.
174 Notes
175 -----
176 If ``skipExisting`` is `True` then existing datasets are not
177 overwritten, instead we should check that their stored object is
178 exactly the same as what we would save at this time. Comparing
179 arbitrary types of object is, of course, non-trivial. Current
180 implementation only checks the existence of the datasets and their
181 types against the types of objects produced by tasks. Ideally we
182 would like to check that object data is identical too but presently
183 there is no generic way to compare objects. In the future we can
184 potentially introduce some extensible mechanism for that.
185 """
186 _LOG.debug("Will save InitOutputs for all tasks")
187 for taskDef in graph.iterTaskGraph():
188 task = self.taskFactory.makeTask(taskDef.taskClass, taskDef.config, None, self.butler)
189 for name in taskDef.connections.initOutputs:
190 attribute = getattr(taskDef.connections, name)
191 initOutputVar = getattr(task, name)
192 objFromStore = None
193 if self.skipExisting:
194 # check if it is there already
195 _LOG.debug("Retrieving InitOutputs for task=%s key=%s dsTypeName=%s",
196 task, name, attribute.name)
197 try:
198 objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run])
199 # Types are supposed to be identical.
200 # TODO: Check that object contents is identical too.
201 if type(objFromStore) is not type(initOutputVar):
202 raise TypeError(f"Stored initOutput object type {type(objFromStore)} "
203 f"is different from task-generated type "
204 f"{type(initOutputVar)} for task {taskDef}")
205 except LookupError:
206 pass
207 if objFromStore is None:
208 # butler will raise exception if dataset is already there
209 _LOG.debug("Saving InitOutputs for task=%s key=%s", task, name)
210 self.butler.put(initOutputVar, attribute.name, {})
212 def saveConfigs(self, graph):
213 """Write configurations for pipeline tasks to butler or check that
214 existing configurations are equal to the new ones.
216 Parameters
217 ----------
218 graph : `~lsst.pipe.base.QuantumGraph`
219 Execution graph.
221 Raises
222 ------
223 Exception
224 Raised if ``skipExisting`` is `False` and datasets already exists.
225 Content of a butler collection should not be changed if exception
226 is raised.
227 """
228 def logConfigMismatch(msg):
229 """Log messages about configuration mismatch.
230 """
231 _LOG.fatal("Comparing configuration: %s", msg)
233 _LOG.debug("Will save Configs for all tasks")
234 # start transaction to rollback any changes on exceptions
235 with self.butler.transaction():
236 for taskDef in graph.taskGraph:
237 configName = taskDef.configDatasetName
239 oldConfig = None
240 if self.skipExisting:
241 try:
242 oldConfig = self.butler.get(configName, {}, collections=[self.butler.run])
243 if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
244 raise TypeError(
245 f"Config does not match existing task config {configName!r} in butler; "
246 "tasks configurations must be consistent within the same run collection")
247 except LookupError:
248 pass
249 if oldConfig is None:
250 # butler will raise exception if dataset is already there
251 _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
252 self.butler.put(taskDef.config, configName, {})
254 def savePackageVersions(self, graph):
255 """Write versions of software packages to butler.
257 Parameters
258 ----------
259 graph : `~lsst.pipe.base.QuantumGraph`
260 Execution graph.
262 Raises
263 ------
264 Exception
265 Raised if ``checkExisting`` is ``True`` but versions are not
266 compatible.
267 """
268 packages = Packages.fromSystem()
269 _LOG.debug("want to save packages: %s", packages)
270 datasetType = "packages"
271 dataId = {}
272 oldPackages = None
273 # start transaction to rollback any changes on exceptions
274 with self.butler.transaction():
275 if self.skipExisting:
276 try:
277 oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run])
278 _LOG.debug("old packages: %s", oldPackages)
279 except LookupError:
280 pass
281 if oldPackages is not None:
282 # Note that because we can only detect python modules that have been imported, the stored
283 # list of products may be more or less complete than what we have now. What's important is
284 # that the products that are in common have the same version.
285 diff = packages.difference(oldPackages)
286 if diff:
287 versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
288 raise TypeError(f"Package versions mismatch: ({versions_str})")
289 else:
290 _LOG.debug("new packages are consistent with old")
291 # Update the old set of packages in case we have more packages that haven't been persisted.
292 extra = packages.extra(oldPackages)
293 if extra:
294 _LOG.debug("extra packages: %s", extra)
295 oldPackages.update(packages)
296 # have to remove existing dataset first, butler nas no replace option
297 ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run])
298 self.butler.pruneDatasets([ref], unstore=True, purge=True)
299 self.butler.put(oldPackages, datasetType, dataId)
300 else:
301 self.butler.put(packages, datasetType, dataId)