Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 9%
# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ["PreExecInit"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.butler import DatasetType
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.pipe.base import PipelineDatasetTypes
from lsst.utils.packages import Packages

from .mock_task import MockButlerQuantumContext

_LOG = logging.getLogger(__name__)


class PreExecInit:
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    mock : `bool`, optional
        If `True` then also do initialization needed for pipeline mocking.
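
    Examples
    --------
    A minimal sketch of typical use; ``butler``, ``taskFactory`` and
    ``graph`` (a `~lsst.pipe.base.QuantumGraph`) are assumed to be
    constructed elsewhere::

        preExecInit = PreExecInit(butler, taskFactory)
        preExecInit.initialize(graph, registerDatasetTypes=True)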
61 """

    def __init__(self, butler, taskFactory, extendRun=False, mock=False):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun
        self.mock = mock
        if self.extendRun and self.butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    def initialize(self, graph, saveInitOutputs=True, registerDatasetTypes=False, saveVersions=True):
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
94 """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    def initializeDatasetTypes(self, graph, registerDatasetTypes=False):
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType is different from the
            DatasetType in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and a DatasetType
            does not exist in registry.
        """
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.butler.registry, include_configs=True, include_packages=True
        )
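
        # Intermediates are written by one task and read by another, so their
        # storage classes must be compatible in both directions (is_input is
        # True); pure outputs only need to be compatible on "put".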
        for datasetTypes, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, datasetTypes, is_input)

        if self.mock:
            # register special mock data types, skip logs and metadata
            skipDatasetTypes = {taskDef.metadataDatasetName for taskDef in pipeline}
            skipDatasetTypes |= {taskDef.logOutputDatasetName for taskDef in pipeline}
            for datasetTypes, is_input in (
                (pipelineDatasetTypes.intermediates, True),
                (pipelineDatasetTypes.outputs, False),
            ):
                mockDatasetTypes = []
                for datasetType in datasetTypes:
                    if not (datasetType.name in skipDatasetTypes or datasetType.isComponent()):
                        mockDatasetTypes.append(
                            DatasetType(
                                MockButlerQuantumContext.mockDatasetTypeName(datasetType.name),
                                datasetType.dimensions,
                                "StructuredDataDict",
                            )
                        )
                if mockDatasetTypes:
                    self._register_output_dataset_types(registerDatasetTypes, mockDatasetTypes, is_input)

    def _register_output_dataset_types(self, registerDatasetTypes, datasetTypes, is_input):
        def _check_compatibility(datasetType, expected, is_input) -> bool:
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible
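
        # For each dataset type, either register it (tolerating an existing
        # definition whose storage class is compatible) or check it against
        # the definition already present in the registry.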
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    if not _check_compatibility(
                        datasetType, self.butler.registry.getDatasetType(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types was forgotten.
                    raise KeyError(
                        f"Dataset type with name '{datasetType.name}' not found. Dataset types "
                        "have to be registered with either `butler register-dataset-type` or "
                        "passing `--register-dataset-types` option to `pipetask run`."
                    ) from None
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

    def saveInitOutputs(self, graph):
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but the type of an existing
            object in butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            The content of a butler collection may be changed if this
            exception is raised.

        Notes
        -----
        If ``extendRun`` is `True` then existing datasets are not
        overwritten; instead we check that their stored object is
        exactly the same as what we would save at this time. Comparing
        arbitrary types of object is, of course, non-trivial. The current
        implementation only checks the existence of the datasets and their
        types against the types of objects produced by tasks. Ideally we
        would like to check that object data is identical too, but presently
        there is no generic way to compare objects. In the future we may
        introduce some extensible mechanism for that.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            task = self.taskFactory.makeTask(
                taskDef.taskClass, taskDef.label, taskDef.config, None, self.butler
            )
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                initOutputVar = getattr(task, name)
                objFromStore = None
                if self.extendRun:
                    # check if it is there already
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    try:
                        objFromStore = self.butler.get(attribute.name, {}, collections=[self.butler.run])
                        # Types are supposed to be identical.
                        # TODO: Check that object contents are identical too.
                        if type(objFromStore) is not type(initOutputVar):
                            raise TypeError(
                                f"Stored initOutput object type {type(objFromStore)} "
                                f"is different from task-generated type "
                                f"{type(initOutputVar)} for task {taskDef}"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if objFromStore is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    self.butler.put(initOutputVar, attribute.name, {})

    def saveConfigs(self, graph):
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in
            butler is different from the new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exist.
            The content of a butler collection should not be changed if this
            exception is raised.
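
        Examples
        --------
        A sketch of extending an existing output RUN collection; ``butler``,
        ``taskFactory`` and ``graph`` are assumed to exist::

            preExecInit = PreExecInit(butler, taskFactory, extendRun=True)
            # Compares stored configs with the new ones instead of writing;
            # raises TypeError on any mismatch.
            preExecInit.saveConfigs(graph)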
294 """

        def logConfigMismatch(msg):
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            for taskDef in graph.taskGraph:
                configName = taskDef.configDatasetName

                oldConfig = None
                if self.extendRun:
                    try:
                        oldConfig = self.butler.get(configName, {}, collections=[self.butler.run])
                        if not taskDef.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                            raise TypeError(
                                f"Config does not match existing task config {configName!r} in butler; "
                                "task configurations must be consistent within the same run collection"
                            )
                    except (LookupError, FileNotFoundError):
                        # FileNotFoundError likely means execution butler
                        # where refs do exist but datastore artifacts do not.
                        pass
                if oldConfig is None:
                    # butler will raise exception if dataset is already there
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, configName)
                    self.butler.put(taskDef.config, configName, {})

    def savePackageVersions(self, graph):
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if ``extendRun`` is `True` but an existing object in
            butler is different from the new data.
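
        Examples
        --------
        A sketch of the consistency rule applied below; ``oldPackages`` is
        assumed to be a previously stored `~lsst.utils.packages.Packages`
        instance::

            packages = Packages.fromSystem()
            # Packages present in both sets must agree on version.
            if packages.difference(oldPackages):
                ...  # version mismatch, raise TypeError
            # Packages seen now but not stored before get merged in.
            if packages.extra(oldPackages):
                oldPackages.update(packages)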
337 """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)
        datasetType = PipelineDatasetTypes.packagesDatasetName
        dataId = {}
        oldPackages = None
        # start transaction to rollback any changes on exceptions
        with self.butler.transaction():
            if self.extendRun:
                try:
                    oldPackages = self.butler.get(datasetType, dataId, collections=[self.butler.run])
                    _LOG.debug("old packages: %s", oldPackages)
                except (LookupError, FileNotFoundError):
                    # FileNotFoundError likely means execution butler where
                    # refs do exist but datastore artifacts do not.
                    pass
            if oldPackages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                diff = packages.difference(oldPackages)
                if diff:
                    versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
                    raise TypeError(f"Package versions mismatch: ({versions_str})")
                else:
                    _LOG.debug("new packages are consistent with old")
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(oldPackages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    oldPackages.update(packages)
                    # have to remove existing dataset first, butler has no
                    # replace option.
                    ref = self.butler.registry.findDataset(datasetType, dataId, collections=[self.butler.run])
                    self.butler.pruneDatasets([ref], unstore=True, purge=True)
                    self.butler.put(oldPackages, datasetType, dataId)
            else:
                self.butler.put(packages, datasetType, dataId)