Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 16%
155 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-09 02:48 -0700
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ["PreExecInit"]
26# -------------------------------
27# Imports of standard modules --
28# -------------------------------
29import abc
30import logging
31from collections.abc import Iterable, Iterator
32from contextlib import contextmanager
33from typing import TYPE_CHECKING, Any
35# -----------------------------
36# Imports for other modules --
37# -----------------------------
38from lsst.daf.butler import DatasetRef, DatasetType
39from lsst.daf.butler.registry import ConflictingDefinitionError
40from lsst.pipe.base import PipelineDatasetTypes
41from lsst.utils.packages import Packages
43if TYPE_CHECKING:
44 from lsst.daf.butler import Butler, LimitedButler
45 from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory
47_LOG = logging.getLogger(__name__)
class MissingReferenceError(Exception):
    """Raised when a resolved dataset reference is missing from the graph."""
56def _compare_packages(old_packages: Packages, new_packages: Packages) -> None:
57 """Compare two versions of Packages.
59 Parameters
60 ----------
61 old_packages : `Packages`
62 Previously recorded package versions.
63 new_packages : `Packages`
64 New set of package versions.
66 Raises
67 ------
68 TypeError
69 Raised if parameters are inconsistent.
70 """
71 diff = new_packages.difference(old_packages)
72 if diff:
73 versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
74 raise TypeError(f"Package versions mismatch: ({versions_str})")
75 else:
76 _LOG.debug("new packages are consistent with old")
class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on Butler type.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Butler instance used to read and write datasets.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Factory used to construct task instances.
    extendRun : `bool`
        If `True`, datasets that already exist in the output run are compared
        against new data instead of being treated as conflicts.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory, extendRun: bool):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # Register dataset types or check consistency of existing ones.
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of existing object in butler is different from
            new data.
        ValueError
            Raised if the graph has no dataset reference for a task's init
            output.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in self._task_iter(graph):
            init_input_refs = graph.initInputRefs(taskDef) or []
            task = self.taskFactory.makeTask(taskDef, self.butler, init_input_refs)
            # Output refs are stored per task, not per connection, so look
            # them up once outside the connection loop.
            init_output_refs = graph.initOutputRefs(taskDef) or []
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                init_output_ref, obj_from_store = self._find_dataset(init_output_refs, attribute.name)
                if init_output_ref is None:
                    raise ValueError(f"Cannot find dataset reference for init output {name} in a graph")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    # _find_dataset already retrieved the stored object; no
                    # need for a second butler.get round-trip.
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    # Types are supposed to be identical.
                    # TODO: Check that object contents is identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            "is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.put(init_output_var, init_output_ref)

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exists.
            Content of a butler collection should not be changed if exception
            is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # Start transaction to rollback any changes on exceptions.
        with self.transaction():
            for taskDef in self._task_iter(graph):
                # Config dataset ref is stored in task init outputs, but it
                # may be also be missing.
                task_output_refs = graph.initOutputRefs(taskDef)
                if task_output_refs is None:
                    continue

                config_ref, old_config = self._find_dataset(task_output_refs, taskDef.configDatasetName)
                if config_ref is None:
                    continue

                if old_config is not None:
                    # Compare existing config against the new one; mismatch
                    # details go to the log via logConfigMismatch.
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {taskDef.configDatasetName!r} in "
                            "butler; tasks configurations must be consistent within the same run collection"
                        )
                else:
                    _LOG.debug(
                        "Saving Config for task=%s dataset type=%s", taskDef.label, taskDef.configDatasetName
                    )
                    self.butler.put(taskDef.config, config_ref)

    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is incompatible with new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # Start transaction to rollback any changes on exceptions.
        with self.transaction():
            # Packages dataset ref is stored in graph's global init outputs,
            # but it may be also be missing.
            packages_ref, old_packages = self._find_dataset(
                graph.globalInitOutputRefs(), PipelineDatasetTypes.packagesDatasetName
            )
            if packages_ref is None:
                return

            if old_packages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # Have to remove existing dataset first, butler has no
                    # replace option.
                    self.butler.pruneDatasets([packages_ref], unstore=True, purge=True)
                    self.butler.put(old_packages, packages_ref)
            else:
                self.butler.put(packages, packages_ref)

    def _find_dataset(
        self, refs: Iterable[DatasetRef], dataset_type: str
    ) -> tuple[DatasetRef | None, Any | None]:
        """Find a ref with a given dataset type name in a list of references
        and try to retrieve its data from butler.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to check for matching dataset type.
        dataset_type : `str`
            Name of a dataset type to look for.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Dataset reference or `None` if there is no matching dataset type.
        data : `Any`
            An existing object extracted from butler, `None` if ``ref`` is
            `None` or if there is no existing object for that reference.

        Raises
        ------
        ConflictingDefinitionError
            Raised if the dataset exists in butler but ``extendRun`` is
            `False`.
        """
        ref: DatasetRef | None = None
        for ref in refs:
            if ref.datasetType.name == dataset_type:
                break
        else:
            # No reference with a matching dataset type name.
            return None, None

        try:
            data = self.butler.get(ref)
            if data is not None and not self.extendRun:
                # It must not exist unless we are extending run.
                raise ConflictingDefinitionError(f"Dataset {ref} already exists in butler")
        except (LookupError, FileNotFoundError):
            data = None
        return ref, data

    def _task_iter(self, graph: QuantumGraph) -> Iterator[TaskDef]:
        """Iterate over TaskDefs in a graph, return only tasks that have one or
        more associated quanta.
        """
        for taskDef in graph.iterTaskGraph():
            if graph.getNumberOfQuantaForTask(taskDef) > 0:
                yield taskDef

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        Default implementation has no transaction support.
        """
        yield
class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.

    Raises
    ------
    RuntimeError
        Raised if ``extendRun`` is `True` but the butler has no default
        output RUN collection.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False):
        super().__init__(butler, taskFactory, extendRun)
        # Keep a separately-named reference to the full Butler; the base
        # class attribute is typed as LimitedButler.
        self.full_butler = butler
        if self.extendRun and self.full_butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        pipeline = graph.taskGraph
        # Collect all dataset types defined by the pipeline, including
        # per-task config datasets and the packages dataset.
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.full_butler.registry, include_configs=True, include_packages=True
        )

        # Intermediates are both produced and consumed by tasks, so they must
        # be storage-class compatible in both directions (is_input=True);
        # pure outputs only need to be compatible on put.
        for datasetTypes, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, datasetTypes, is_input)

    def _register_output_dataset_types(
        self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool
    ) -> None:
        """Register output dataset types with registry, or check that they
        match already-registered definitions.

        Parameters
        ----------
        registerDatasetTypes : `bool`
            If ``True`` register missing dataset types, otherwise require
            that they are already registered.
        datasetTypes : `~collections.abc.Iterable` [ `DatasetType` ]
            Dataset types to register or check.
        is_input : `bool`
            If ``True`` these dataset types are also read by some task, so
            storage-class compatibility must hold in both directions.

        Raises
        ------
        ValueError
            Raised if an existing dataset type definition differs
            incompatibly from the one defined by the pipeline.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and one or more
            dataset types are not registered.
        """

        def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool:
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.full_butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    # Inconsistent definition may still be storage-class
                    # compatible; re-raise only if it is not.
                    if not _check_compatibility(
                        datasetType, self.full_butler.registry.getDatasetType(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.full_butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types is forgotten.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "passing `--register-dataset-types` option to `pipetask run`."
            )
class PreExecInitLimited(PreExecInitBase):
    """Pre-execution initialization for QuantumGraph execution with a
    `LimitedButler`.

    This class works with LimitedButler and expects that all references in
    QuantumGraph are resolved.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        # A limited butler cannot extend an existing run, so extendRun is
        # always disabled.
        super().__init__(butler, taskFactory, False)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        # With LimitedButler there is no registry, so dataset types are
        # neither created nor checked.
        pass