Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 18%
159 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-03 10:43 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-03 10:43 +0000
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ["PreExecInit"]
32# -------------------------------
33# Imports of standard modules --
34# -------------------------------
35import abc
36import logging
37from collections.abc import Iterable, Iterator
38from contextlib import contextmanager
39from typing import TYPE_CHECKING, Any
41# -----------------------------
42# Imports for other modules --
43# -----------------------------
44from lsst.daf.butler import DatasetRef, DatasetType
45from lsst.daf.butler.registry import ConflictingDefinitionError
46from lsst.pipe.base import PipelineDatasetTypes
47from lsst.pipe.base import automatic_connection_constants as acc
48from lsst.utils.packages import Packages
50if TYPE_CHECKING:
51 from lsst.daf.butler import Butler, LimitedButler
52 from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory
54_LOG = logging.getLogger(__name__)
class MissingReferenceError(Exception):
    """Exception raised when a resolved reference is missing from graph."""
63def _compare_packages(old_packages: Packages, new_packages: Packages) -> None:
64 """Compare two versions of Packages.
66 Parameters
67 ----------
68 old_packages : `Packages`
69 Previously recorded package versions.
70 new_packages : `Packages`
71 New set of package versions.
73 Raises
74 ------
75 TypeError
76 Raised if parameters are inconsistent.
77 """
78 diff = new_packages.difference(old_packages)
79 if diff:
80 versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
81 raise TypeError(f"Package versions mismatch: ({versions_str})")
82 else:
83 _LOG.debug("new packages are consistent with old")
class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on Butler type.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Butler instance used for all dataset reads and writes.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Factory used to construct task instances when saving init outputs.
    extendRun : `bool`
        If `True`, datasets that already exist in the output run are compared
        against new ones instead of being treated as conflicts.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory, extendRun: bool):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # register dataset types or check consistency
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of existing object in butler is different from
            new data.
        ValueError
            Raised if the graph has no dataset reference for an init output
            declared by a task.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in self._task_iter(graph):
            init_input_refs = graph.initInputRefs(taskDef) or []
            task = self.taskFactory.makeTask(taskDef, self.butler, init_input_refs)
            # The init-output refs are the same for every connection of this
            # task, so fetch them once outside the connection loop.
            init_output_refs = graph.initOutputRefs(taskDef) or []
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                init_output_ref, obj_from_store = self._find_dataset(init_output_refs, attribute.name)
                if init_output_ref is None:
                    raise ValueError(f"Cannot find dataset reference for init output {name} in a graph")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    # ``_find_dataset`` already retrieved the stored object,
                    # so there is no need to read it from butler again.
                    # Types are supposed to be identical.
                    # TODO: Check that object contents is identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            "is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.put(init_output_var, init_output_ref)

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is different from new data.
        Exception
            Raised if ``extendRun`` is `False` and a dataset already exists.
            Content of a butler collection should not be changed if exception
            is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # start transaction to rollback any changes on exceptions
        with self.transaction():
            for taskDef in self._task_iter(graph):
                # Config dataset ref is stored in task init outputs, but it
                # may also be missing.
                task_output_refs = graph.initOutputRefs(taskDef)
                if task_output_refs is None:
                    continue

                config_ref, old_config = self._find_dataset(task_output_refs, taskDef.configDatasetName)
                if config_ref is None:
                    continue

                if old_config is not None:
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {taskDef.configDatasetName!r} in "
                            "butler; tasks configurations must be consistent within the same run collection"
                        )
                else:
                    _LOG.debug(
                        "Saving Config for task=%s dataset type=%s", taskDef.label, taskDef.configDatasetName
                    )
                    self.butler.put(taskDef.config, config_ref)

    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is incompatible with new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # start transaction to rollback any changes on exceptions
        with self.transaction():
            # Packages dataset ref is stored in graph's global init outputs,
            # but it may also be missing.
            packages_ref, old_packages = self._find_dataset(
                graph.globalInitOutputRefs(), PipelineDatasetTypes.packagesDatasetName
            )
            if packages_ref is None:
                return

            if old_packages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # have to remove existing dataset first, butler has no
                    # replace option.
                    self.butler.pruneDatasets([packages_ref], unstore=True, purge=True)
                    self.butler.put(old_packages, packages_ref)
            else:
                self.butler.put(packages, packages_ref)

    def _find_dataset(
        self, refs: Iterable[DatasetRef], dataset_type: str
    ) -> tuple[DatasetRef | None, Any | None]:
        """Find a ref with a given dataset type name in a list of references
        and try to retrieve its data from butler.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `~lsst.daf.butler.DatasetRef` ]
            References to check for matching dataset type.
        dataset_type : `str`
            Name of a dataset type to look for.

        Returns
        -------
        ref : `~lsst.daf.butler.DatasetRef` or `None`
            Dataset reference or `None` if there is no matching dataset type.
        data : `Any`
            An existing object extracted from butler, `None` if ``ref`` is
            `None` or if there is no existing object for that reference.

        Raises
        ------
        ConflictingDefinitionError
            Raised if the dataset exists but ``extendRun`` is `False`.
        """
        ref: DatasetRef | None = None
        for ref in refs:
            if ref.datasetType.name == dataset_type:
                break
        else:
            # Loop finished without a match (including the empty-refs case).
            return None, None

        try:
            data = self.butler.get(ref)
            if data is not None and not self.extendRun:
                # It must not exist unless we are extending run.
                raise ConflictingDefinitionError(f"Dataset {ref} already exists in butler")
        except (LookupError, FileNotFoundError):
            data = None
        return ref, data

    def _task_iter(self, graph: QuantumGraph) -> Iterator[TaskDef]:
        """Iterate over TaskDefs in a graph, return only tasks that have one or
        more associated quanta.
        """
        for taskDef in graph.iterTaskGraph():
            if graph.getNumberOfQuantaForTask(taskDef) > 0:
                yield taskDef

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        Default implementation has no transaction support.
        """
        yield
class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.

    Raises
    ------
    RuntimeError
        Raised if ``extendRun`` is `True` but the butler has no default output
        RUN collection.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False):
        super().__init__(butler, taskFactory, extendRun)
        # Keep a full-Butler reference (the base class only stores it as a
        # LimitedButler) for registry/transaction access.
        self.full_butler = butler
        if self.extendRun and self.full_butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.full_butler.registry, include_configs=True, include_packages=True
        )
        # The "registry dataset types" saved with the QG have had their storage
        # classes carefully resolved by PipelineGraph, whereas the dataset
        # types from PipelineDatasetTypes are a mess because it uses
        # NamedValueSet and that ignores storage classes. It will be fully
        # removed here (and deprecated everywhere) on DM-40441.
        # Note that these "registry dataset types" include dataset types that
        # are not actually registered yet; they're the PipelineGraph's
        # determination of what _should_ be registered.
        registry_storage_classes = {
            dataset_type.name: dataset_type.storageClass_name for dataset_type in graph.registryDatasetTypes()
        }
        registry_storage_classes[acc.PACKAGES_INIT_OUTPUT_NAME] = acc.PACKAGES_INIT_OUTPUT_STORAGE_CLASS
        dataset_types: Iterable[DatasetType]
        # Each category is processed separately; "is_input" marks categories
        # that are also consumed by tasks and hence must be readable.
        for dataset_types, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            dataset_types = [
                (
                    # The registry dataset types do not include components, but
                    # we don't support storage class overrides for those in
                    # other contexts anyway, and custom-built QGs may not have
                    # the registry dataset types field populated at all.
                    dataset_type.overrideStorageClass(registry_storage_classes[dataset_type.name])
                    if dataset_type.name in registry_storage_classes
                    else dataset_type
                )
                for dataset_type in dataset_types
            ]
            self._register_output_dataset_types(registerDatasetTypes, dataset_types, is_input)

    def _register_output_dataset_types(
        self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool
    ) -> None:
        """Register dataset types in registry or check that they are already
        registered consistently with the graph's definitions.

        Parameters
        ----------
        registerDatasetTypes : `bool`
            If `True` register missing dataset types, otherwise only check
            existing registry definitions.
        datasetTypes : `~collections.abc.Iterable` \
                [ `~lsst.daf.butler.DatasetType` ]
            Dataset types to register or check.
        is_input : `bool`
            If `True` these dataset types are also read by tasks, so storage
            class compatibility must hold in both directions.
        """

        def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool:
            """Return `True` if the two definitions have compatible storage
            classes for the way this dataset type is used.
            """
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.full_butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    if not _check_compatibility(
                        datasetType, self.full_butler.get_dataset_type(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.full_butler.get_dataset_type(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types is forgotten.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "passing `--register-dataset-types` option to `pipetask run`."
            )
class PreExecInitLimited(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class works with LimitedButler and expects that all references in
    QuantumGraph are resolved.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        # extendRun is never supported with a limited butler.
        super().__init__(butler, taskFactory, False)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        # With LimitedButler we never create or check dataset types.
        return