Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 15% (171 statements)
# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["PreExecInit"]

# -------------------------------
# Imports of standard modules --
# -------------------------------
import abc
import logging
from collections.abc import Iterable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.butler import DatasetRef, DatasetType
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.pipe.base import PipelineDatasetTypes
from lsst.utils.packages import Packages

from .mock_task import MockButlerQuantumContext

if TYPE_CHECKING:
    from lsst.daf.butler import Butler, LimitedButler
    from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory

_LOG = logging.getLogger(__name__)


class MissingReferenceError(Exception):
    """Exception raised when a resolved reference is missing from the graph."""

    pass


def _compare_packages(old_packages: Packages, new_packages: Packages) -> None:
    """Compare two versions of Packages.

    Parameters
    ----------
    old_packages : `Packages`
        Previously recorded package versions.
    new_packages : `Packages`
        New set of package versions.

    Raises
    ------
    TypeError
        Raised if the versions of any packages in common differ.
    """
    diff = new_packages.difference(old_packages)
    if diff:
        versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
        raise TypeError(f"Package versions mismatch: ({versions_str})")
    else:
        _LOG.debug("new packages are consistent with old")
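
# A minimal illustration of the check above (a hedged sketch, not part of the
# module): `Packages.difference()` reports only packages whose versions
# disagree, so two snapshots that merely cover different packages compare
# clean. This assumes `Packages` can be built from a mapping of package names
# to version strings:
#
#     old = Packages({"numpy": "1.24.0"})
#     new = Packages({"numpy": "1.24.0", "astropy": "5.2"})
#     _compare_packages(old, new)  # OK: the package in common matches
#
#     new = Packages({"numpy": "1.25.0"})
#     _compare_packages(old, new)  # raises TypeError: versions mismatch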


class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on the Butler type.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory, extendRun: bool):
        self.butler = butler
        self.taskFactory = taskFactory
        self.extendRun = extendRun

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options, it is also possible to
        call the individual methods.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # Register dataset types or check consistency.
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)
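
    # Illustrative call sequence (a hedged sketch, not executed here): given
    # a concrete subclass such as `PreExecInit` below and an already-loaded
    # `QuantumGraph` named `qgraph` (an assumed variable), the one-shot entry
    # point is equivalent to calling the individual steps in order:
    #
    #     pre_exec = PreExecInit(butler, task_factory, extendRun=False)
    #     pre_exec.initialize(qgraph, registerDatasetTypes=True)
    #     # ... which performs:
    #     #   pre_exec.initializeDatasetTypes(qgraph, True)
    #     #   pre_exec.saveInitOutputs(qgraph)
    #     #   pre_exec.saveConfigs(qgraph)
    #     #   pre_exec.savePackageVersions(qgraph)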

    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must already be registered.

        Raises
        ------
        ValueError
            Raised if an existing DatasetType differs from the DatasetType
            in the graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and a DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of an existing object in butler is different
            from the type of the new data.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in self._task_iter(graph):
            init_input_refs = graph.initInputRefs(taskDef) or []
            task = self.taskFactory.makeTask(taskDef, self.butler, init_input_refs)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                init_output_refs = graph.initOutputRefs(taskDef) or []
                init_output_ref, obj_from_store = self._find_dataset(init_output_refs, attribute.name)
                if init_output_ref is None:
                    raise ValueError(f"Cannot find dataset reference for init output {name} in a graph")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    obj_from_store = self.butler.get(init_output_ref)
                    # Types are supposed to be identical.
                    # TODO: Check that object contents are identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            "is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.put(init_output_var, init_output_ref)

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if an existing object in butler is different from the new
            data.
        Exception
            Raised if ``extendRun`` is `False` and a dataset already exists.
            The content of a butler collection should not be changed if an
            exception is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # Start transaction to roll back any changes on exceptions.
        with self.transaction():
            for taskDef in self._task_iter(graph):
                # Config dataset ref is stored in task init outputs, but it
                # may also be missing.
                task_output_refs = graph.initOutputRefs(taskDef)
                if task_output_refs is None:
                    continue

                config_ref, old_config = self._find_dataset(task_output_refs, taskDef.configDatasetName)
                if config_ref is None:
                    continue

                if old_config is not None:
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {taskDef.configDatasetName!r} in "
                            "butler; task configurations must be consistent within the same run collection"
                        )
                else:
                    _LOG.debug(
                        "Saving Config for task=%s dataset type=%s", taskDef.label, taskDef.configDatasetName
                    )
                    self.butler.put(taskDef.config, config_ref)

    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if an existing object in butler is incompatible with the
            new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # Start transaction to roll back any changes on exceptions.
        with self.transaction():
            # Packages dataset ref is stored in the graph's global init
            # outputs, but it may also be missing.

            packages_ref, old_packages = self._find_dataset(
                graph.globalInitOutputRefs(), PipelineDatasetTypes.packagesDatasetName
            )
            if packages_ref is None:
                return

            if old_packages is not None:
                # Note that because we can only detect python modules that
                # have been imported, the stored list of products may be more
                # or less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # Have to remove the existing dataset first; butler has
                    # no replace option.
                    self.butler.pruneDatasets([packages_ref], unstore=True, purge=True)
                    self.butler.put(old_packages, packages_ref)
            else:
                self.butler.put(packages, packages_ref)

    def _find_dataset(
        self, refs: Iterable[DatasetRef], dataset_type: str
    ) -> tuple[DatasetRef | None, Any | None]:
        """Find a ref with a given dataset type name in a list of references
        and try to retrieve its data from butler.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to check for matching dataset type.
        dataset_type : `str`
            Name of a dataset type to look for.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Dataset reference or `None` if there is no matching dataset type.
        data : `Any`
            An existing object extracted from butler, `None` if ``ref`` is
            `None` or if there is no existing object for that reference.
        """
        ref: DatasetRef | None = None
        for ref in refs:
            if ref.datasetType.name == dataset_type:
                break
        else:
            # The else branch of a for loop runs only when the loop was not
            # broken, i.e. no reference matched the dataset type name.
            return None, None

        try:
            data = self.butler.get(ref)
            if data is not None and not self.extendRun:
                # It must not exist unless we are extending run.
                raise ConflictingDefinitionError(f"Dataset {ref} already exists in butler")
        except (LookupError, FileNotFoundError):
            data = None
        return ref, data
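
    # Standalone sketch of the `for ... else` idiom used above (hypothetical
    # helper, not part of the module): the `else` clause executes only when
    # the loop completes without `break`, making it a compact "search failed"
    # branch.
    #
    #     def find_even(values: list[int]) -> int | None:
    #         for value in values:
    #             if value % 2 == 0:
    #                 break
    #         else:
    #             return None  # loop exhausted: no even value found
    #         return value
    #
    #     find_even([1, 3, 4])  # -> 4
    #     find_even([1, 3, 5])  # -> None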

    def _task_iter(self, graph: QuantumGraph) -> Iterator[TaskDef]:
        """Iterate over TaskDefs in a graph, yielding only tasks that have
        one or more associated quanta.
        """
        for taskDef in graph.iterTaskGraph():
            if graph.getNumberOfQuantaForTask(taskDef) > 0:
                yield taskDef

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        Default implementation has no transaction support.
        """
        yield


class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    mock : `bool`, optional
        If `True` then also do initialization needed for pipeline mocking.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False, mock: bool = False):
        super().__init__(butler, taskFactory, extendRun)
        self.full_butler = butler
        self.mock = mock
        if self.extendRun and self.full_butler.run is None:
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.full_butler.registry, include_configs=True, include_packages=True
        )

        for datasetTypes, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, datasetTypes, is_input)

        if self.mock:
            # Register special mock dataset types, skipping logs and metadata.
            skipDatasetTypes = {taskDef.metadataDatasetName for taskDef in pipeline}
            skipDatasetTypes |= {taskDef.logOutputDatasetName for taskDef in pipeline}
            for datasetTypes, is_input in (
                (pipelineDatasetTypes.intermediates, True),
                (pipelineDatasetTypes.outputs, False),
            ):
                mockDatasetTypes = []
                for datasetType in datasetTypes:
                    if not (datasetType.name in skipDatasetTypes or datasetType.isComponent()):
                        mockDatasetTypes.append(
                            DatasetType(
                                MockButlerQuantumContext.mockDatasetTypeName(datasetType.name),
                                datasetType.dimensions,
                                "StructuredDataDict",
                            )
                        )
                if mockDatasetTypes:
                    self._register_output_dataset_types(registerDatasetTypes, mockDatasetTypes, is_input)

    def _register_output_dataset_types(
        self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool
    ) -> None:
        def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool:
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # This is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.full_butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    if not _check_compatibility(
                        datasetType, self.full_butler.registry.getDatasetType(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.full_butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types was forgotten.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "by passing the `--register-dataset-types` option to `pipetask run`."
            )


class PreExecInitLimited(PreExecInitBase):
    """Initialization for QuantumGraph execution with a LimitedButler.

    This class works with LimitedButler and expects that all references in
    QuantumGraph are resolved.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        super().__init__(butler, taskFactory, False)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        # With LimitedButler we never create or check dataset types.
        pass
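
# Hedged usage sketch (assumed setup, not part of the module): choosing
# between the two implementations depends on which butler is available.
# `qgraph_uri`, `repo`, and `limited_butler` are placeholder values.
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import QuantumGraph
#     from lsst.ctrl.mpexec import TaskFactory
#
#     qgraph = QuantumGraph.loadUri(qgraph_uri)
#     task_factory = TaskFactory()
#
#     # Full butler: can register dataset types and use real transactions.
#     butler = Butler(repo, run="my_run")
#     PreExecInit(butler, task_factory).initialize(qgraph, registerDatasetTypes=True)
#
#     # Limited butler: dataset types are neither created nor checked.
#     PreExecInitLimited(limited_butler, task_factory).initialize(qgraph)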