Coverage for python/lsst/ctrl/mpexec/preExecInit.py: 18%
203 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-13 02:55 -0800
1# This file is part of ctrl_mpexec.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ["PreExecInit"]
26# -------------------------------
27# Imports of standard modules --
28# -------------------------------
29import abc
30import logging
31from collections.abc import Iterable, Iterator
32from contextlib import contextmanager
33from typing import TYPE_CHECKING, Any
35# -----------------------------
36# Imports for other modules --
37# -----------------------------
38from lsst.daf.butler import DataCoordinate, DatasetIdFactory, DatasetRef, DatasetType
39from lsst.daf.butler.registry import ConflictingDefinitionError
40from lsst.pipe.base import PipelineDatasetTypes
41from lsst.utils.packages import Packages
43from .mock_task import MockButlerQuantumContext
45if TYPE_CHECKING: 45 ↛ 46line 45 didn't jump to line 46, because the condition on line 45 was never true
46 from lsst.daf.butler import Butler, LimitedButler
47 from lsst.pipe.base import QuantumGraph, TaskDef, TaskFactory
49_LOG = logging.getLogger(__name__)
class MissingReferenceError(Exception):
    """Exception raised when a resolved dataset reference cannot be found in
    (or generated from) a quantum graph.
    """
58def _compare_packages(old_packages: Packages, new_packages: Packages) -> None:
59 """Compare two versions of Packages.
61 Parameters
62 ----------
63 old_packages : `Packages`
64 Previously recorded package versions.
65 new_packages : `Packages`
66 New set of package versions.
68 Raises
69 ------
70 TypeError
71 Raised if parameters are inconsistent.
72 """
73 diff = new_packages.difference(old_packages)
74 if diff:
75 versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
76 raise TypeError(f"Package versions mismatch: ({versions_str})")
77 else:
78 _LOG.debug("new packages are consistent with old")
class PreExecInitBase(abc.ABC):
    """Common part of the implementation of PreExecInit classes that does not
    depend on Butler type.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Butler instance used for dataset I/O and task construction.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Factory used to instantiate pipeline tasks.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        self.butler = butler
        self.taskFactory = taskFactory

    def initialize(
        self,
        graph: QuantumGraph,
        saveInitOutputs: bool = True,
        registerDatasetTypes: bool = False,
        saveVersions: bool = True,
    ) -> None:
        """Perform all initialization steps.

        Convenience method to execute all initialization steps. Instead of
        calling this method and providing all options it is also possible to
        call methods individually.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        saveInitOutputs : `bool`, optional
            If ``True`` (default) then save "init outputs", configurations,
            and package versions to butler.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.
        saveVersions : `bool`, optional
            If ``False`` then do not save package versions even if
            ``saveInitOutputs`` is set to ``True``.
        """
        # Register dataset types or check consistency with existing ones.
        self.initializeDatasetTypes(graph, registerDatasetTypes)

        # Save task initialization data or check that saved data
        # is consistent with what tasks would save.
        if saveInitOutputs:
            self.saveInitOutputs(graph)
            self.saveConfigs(graph)
            if saveVersions:
                self.savePackageVersions(graph)

    @abc.abstractmethod
    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        """Save or check DatasetTypes output by the tasks in a graph.

        Iterates over all DatasetTypes for all tasks in a graph and either
        tries to add them to registry or compares them to existing ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.
        registerDatasetTypes : `bool`, optional
            If ``True`` then register dataset types in registry, otherwise
            they must be already registered.

        Raises
        ------
        ValueError
            Raised if existing DatasetType is different from DatasetType
            in a graph.
        KeyError
            Raised if ``registerDatasetTypes`` is ``False`` and DatasetType
            does not exist in registry.
        """
        raise NotImplementedError()

    def saveInitOutputs(self, graph: QuantumGraph) -> None:
        """Write any datasets produced by initializing tasks in a graph.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if the type of existing object in butler is different from
            new data.
        """
        _LOG.debug("Will save InitOutputs for all tasks")
        for taskDef in graph.iterTaskGraph():
            init_input_refs = self.find_init_input_refs(taskDef, graph)
            task = self.taskFactory.makeTask(taskDef, self.butler, init_input_refs)
            for name in taskDef.connections.initOutputs:
                attribute = getattr(taskDef.connections, name)
                obj_from_store, init_output_ref = self.find_init_output(taskDef, attribute.name, graph)
                if init_output_ref is None:
                    # Defensive check; find_init_output is expected to raise
                    # MissingReferenceError instead of returning None.
                    raise ValueError(f"Cannot find or make dataset reference for init output {name}")
                init_output_var = getattr(task, name)

                if obj_from_store is not None:
                    _LOG.debug(
                        "Retrieving InitOutputs for task=%s key=%s dsTypeName=%s", task, name, attribute.name
                    )
                    # ``obj_from_store`` was already read from butler by
                    # ``find_init_output``; fetching it again here (as the
                    # previous implementation did with an extra ``getDirect``
                    # call) would be redundant I/O.
                    # Types are supposed to be identical.
                    # TODO: Check that object contents is identical too.
                    if type(obj_from_store) is not type(init_output_var):
                        raise TypeError(
                            f"Stored initOutput object type {type(obj_from_store)} "
                            f"is different from task-generated type "
                            f"{type(init_output_var)} for task {taskDef}"
                        )
                else:
                    _LOG.debug("Saving InitOutputs for task=%s key=%s", taskDef.label, name)
                    # This can still raise if there is a concurrent write.
                    self.butler.putDirect(init_output_var, init_output_ref)

    def saveConfigs(self, graph: QuantumGraph) -> None:
        """Write configurations for pipeline tasks to butler or check that
        existing configurations are equal to the new ones.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is different from new data.
        Exception
            Raised if ``extendRun`` is `False` and datasets already exists.
            Content of a butler collection should not be changed if exception
            is raised.
        """

        def logConfigMismatch(msg: str) -> None:
            """Log messages about configuration mismatch."""
            _LOG.fatal("Comparing configuration: %s", msg)

        _LOG.debug("Will save Configs for all tasks")
        # Start transaction to rollback any changes on exceptions.
        with self.transaction():
            for taskDef in graph.iterTaskGraph():
                config_name = taskDef.configDatasetName

                old_config, dataset_ref = self.find_init_output(taskDef, config_name, graph)

                if old_config is not None:
                    if not taskDef.config.compare(old_config, shortcut=False, output=logConfigMismatch):
                        raise TypeError(
                            f"Config does not match existing task config {config_name!r} in "
                            "butler; tasks configurations must be consistent within the same run collection"
                        )
                else:
                    # Butler will raise exception if dataset is already there.
                    _LOG.debug("Saving Config for task=%s dataset type=%s", taskDef.label, config_name)
                    self.butler.putDirect(taskDef.config, dataset_ref)

    def savePackageVersions(self, graph: QuantumGraph) -> None:
        """Write versions of software packages to butler.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Execution graph.

        Raises
        ------
        TypeError
            Raised if existing object in butler is incompatible with new data.
        """
        packages = Packages.fromSystem()
        _LOG.debug("want to save packages: %s", packages)

        # Start transaction to rollback any changes on exceptions.
        with self.transaction():
            old_packages, dataset_ref = self.find_packages(graph)

            if old_packages is not None:
                # Note that because we can only detect python modules that have
                # been imported, the stored list of products may be more or
                # less complete than what we have now. What's important is
                # that the products that are in common have the same version.
                _compare_packages(old_packages, packages)
                # Update the old set of packages in case we have more packages
                # that haven't been persisted.
                extra = packages.extra(old_packages)
                if extra:
                    _LOG.debug("extra packages: %s", extra)
                    old_packages.update(packages)
                    # Have to remove existing dataset first; butler has no
                    # replace option.
                    self.butler.pruneDatasets([dataset_ref], unstore=True, purge=True)
                    self.butler.putDirect(old_packages, dataset_ref)
            else:
                self.butler.putDirect(packages, dataset_ref)

    @abc.abstractmethod
    def find_init_input_refs(self, taskDef: TaskDef, graph: QuantumGraph) -> Iterable[DatasetRef]:
        """Return the list of resolved dataset references for task init
        inputs.

        Parameters
        ----------
        taskDef : `~lsst.pipe.base.TaskDef`
            Pipeline task definition.
        graph : `~lsst.pipe.base.QuantumGraph`
            Quantum graph.

        Returns
        -------
        refs : `~collections.abc.Iterable` [`~lsst.daf.butler.DatasetRef`]
            Resolved dataset references.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def find_init_output(
        self, taskDef: TaskDef, dataset_type: str, graph: QuantumGraph
    ) -> tuple[Any | None, DatasetRef]:
        """Find task init output for given dataset type.

        Parameters
        ----------
        taskDef : `~lsst.pipe.base.TaskDef`
            Pipeline task definition.
        dataset_type : `str`
            Dataset type name.
        graph : `~lsst.pipe.base.QuantumGraph`
            Quantum graph.

        Returns
        -------
        data
            Existing init output object retrieved from butler, `None` if
            butler has no existing object.
        ref : `~lsst.daf.butler.DatasetRef`
            Resolved reference for init output to be stored in butler.

        Raises
        ------
        MissingReferenceError
            Raised if reference cannot be found or generated.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def find_packages(self, graph: QuantumGraph) -> tuple[Packages | None, DatasetRef]:
        """Find packages information.

        Parameters
        ----------
        graph : `~lsst.pipe.base.QuantumGraph`
            Quantum graph.

        Returns
        -------
        packages : `lsst.utils.packages.Packages` or `None`
            Existing packages data retrieved from butler, or `None`.
        ref : `~lsst.daf.butler.DatasetRef`
            Resolved reference for packages to be stored in butler.

        Raises
        ------
        MissingReferenceError
            Raised if reference cannot be found or generated.
        """
        raise NotImplementedError()

    @contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager for transaction.

        Default implementation has no transaction support.
        """
        yield
class PreExecInit(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class encapsulates all necessary operations that have to be performed
    on butler and registry to prepare them for QuantumGraph execution.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        Data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    extendRun : `bool`, optional
        If `True` then do not try to overwrite any datasets that might exist
        in ``butler.run``; instead compare them when appropriate/possible. If
        `False`, then any existing conflicting dataset will cause a butler
        exception to be raised.
    mock : `bool`, optional
        If `True` then also do initialization needed for pipeline mocking.
    """

    def __init__(self, butler: Butler, taskFactory: TaskFactory, extendRun: bool = False, mock: bool = False):
        super().__init__(butler, taskFactory)
        # Keep a separately-named reference to the full Butler; the base
        # class attribute only promises the LimitedButler interface.
        self.full_butler = butler
        self.extendRun = extendRun
        self.mock = mock
        if self.extendRun and self.full_butler.run is None:
            # extendRun compares against datasets in butler.run, so a default
            # output RUN collection is required up front.
            raise RuntimeError(
                "Cannot perform extendRun logic unless butler is initialized "
                "with a default output RUN collection."
            )

    @contextmanager
    def transaction(self) -> Iterator[None]:
        # docstring inherited
        with self.full_butler.transaction():
            yield

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        pipeline = graph.taskGraph
        pipelineDatasetTypes = PipelineDatasetTypes.fromPipeline(
            pipeline, registry=self.full_butler.registry, include_configs=True, include_packages=True
        )

        # Each category of dataset types is either output-only or also used
        # as input (intermediates), which affects the compatibility check.
        for datasetTypes, is_input in (
            (pipelineDatasetTypes.initIntermediates, True),
            (pipelineDatasetTypes.initOutputs, False),
            (pipelineDatasetTypes.intermediates, True),
            (pipelineDatasetTypes.outputs, False),
        ):
            self._register_output_dataset_types(registerDatasetTypes, datasetTypes, is_input)

        if self.mock:
            # register special mock data types, skip logs and metadata
            skipDatasetTypes = {taskDef.metadataDatasetName for taskDef in pipeline}
            skipDatasetTypes |= {taskDef.logOutputDatasetName for taskDef in pipeline}
            for datasetTypes, is_input in (
                (pipelineDatasetTypes.intermediates, True),
                (pipelineDatasetTypes.outputs, False),
            ):
                mockDatasetTypes = []
                for datasetType in datasetTypes:
                    if not (datasetType.name in skipDatasetTypes or datasetType.isComponent()):
                        mockDatasetTypes.append(
                            DatasetType(
                                MockButlerQuantumContext.mockDatasetTypeName(datasetType.name),
                                datasetType.dimensions,
                                "StructuredDataDict",
                            )
                        )
                if mockDatasetTypes:
                    self._register_output_dataset_types(registerDatasetTypes, mockDatasetTypes, is_input)

    def _register_output_dataset_types(
        self, registerDatasetTypes: bool, datasetTypes: Iterable[DatasetType], is_input: bool
    ) -> None:
        """Register dataset types in registry, or check them against existing
        registry definitions.

        Parameters
        ----------
        registerDatasetTypes : `bool`
            If `True` try to register each dataset type, otherwise only
            compare it against the registry definition.
        datasetTypes : `~collections.abc.Iterable` \
                [`~lsst.daf.butler.DatasetType`]
            Dataset types to register or check.
        is_input : `bool`
            `True` if these dataset types are also used as inputs, which
            requires storage-class compatibility in both directions.

        Raises
        ------
        ValueError
            Raised if an existing registry definition is incompatible with
            the graph's definition.
        KeyError
            Raised if ``registerDatasetTypes`` is `False` and some dataset
            types are not registered.
        """

        def _check_compatibility(datasetType: DatasetType, expected: DatasetType, is_input: bool) -> bool:
            """Return `True` if the differing definitions are storage-class
            compatible for the way this dataset type is used.
            """
            # These are output dataset types so check for compatibility on put.
            is_compatible = expected.is_compatible_with(datasetType)

            if is_input:
                # This dataset type is also used for input so must be
                # compatible on get as well.
                is_compatible = is_compatible and datasetType.is_compatible_with(expected)

            if is_compatible:
                _LOG.debug(
                    "The dataset type configurations differ (%s from task != %s from registry) "
                    "but the storage classes are compatible. Can continue.",
                    datasetType,
                    expected,
                )
            return is_compatible

        missing_datasetTypes = set()
        for datasetType in datasetTypes:
            # Only composites are registered, no components, and by this point
            # the composite should already exist.
            if registerDatasetTypes and not datasetType.isComponent():
                _LOG.debug("Registering DatasetType %s with registry", datasetType)
                # this is a no-op if it already exists and is consistent,
                # and it raises if it is inconsistent.
                try:
                    self.full_butler.registry.registerDatasetType(datasetType)
                except ConflictingDefinitionError:
                    if not _check_compatibility(
                        datasetType, self.full_butler.registry.getDatasetType(datasetType.name), is_input
                    ):
                        raise
            else:
                _LOG.debug("Checking DatasetType %s against registry", datasetType)
                try:
                    expected = self.full_butler.registry.getDatasetType(datasetType.name)
                except KeyError:
                    # Likely means that --register-dataset-types is forgotten.
                    # Collect all missing names and report them together below.
                    missing_datasetTypes.add(datasetType.name)
                    continue
                if expected != datasetType:
                    if not _check_compatibility(datasetType, expected, is_input):
                        raise ValueError(
                            f"DatasetType configuration does not match Registry: {datasetType} != {expected}"
                        )

        if missing_datasetTypes:
            plural = "s" if len(missing_datasetTypes) != 1 else ""
            raise KeyError(
                f"Missing dataset type definition{plural}: {', '.join(missing_datasetTypes)}. "
                "Dataset types have to be registered with either `butler register-dataset-type` or "
                "passing `--register-dataset-types` option to `pipetask run`."
            )

    def find_init_input_refs(self, taskDef: TaskDef, graph: QuantumGraph) -> Iterable[DatasetRef]:
        # docstring inherited
        refs: list[DatasetRef] = []
        for name in taskDef.connections.initInputs:
            attribute = getattr(taskDef.connections, name)
            # Init datasets are not associated with any dimensions, so an
            # empty data ID is used for lookup.
            dataId = DataCoordinate.makeEmpty(self.full_butler.dimensions)
            dataset_type = DatasetType(attribute.name, graph.universe.empty, attribute.storageClass)
            ref = self.full_butler.registry.findDataset(dataset_type, dataId)
            if ref is None:
                raise ValueError(f"InitInput does not exist in butler for dataset type {dataset_type}")
            refs.append(ref)
        return refs

    def find_init_output(
        self, taskDef: TaskDef, dataset_type_name: str, graph: QuantumGraph
    ) -> tuple[Any | None, DatasetRef]:
        # docstring inherited
        dataset_type = self.full_butler.registry.getDatasetType(dataset_type_name)
        dataId = DataCoordinate.makeEmpty(self.full_butler.dimensions)
        return self._find_existing(dataset_type, dataId)

    def find_packages(self, graph: QuantumGraph) -> tuple[Packages | None, DatasetRef]:
        # docstring inherited
        dataset_type = self.full_butler.registry.getDatasetType(PipelineDatasetTypes.packagesDatasetName)
        dataId = DataCoordinate.makeEmpty(self.full_butler.dimensions)
        return self._find_existing(dataset_type, dataId)

    def _find_existing(
        self, dataset_type: DatasetType, dataId: DataCoordinate
    ) -> tuple[Any | None, DatasetRef]:
        """Make a reference of a given dataset type and try to retrieve it from
        butler. If not found then generate new resolved reference.

        Returns the retrieved object (or `None`) together with a resolved
        `~lsst.daf.butler.DatasetRef`. Only consults the existing dataset
        when ``extendRun`` is enabled; otherwise a fresh resolved reference
        is always generated.
        """
        run = self.full_butler.run
        assert run is not None

        ref = self.full_butler.registry.findDataset(dataset_type, dataId, collections=[run])
        if self.extendRun and ref is not None:
            try:
                config = self.butler.getDirect(ref)
                return config, ref
            except (LookupError, FileNotFoundError):
                # Registry knows the dataset but the datastore cannot serve
                # it; hand back the reference with no object.
                return None, ref
        else:
            # make new resolved dataset ref
            ref = DatasetRef(dataset_type, dataId)
            ref = DatasetIdFactory().resolveRef(ref, run)
            return None, ref
class PreExecInitLimited(PreExecInitBase):
    """Initialization of registry for QuantumGraph execution.

    This class works with LimitedButler and expects that all references in
    QuantumGraph are resolved.

    Parameters
    ----------
    butler : `~lsst.daf.butler.LimitedButler`
        Limited data butler instance.
    taskFactory : `~lsst.pipe.base.TaskFactory`
        Task factory.
    """

    def __init__(self, butler: LimitedButler, taskFactory: TaskFactory):
        super().__init__(butler, taskFactory)

    def initializeDatasetTypes(self, graph: QuantumGraph, registerDatasetTypes: bool = False) -> None:
        # docstring inherited
        # A LimitedButler has no registry, so dataset types are never
        # created or checked here.
        pass

    def find_init_input_refs(self, taskDef: TaskDef, graph: QuantumGraph) -> Iterable[DatasetRef]:
        # docstring inherited
        init_input_refs = graph.initInputRefs(taskDef)
        return init_input_refs if init_input_refs else []

    def find_init_output(
        self, taskDef: TaskDef, dataset_type: str, graph: QuantumGraph
    ) -> tuple[Any | None, DatasetRef]:
        # docstring inherited
        init_output_refs = graph.initOutputRefs(taskDef) or []
        return self._find_existing(init_output_refs, dataset_type)

    def find_packages(self, graph: QuantumGraph) -> tuple[Packages | None, DatasetRef]:
        # docstring inherited
        return self._find_existing(graph.globalInitOutputRefs(), PipelineDatasetTypes.packagesDatasetName)

    def _find_existing(self, refs: Iterable[DatasetRef], dataset_type: str) -> tuple[Any | None, DatasetRef]:
        """Locate the reference with the given dataset type name among
        ``refs`` and try to retrieve the corresponding object from butler.
        """
        matching_ref = next((ref for ref in refs if ref.datasetType.name == dataset_type), None)
        if matching_ref is None:
            raise MissingReferenceError(f"Failed to find reference for dataset type {dataset_type}")
        try:
            return self.butler.getDirect(matching_ref), matching_ref
        except (LookupError, FileNotFoundError):
            # Reference exists in the graph but the dataset is not stored.
            return None, matching_ref