Coverage for python/lsst/pipe/base/graphBuilder.py : 20%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections, AdjustQuantumHelper
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph
from lsst.daf.butler import (
    CollectionSearch,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.utils import doImport

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

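# The logger name deliberately drops the first component of ``__name__``
# (e.g. ``lsst``), so messages from this module appear under
# ``pipe.base.graphBuilder``.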
_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
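
    Notes
    -----
    Conceptually, the nesting looks like the sketch below (the dataset type
    name and dimension keys are hypothetical, for illustration only)::

        {
            DatasetType("calexp", ...): {
                DataCoordinate({"instrument": ..., "visit": ...}): DatasetRef(...),
                ...
            },
            ...
        }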
75 """
76 def __init__(self, *args, universe: DimensionGraph):
77 super().__init__(*args)
78 self.universe = universe
80 @classmethod
81 def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
82 universe: DimensionUniverse) -> _DatasetDict:
83 """Construct a dictionary from a flat iterable of `DatasetType` keys.
85 Parameters
86 ----------
87 datasetTypes : `iterable` of `DatasetType`
88 DatasetTypes to use as keys for the dict. Values will be empty
89 dictionaries.
90 universe : `DimensionUniverse`
91 Universe of all possible dimensions.
93 Returns
94 -------
95 dictionary : `_DatasetDict`
96 A new `_DatasetDict` instance.
97 """
98 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)
100 @classmethod
101 def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
102 ) -> _DatasetDict:
103 """Return a new dictionary by extracting items corresponding to the
104 given keys from one or more existing dictionaries.
106 Parameters
107 ----------
108 datasetTypes : `iterable` of `DatasetType`
109 DatasetTypes to use as keys for the dict. Values will be obtained
110 by lookups against ``first`` and ``rest``.
111 first : `_DatasetDict`
112 Another dictionary from which to extract values.
113 rest
114 Additional dictionaries from which to extract values.
116 Returns
117 -------
118 dictionary : `_DatasetDict`
119 A new dictionary instance.
120 """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
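            # Single-element tuple unpacking: this raises ValueError if the
            # nested dict does not contain exactly one DatasetRef.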
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this quantum
        represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        # If it raises NoWorkFound, there is a bug in the QG algorithm
        # or adjustQuantum is incorrectly trying to make a prerequisite
        # input behave like a regular input; adjustQuantum should only raise
        # NoWorkFound if a regular input is missing, and it shouldn't be
        # possible for us to have generated ``self`` if that's true.
        helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
        helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            inputs=helper.inputs,
            outputs=helper.outputs,
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumSet(self) -> Set[Quantum]:
        """Create a `set` of `Quantum` from the information in ``self``.

        Returns
        -------
        nodes : `set` of `Quantum`
            The `Quantum` elements corresponding to this task.
        """
        return set(q.makeQuantum() for q in self.quanta.values())


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to categorize the pipeline's
        dataset types and to define the dimension universe.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, each corresponding
    to a different `_PipelineScaffolding` method (a usage sketch follows the
    numbered steps):

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the subsets
       that correspond to distinct data IDs for each task and dataset type,
       and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
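
    A minimal sketch of how these steps are typically driven (mirroring
    `GraphBuilder.makeGraph`; ``registry``, ``collections``, ``run``,
    ``userQuery``, and ``dataId`` are assumed to be supplied by the caller)::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        with scaffolding.connectDataIds(registry, collections, userQuery,
                                        dataId) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
        graph = scaffolding.makeQuantumGraph()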
408 """
409 def __init__(self, pipeline, *, registry):
410 _LOG.debug("Initializing data structures for QuantumGraph generation.")
411 self.tasks = []
412 # Aggregate and categorize the DatasetTypes in the Pipeline.
413 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
414 # Construct dictionaries that map those DatasetTypes to structures
415 # that will (later) hold addiitonal information about them.
416 for attr in ("initInputs", "initIntermediates", "initOutputs",
417 "inputs", "intermediates", "outputs", "prerequisites"):
418 setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
419 universe=registry.dimensions))
420 # Aggregate all dimensions for all non-init, non-prerequisite
421 # DatasetTypes. These are the ones we'll include in the big join
422 # query.
423 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
424 self.outputs.dimensions)
425 # Construct scaffolding nodes for each Task, and add backreferences
426 # to the Task from each DatasetScaffolding node.
427 # Note that there's only one scaffolding node for each DatasetType,
428 # shared by _PipelineScaffolding and all _TaskScaffoldings that
429 # reference it.
430 if isinstance(pipeline, Pipeline):
431 pipeline = pipeline.toExpandedPipeline()
432 self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
433 for taskDef, taskDatasetTypes in zip(pipeline,
434 datasetTypes.byTask.values())]
436 def __repr__(self):
437 # Default dataclass-injected __repr__ gets caught in an infinite loop
438 # because of back-references.
439 return f"_PipelineScaffolding(tasks={self.tasks}, ...)"
441 tasks: List[_TaskScaffolding]
442 """Scaffolding data structures for each task in the pipeline
443 (`list` of `_TaskScaffolding`).
444 """
446 initInputs: _DatasetDict
447 """Datasets consumed but not produced when constructing the tasks in this
448 pipeline (`_DatasetDict`).
449 """
451 initIntermediates: _DatasetDict
452 """Datasets that are both consumed and produced when constructing the tasks
453 in this pipeline (`_DatasetDict`).
454 """
456 initOutputs: _DatasetDict
457 """Datasets produced but not consumed when constructing the tasks in this
458 pipeline (`_DatasetDict`).
459 """
461 inputs: _DatasetDict
462 """Datasets that are consumed but not produced when running this pipeline
463 (`_DatasetDict`).
464 """
466 intermediates: _DatasetDict
467 """Datasets that are both produced and consumed when running this pipeline
468 (`_DatasetDict`).
469 """
471 outputs: _DatasetDict
472 """Datasets produced but not consumed when when running this pipeline
473 (`_DatasetDict`).
474 """
476 prerequisites: _DatasetDict
477 """Datasets that are consumed when running this pipeline and looked up
478 per-Quantum when generating the graph (`_DatasetDict`).
479 """
481 dimensions: DimensionGraph
482 """All dimensions used by any regular input, intermediate, or output
483 (not prerequisite) dataset; the set of dimension used in the "Big Join
484 Query" (`DimensionGraph`).
486 This is required to be a superset of all task quantum dimensions.
487 """
    @contextmanager
    def connectDataIds(self, registry, collections, userQuery, externalDataId):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        userQuery : `str` or `None`
            User-provided expression to limit the data IDs processed.
        externalDataId : `DataCoordinate`
            Externally-provided data ID that should be used to restrict the
            results, just as if these constraints had been included via ``AND``
            in ``userQuery``.  This includes (at least) any instrument named
            in the pipeline definition.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   dataId=externalDataId,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result row,
                # noting that we might have created some already.
                # We remember both those that already existed and those that we
                # create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them), and we rely
                    # on the nested dictionaries to deduplicate them.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            if n == 0:
                for message in commonDataIds.explain_no_results():
                    _LOG.warning(message)
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExistingIn=None,
                           clobberOutputs=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method populates `_DatasetScaffolding.refs` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExistingIn
            Expressions representing the collections to search for existing
            output datasets that should be skipped.  May be any of the types
            accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
            `None` or an empty string/sequence disables skipping.
        clobberOutputs : `bool`, optional
            If `True` (default), allow quanta to be created even if outputs
            exist; this requires the same behavior to be enabled when
            executing.  If ``skipExistingIn`` is not `None`, completed quanta
            (those with metadata, or all outputs if there is no metadata
            dataset configured) will be skipped rather than clobbered.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExistingIn`` does not include the output run, or if only
            some outputs are present and ``clobberOutputs`` is `False`.
        """
        skipCollections: Optional[CollectionSearch] = None
        skipExistingInRun = False
        if skipExistingIn:
            skipCollections = CollectionSearch.fromExpression(skipExistingIn)
            if run:
                # As an optimization, check the explicit list of names first.
                skipExistingInRun = run in skipCollections.explicitNames()
                if not skipExistingInRun:
                    # Need to flatten the expression and check again.
                    skipExistingInRun = run in registry.queryCollections(
                        skipExistingIn,
                        collectionTypes=CollectionType.RUN,
                    )

        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None or skipCollections is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                subset = commonDataIds.subset(datasetType.dimensions, unique=True)

                # Look at the RUN collection first.
                if run is not None:
                    resolvedRefQueryResults = subset.findDatasets(
                        datasetType,
                        collections=run,
                        findFirst=True
                    )
                    for resolvedRef in resolvedRefQueryResults:
                        # TODO: we could easily support per-DatasetType
                        # skipExisting and I could imagine that being useful -
                        # it's probably required in order to support writing
                        # initOutputs before QuantumGraph generation.
                        assert resolvedRef.dataId in refs
                        if not (skipExistingInRun or isInit or clobberOutputs):
                            raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                    f"output RUN collection '{run}' with data ID"
                                                    f" {resolvedRef.dataId}.")

                # Check skipExistingIn too; the case where the RUN collection
                # is in it is handled above.
                if skipCollections is not None:
                    resolvedRefQueryResults = subset.findDatasets(
                        datasetType,
                        collections=skipCollections,
                        findFirst=True
                    )
                    for resolvedRef in resolvedRefQueryResults:
                        assert resolvedRef.dataId in refs
                        refs[resolvedRef.dataId] = resolvedRef

        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                findFirst=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsFailed = []
            dataIdsSucceeded = []
            for quantum in task.quanta.values():
                # Process output datasets only if skipExistingIn is not None
                # or there is a run to look for outputs in and clobberOutputs
                # is True.  Note that if skipExistingIn is None, any output
                # datasets that already exist would have already caused an
                # exception to be raised.  We never update the DatasetRefs in
                # the quantum because those should never be resolved.
                if skipCollections is not None or (run is not None and clobberOutputs):
                    resolvedRefs = []
                    unresolvedRefs = []
                    haveMetadata = False
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                                if datasetType.name == task.taskDef.metadataDatasetName:
                                    haveMetadata = True
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if haveMetadata or not unresolvedRefs:
                            dataIdsSucceeded.append(quantum.dataId)
                            if skipCollections is not None:
                                continue
                        else:
                            dataIdsFailed.append(quantum.dataId)
                            if not clobberOutputs:
                                raise OutputExistsError(
                                    f"Quantum {quantum.dataId} of task with label "
                                    f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                    f"({resolvedRefs}) "
                                    f"and others that don't ({unresolvedRefs}), with no metadata output, "
                                    "and clobbering outputs was not enabled."
                                )
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we queried
                # for originally, because we want to permit those data ID
                # values to differ across quanta and dataset types.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # PipelineTask has provided its own function to do the
                        # lookup.  This always takes precedence.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    elif (datasetType.isCalibration()
                          and datasetType.dimensions <= quantum.dataId.graph
                          and quantum.dataId.graph.temporal):
                        # This is a master calibration lookup, which we have to
                        # handle specially because the query system can't do a
                        # temporal join on a non-dimension-based timespan yet.
                        timespan = quantum.dataId.timespan
                        try:
                            refs = [registry.findDataset(datasetType, quantum.dataId,
                                                         collections=collections,
                                                         timespan=timespan)]
                        except KeyError:
                            # This dataset type is not present in the registry,
                            # which just means there are no datasets here.
                            refs = []
                    else:
                        # Most general case.
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           findFirst=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
                                                               if ref is not None})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsSucceeded:
                if skipCollections is not None:
                    _LOG.debug("Pruning %d successful quanta for task with label '%s' because all of their "
                               "outputs exist or metadata was written successfully.",
                               len(dataIdsSucceeded), task.taskDef.label)
                    for dataId in dataIdsSucceeded:
                        del task.quanta[dataId]
                elif clobberOutputs:
                    _LOG.info("Found %d successful quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsSucceeded),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")
            if dataIdsFailed:
                if clobberOutputs:
                    _LOG.info("Found %d failed/incomplete quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsFailed),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")

    def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Parameters
        ----------
        metadata : `Mapping` of `str` to primitives, optional
            This is an optional parameter of extra data to carry with the
            graph.  Entries in this mapping should be able to be serialized in
            JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph({task.taskDef: task.makeQuantumSet() for task in self.tasks}, metadata=metadata)
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder is responsible for building a task execution graph from
    a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Data butler registry instance.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets that should be skipped.  May be any of the types
        accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
    clobberOutputs : `bool`, optional
        If `True` (default), allow quanta to be created even if partial outputs
        exist; this requires the same behavior to be enabled when executing.
    """

    def __init__(self, registry, skipExistingIn=None, clobberOutputs=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExistingIn = skipExistingIn
        self.clobberOutputs = clobberOutputs

    def makeGraph(self, pipeline, collections, run, userQuery,
                  metadata: Optional[Mapping[str, Any]] = None):
        """Create an execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline` or `Iterable` [ `TaskDef` ]
            Pipeline definition, task names/classes and their configs.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines user-defined selection for registry; should be
            empty or `None` if there are no restrictions on data selection.
        metadata : `Mapping` of `str` to primitives, optional
            This is an optional parameter of extra data to carry with the
            graph.  Entries in this mapping should be able to be serialized in
            JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
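
        Examples
        --------
        A minimal usage sketch; the butler, collection names, and query string
        below are hypothetical placeholders::

            builder = GraphBuilder(butler.registry, skipExistingIn=None, clobberOutputs=True)
            qgraph = builder.makeGraph(
                pipeline,
                collections=["HSC/defaults"],
                run="u/someone/example-run",
                userQuery="instrument = 'HSC' AND visit = 12345",
            )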
925 """
926 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
927 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
928 raise ValueError("Pipeline requires input datasets but no input collections provided.")
929 instrument = None
930 if isinstance(pipeline, Pipeline):
931 instrument = pipeline.getInstrument()
932 if isinstance(instrument, str):
933 instrument = doImport(instrument)
934 pipeline = list(pipeline.toExpandedPipeline())
935 if instrument is not None:
936 dataId = DataCoordinate.standardize(instrument=instrument.getName(),
937 universe=self.registry.dimensions)
938 else:
939 dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
940 with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId) as commonDataIds:
941 scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
942 skipExistingIn=self.skipExistingIn,
943 clobberOutputs=self.clobberOutputs)
944 return scaffolding.makeQuantumGraph(metadata=metadata)