Coverage for python/lsst/pipe/base/graphBuilder.py : 20%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining the GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
#  Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping
import logging

# -----------------------------
#  Imports for other modules --
# -----------------------------
from .connections import iterConnections, AdjustQuantumHelper
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph
from lsst.daf.butler import (
    CollectionSearch,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.utils import doImport

# ----------------------------------
#  Local non-exported definitions --
# ----------------------------------
_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionGraph):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)
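

# A minimal sketch of how _DatasetDict is used by the scaffolding classes
# below (names such as ``registry``, ``flatDatasetTypes``, ``someDatasetType``
# and ``someDataId`` are hypothetical stand-ins for objects obtained from a
# butler data repository; see the real call sites further down):
#
#     dd = _DatasetDict.fromDatasetTypes(flatDatasetTypes, universe=registry.dimensions)
#     dd[someDatasetType][someDataId] = DatasetRef(someDatasetType, someDataId)
#     refsByType = dd.unpackMultiRefs()   # NamedKeyDict[DatasetType, List[DatasetRef]]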


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        # If it raises NoWorkFound, there is a bug in the QG algorithm or
        # adjustQuantum is incorrectly trying to make a prerequisite input
        # behave like a regular input; adjustQuantum should only raise
        # NoWorkFound if a regular input is missing, and it shouldn't be
        # possible for us to have generated ``self`` if that's true.
        helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
        helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            inputs=helper.inputs,
            outputs=helper.outputs,
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumSet(self) -> Set[Quantum]:
        """Create a `set` of `Quantum` from the information in ``self``.

        Returns
        -------
        nodes : `set` of `Quantum`
            The `Quantum` elements corresponding to this task.
        """
        return set(q.makeQuantum() for q in self.quanta.values())


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, each corresponding
    to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery, externalDataId):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        userQuery : `str` or `None`
            User-provided expression to limit the data IDs processed.
        externalDataId : `DataCoordinate`
            Externally-provided data ID that should be used to restrict the
            results, just as if these constraints had been included via
            ``AND`` in ``userQuery``.  This includes (at least) any instrument
            named in the pipeline definition.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   dataId=externalDataId,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.
                # We remember both those that already existed and those that
                # we create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(),
                                                         self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them), and we rely
                    # on the nested dictionaries to deduplicate them.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds
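
    # For orientation, the "Big Join Query" built in connectDataIds above is
    # equivalent to a direct registry call along these lines (the dimension
    # names, dataset type, collection, and ``where`` expression are purely
    # hypothetical examples; the real values come from the pipeline and the
    # caller):
    #
    #     dataIds = registry.queryDataIds(
    #         {"instrument", "visit", "detector"},    # self.dimensions
    #         datasets=["raw"],                       # regular input dataset types
    #         collections=["HSC/defaults"],
    #         where="instrument = 'HSC' AND visit = 903334",
    #     )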

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExistingIn=None,
                           clobberOutputs=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method populates `_DatasetScaffolding.refs` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExistingIn
            Expressions representing the collections to search for existing
            output datasets that should be skipped.  May be any of the types
            accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
            `None` or an empty string/sequence disables skipping.
        clobberOutputs : `bool`, optional
            If `True` (default), allow quanta to be created even if outputs
            exist; this requires the same behavior to be enabled when
            executing.  If ``skipExistingIn`` is not `None`, completed quanta
            (those with metadata, or all outputs if there is no metadata
            dataset configured) will be skipped rather than clobbered.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExistingIn`` does not include the output run, or if only
            some outputs are present and ``clobberOutputs`` is `False`.
        """
        skipCollections: Optional[CollectionSearch] = None
        skipExistingInRun = False
        if skipExistingIn:
            skipCollections = CollectionSearch.fromExpression(skipExistingIn)
            if run:
                # As an optimization, check the explicit list of names first.
                skipExistingInRun = run in skipCollections.explicitNames()
                if not skipExistingInRun:
                    # Need to flatten the expression and check again.
                    skipExistingInRun = run in registry.queryCollections(
                        skipExistingIn,
                        collectionTypes=CollectionType.RUN,
                    )

        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None or skipCollections is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                subset = commonDataIds.subset(datasetType.dimensions, unique=True)

                # Look at the RUN collection first.
                if run is not None:
                    resolvedRefQueryResults = subset.findDatasets(
                        datasetType,
                        collections=run,
                        findFirst=True
                    )
                    for resolvedRef in resolvedRefQueryResults:
                        # TODO: we could easily support per-DatasetType
                        # skipExisting and I could imagine that being useful -
                        # it's probably required in order to support writing
                        # initOutputs before QuantumGraph generation.
                        assert resolvedRef.dataId in refs
                        if not (skipExistingInRun or isInit or clobberOutputs):
                            raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                    f"output RUN collection '{run}' with data ID"
                                                    f" {resolvedRef.dataId}.")

                # Also check skipExistingIn; the case where the RUN collection
                # is included in it is handled above.
                if skipCollections is not None:
                    resolvedRefQueryResults = subset.findDatasets(
                        datasetType,
                        collections=skipCollections,
                        findFirst=True
                    )
                    for resolvedRef in resolvedRefQueryResults:
                        assert resolvedRef.dataId in refs
                        refs[resolvedRef.dataId] = resolvedRef

        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                findFirst=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )

        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsFailed = []
            dataIdsSucceeded = []
            for quantum in task.quanta.values():
                # Process output datasets only if skipExistingIn is not None
                # or there is a run to look for outputs in and clobberOutputs
                # is True.  Note that if skipExistingIn is None, any output
                # datasets that already exist would have already caused an
                # exception to be raised.  We never update the DatasetRefs in
                # the quantum because those should never be resolved.
                if skipCollections is not None or (run is not None and clobberOutputs):
                    resolvedRefs = []
                    unresolvedRefs = []
                    haveMetadata = False
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                                if datasetType.name == task.taskDef.metadataDatasetName:
                                    haveMetadata = True
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if haveMetadata or not unresolvedRefs:
                            dataIdsSucceeded.append(quantum.dataId)
                            if skipCollections is not None:
                                continue
                        else:
                            dataIdsFailed.append(quantum.dataId)
                            if not clobberOutputs:
                                raise OutputExistsError(
                                    f"Quantum {quantum.dataId} of task with label "
                                    f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                    f"({resolvedRefs}) "
                                    f"and others that don't ({unresolvedRefs}), with no metadata output, "
                                    "and clobbering outputs was not enabled."
                                )
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those data
                # ID values to differ across quanta and dataset types.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # PipelineTask has provided its own function to do the
                        # lookup.  This always takes precedence.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    elif (datasetType.isCalibration()
                          and datasetType.dimensions <= quantum.dataId.graph
                          and quantum.dataId.graph.temporal):
                        # This is a master calibration lookup, which we have
                        # to handle specially because the query system can't
                        # do a temporal join on a non-dimension-based timespan
                        # yet.
                        timespan = quantum.dataId.timespan
                        try:
                            refs = [registry.findDataset(datasetType, quantum.dataId,
                                                         collections=collections,
                                                         timespan=timespan)]
                        except KeyError:
                            # This dataset type is not present in the
                            # registry, which just means there are no datasets
                            # here.
                            refs = []
                    else:
                        # Most general case.
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           findFirst=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
                                                               if ref is not None})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsSucceeded:
                if skipCollections is not None:
                    _LOG.debug("Pruning %d successful quanta for task with label '%s' because all of "
                               "their outputs exist or metadata was written successfully.",
                               len(dataIdsSucceeded), task.taskDef.label)
                    for dataId in dataIdsSucceeded:
                        del task.quanta[dataId]
                elif clobberOutputs:
                    _LOG.info("Found %d successful quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsSucceeded),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")
            if dataIdsFailed:
                if clobberOutputs:
                    _LOG.info("Found %d failed/incomplete quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsFailed),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")

    def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Parameters
        ----------
        metadata : `Mapping` of `str` to primitives, optional
            This is an optional parameter of extra data to carry with the
            graph.  Entries in this mapping should be able to be serialized
            in JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph({task.taskDef: task.makeQuantumSet() for task in self.tasks}, metadata=metadata)
        return graph


# ------------------------
#  Exported definitions --
# ------------------------

class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Data butler instance.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets that should be skipped.  May be any of the types
        accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
    clobberOutputs : `bool`, optional
        If `True` (default), allow quanta to be created even if partial
        outputs exist; this requires the same behavior to be enabled when
        executing.
    """

    def __init__(self, registry, skipExistingIn=None, clobberOutputs=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExistingIn = skipExistingIn
        self.clobberOutputs = clobberOutputs

    def makeGraph(self, pipeline, collections, run, userQuery,
                  metadata: Optional[Mapping[str, Any]] = None):
        """Create an execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines a user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.
        metadata : `Mapping` of `str` to primitives, optional
            This is an optional parameter of extra data to carry with the
            graph.  Entries in this mapping should be able to be serialized
            in JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
            raise ValueError("Pipeline requires input datasets but no input collections provided.")
        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        if instrument is not None:
            dataId = DataCoordinate.standardize(instrument=instrument.getName(),
                                                universe=self.registry.dimensions)
        else:
            dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
        with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExistingIn=self.skipExistingIn,
                                           clobberOutputs=self.clobberOutputs)
        return scaffolding.makeQuantumGraph(metadata=metadata)
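

# A minimal usage sketch for GraphBuilder (the collection names, run name, and
# query expression below are hypothetical; ``registry`` and ``pipeline`` must
# come from a real butler repository and pipeline definition):
#
#     builder = GraphBuilder(registry, skipExistingIn=None, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/someone/demo-run",
#         userQuery="instrument = 'HSC' AND visit = 903334",
#     )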