Coverage for python/lsst/pipe/base/graphBuilder.py: 19%

388 statements  

coverage.py v6.4.2, created at 2022-08-04 09:17 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap 

34from contextlib import contextmanager 

35from dataclasses import dataclass 

36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Union 

37 

38from lsst.daf.butler import ( 

39 CollectionSearch, 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 Quantum, 

50 Registry, 

51) 

52from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

53from lsst.utils import doImportType 

54 

55from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

56from ._status import NoWorkFound 

57 

58# ----------------------------- 

59# Imports for other modules -- 

60# ----------------------------- 

61from .connections import AdjustQuantumHelper, iterConnections 

62from .graph import QuantumGraph 

63from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

64 

65# ---------------------------------- 

66# Local non-exported definitions -- 

67# ---------------------------------- 

68 

69_LOG = logging.getLogger(__name__) 

70 

71 

72class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

73 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

74 the known `DatasetRef` instances of that type. 

75 

76 Parameters 

77 ---------- 

78 args 

79 Positional arguments are forwarded to the `dict` constructor. 

80 universe : `DimensionUniverse` 

81 Universe of all possible dimensions. 

82 """ 

83 

84 def __init__(self, *args: Any, universe: DimensionUniverse): 

85 super().__init__(*args) 

86 self.universe = universe 

87 

88 @classmethod 

89 def fromDatasetTypes( 

90 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

91 ) -> _DatasetDict: 

92 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

93 

94 Parameters 

95 ---------- 

96 datasetTypes : `iterable` of `DatasetType` 

97 DatasetTypes to use as keys for the dict. Values will be empty 

98 dictionaries. 

99 universe : `DimensionUniverse` 

100 Universe of all possible dimensions. 

101 

102 Returns 

103 ------- 

104 dictionary : `_DatasetDict` 

105 A new `_DatasetDict` instance. 

106 """ 

107 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

108 
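# Illustrative sketch (not part of the original module): the shape that
# fromDatasetTypes produces, with placeholder names standing in for real
# DatasetType objects and a real DimensionUniverse.
#
#     >>> d = _DatasetDict.fromDatasetTypes([raw_type, calexp_type], universe=universe)
#     >>> # Each key now maps to an initially empty dict that will later be
#     >>> # filled with {DataCoordinate: DatasetRef} entries.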

109 @classmethod 

110 def fromSubset( 

111 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

112 ) -> _DatasetDict: 

113 """Return a new dictionary by extracting items corresponding to the 

114 given keys from one or more existing dictionaries. 

115 

116 Parameters 

117 ---------- 

118 datasetTypes : `iterable` of `DatasetType` 

119 DatasetTypes to use as keys for the dict. Values will be obtained 

120 by lookups against ``first`` and ``rest``. 

121 first : `_DatasetDict` 

122 Another dictionary from which to extract values. 

123 rest 

124 Additional dictionaries from which to extract values. 

125 

126 Returns 

127 ------- 

128 dictionary : `_DatasetDict` 

129 A new dictionary instance. 

130 """ 

131 combined = ChainMap(first, *rest) 

132 

133 # Dataset types known to match immediately can be processed 

134 # without checks. 

135 matches = combined.keys() & set(datasetTypes) 

136 _dict = {k: combined[k] for k in matches} 

137 

138 if len(_dict) < len(datasetTypes): 

139 # Work out which ones are missing. 

140 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

141 

142 # Get the known names for comparison. 

143 combined_by_name = {k.name: k for k in combined} 

144 

145 missing = set() 

146 incompatible = {} 

147 for datasetType in missing_datasetTypes: 

148 # The dataset type was not found. It may not be listed

149 # at all, or it may be present with the same name

150 # but a different definition.

151 if datasetType.name in combined_by_name: 

152 # This implies some inconsistency in the definitions

153 # of the connections. If there is support for storage

154 # class conversion we can let it slide.

155 # At this point we do not know

156 # where the inconsistency is, but trust that

157 # downstream code will be more explicit about input

158 # vs output incompatibilities.

159 existing = combined_by_name[datasetType.name] 

160 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing): 

161 _LOG.warning( 

162 "Dataset type mismatch (%s != %s) but continuing since they are compatible", 

163 datasetType, 

164 existing, 

165 ) 

166 _dict[datasetType] = combined[existing] 

167 else: 

168 incompatible[datasetType] = existing 

169 else: 

170 missing.add(datasetType) 

171 

172 if missing or incompatible: 

173 reasons = [] 

174 if missing: 

175 reasons.append( 

176 "DatasetTypes {'.'.join(missing)} not present in list of known types: " 

177 + ", ".join(d.name for d in combined) 

178 ) 

179 if incompatible: 

180 for x, y in incompatible.items(): 

181 reasons.append(f"{x} incompatible with {y}") 

182 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

183 

184 return cls(_dict, universe=first.universe) 

185 
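# Illustrative sketch (not part of the original module): fromSubset relies on
# collections.ChainMap giving lookup priority to the first mapping, so a key
# present in several of the chained dictionaries resolves to the value from
# the earliest one. With plain dicts standing in for the _DatasetDict arguments:
#
#     >>> from collections import ChainMap
#     >>> combined = ChainMap({"calexp": "first"}, {"calexp": "rest", "src": "rest"})
#     >>> combined["calexp"]
#     'first'
#     >>> sorted(combined.keys())
#     ['calexp', 'src']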

186 @property 

187 def dimensions(self) -> DimensionGraph: 

188 """The union of all dimensions used by all dataset types in this 

189 dictionary, including implied dependencies (`DimensionGraph`). 

190 """ 

191 base = self.universe.empty 

192 if len(self) == 0: 

193 return base 

194 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

195 

196 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

197 """Unpack nested single-element `DatasetRef` dicts into a new 

198 mapping with `DatasetType` keys and `DatasetRef` values. 

199 

200 This method assumes that each nested dictionary contains exactly one

201 item, as is the case for all "init" datasets.

202 

203 Returns 

204 ------- 

205 dictionary : `NamedKeyDict` 

206 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

207 `DatasetType` instances and string names usable as keys. 

208 """ 

209 

210 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

211 (ref,) = refs.values() 

212 return ref 

213 

214 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

215 

216 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

217 """Unpack nested multi-element `DatasetRef` dicts into a new 

218 mapping with `DatasetType` keys and `list` of `DatasetRef` values.

219 

220 Returns 

221 ------- 

222 dictionary : `NamedKeyDict` 

223 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

224 both `DatasetType` instances and string names usable as keys. 

225 """ 

226 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

227 
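# Illustrative sketch (not part of the original module): the nested layout the
# two unpack methods above operate on, using placeholder strings in place of
# real DatasetType, DataCoordinate, and DatasetRef objects.
#
#     nested = {"packages": {"empty_data_id": "ref0"}}
#     # unpackSingleRefs() -> {"packages": "ref0"}          (exactly one ref per type)
#
#     nested = {"calexp": {"data_id_1": "ref1", "data_id_2": "ref2"}}
#     # unpackMultiRefs()  -> {"calexp": ["ref1", "ref2"]}  (all refs per type)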

228 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

229 """Iterate over the contained `DatasetRef` instances that match the 

230 given `DatasetType` and data IDs. 

231 

232 Parameters 

233 ---------- 

234 datasetType : `DatasetType` 

235 Dataset type to match. 

236 dataIds : `Iterable` [ `DataCoordinate` ] 

237 Data IDs to match. 

238 

239 Returns 

240 ------- 

241 refs : `Iterator` [ `DatasetRef` ] 

242 DatasetRef instances for which ``ref.datasetType == datasetType`` 

243 and ``ref.dataId`` is in ``dataIds``. 

244 """ 

245 refs = self[datasetType] 

246 return (refs[dataId] for dataId in dataIds) 

247 

248 

249class _QuantumScaffolding: 

250 """Helper class aggregating information about a `Quantum`, used when 

251 constructing a `QuantumGraph`. 

252 

253 See `_PipelineScaffolding` for a top-down description of the full 

254 scaffolding data structure. 

255 

256 Parameters 

257 ---------- 

258 task : _TaskScaffolding 

259 Back-reference to the helper object for the `PipelineTask` this quantum 

260 represents an execution of. 

261 dataId : `DataCoordinate` 

262 Data ID for this quantum. 

263 """ 

264 

265 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

266 self.task = task 

267 self.dataId = dataId 

268 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

269 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

270 self.prerequisites = _DatasetDict.fromDatasetTypes( 

271 task.prerequisites.keys(), universe=dataId.universe 

272 ) 

273 

274 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

275 

276 def __repr__(self) -> str: 

277 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

278 

279 task: _TaskScaffolding 

280 """Back-reference to the helper object for the `PipelineTask` this quantum 

281 represents an execution of. 

282 """ 

283 

284 dataId: DataCoordinate 

285 """Data ID for this quantum. 

286 """ 

287 

288 inputs: _DatasetDict 

289 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

290 

291 This is initialized to map each `DatasetType` to an empty dictionary at 

292 construction. Those nested dictionaries are populated (with data IDs as 

293 keys) with unresolved `DatasetRef` instances in 

294 `_PipelineScaffolding.connectDataIds`. 

295 """ 

296 

297 outputs: _DatasetDict 

298 """Nested dictionary containing `DatasetRef` outputs this quantum. 

299 """ 

300 

301 prerequisites: _DatasetDict 

302 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

303 quantum. 

304 """ 

305 

306 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

307 """Transform the scaffolding object into a true `Quantum` instance. 

308 

309 Parameters 

310 ---------- 

311 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

312 If not `None` then fill datastore records in each generated Quantum 

313 using the records from this structure. 

314 

315 Returns 

316 ------- 

317 quantum : `Quantum` 

318 An actual `Quantum` instance. 

319 """ 

320 allInputs = self.inputs.unpackMultiRefs() 

321 allInputs.update(self.prerequisites.unpackMultiRefs()) 

322 # Give the task's Connections class an opportunity to remove some 

323 # inputs, or complain if they are unacceptable. 

324 # This will raise if one of the check conditions is not met, which is 

325 # the intended behavior. 

326 # If it raises NoWorkFound, there is a bug in the QG algorithm

327 # or the adjustQuantum is incorrectly trying to make a prerequisite 

328 # input behave like a regular input; adjustQuantum should only raise 

329 # NoWorkFound if a regular input is missing, and it shouldn't be 

330 # possible for us to have generated ``self`` if that's true. 

331 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

332 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

333 initInputs = self.task.initInputs.unpackSingleRefs() 

334 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

335 if datastore_records is not None: 

336 quantum_records = {} 

337 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

338 input_refs += list(initInputs.values()) 

339 input_ids = set(ref.id for ref in input_refs if ref.id is not None) 

340 for datastore_name, records in datastore_records.items(): 

341 matching_records = records.subset(input_ids) 

342 if matching_records is not None: 

343 quantum_records[datastore_name] = matching_records 

344 return Quantum( 

345 taskName=self.task.taskDef.taskName, 

346 taskClass=self.task.taskDef.taskClass, 

347 dataId=self.dataId, 

348 initInputs=initInputs, 

349 inputs=helper.inputs, 

350 outputs=helper.outputs, 

351 datastore_records=quantum_records, 

352 ) 

353 

354 
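# Illustrative sketch (not part of the original module): one way a caller can
# obtain the ``datastore_records`` mapping consumed by makeQuantum above,
# mirroring what _PipelineScaffolding.makeQuantumGraph does further below.
# The ``datastore``, ``refs``, and ``quantum_scaffolding`` names are
# placeholders assumed to exist in the caller.
#
#     records = datastore.export_records(refs)  # {datastore name: DatastoreRecordData}
#     quantum = quantum_scaffolding.makeQuantum(datastore_records=records)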

355@dataclass 

356class _TaskScaffolding: 

357 """Helper class aggregating information about a `PipelineTask`, used when 

358 constructing a `QuantumGraph`. 

359 

360 See `_PipelineScaffolding` for a top-down description of the full 

361 scaffolding data structure. 

362 

363 Parameters 

364 ---------- 

365 taskDef : `TaskDef` 

366 Data structure that identifies the task class and its config. 

367 parent : `_PipelineScaffolding` 

368 The parent data structure that will hold the instance being 

369 constructed. 

370 datasetTypes : `TaskDatasetTypes` 

371 Data structure that categorizes the dataset types used by this task. 

372 """ 

373 

374 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

375 universe = parent.dimensions.universe 

376 self.taskDef = taskDef 

377 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

378 assert self.dimensions.issubset(parent.dimensions) 

379 # Initialize _DatasetDicts as subsets of the one or two 

380 # corresponding dicts in the parent _PipelineScaffolding. 

381 self.initInputs = _DatasetDict.fromSubset( 

382 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

383 ) 

384 self.initOutputs = _DatasetDict.fromSubset( 

385 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

386 ) 

387 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

388 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

389 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

390 self.dataIds: Set[DataCoordinate] = set() 

391 self.quanta = {} 

392 

393 def __repr__(self) -> str: 

394 # Default dataclass-injected __repr__ gets caught in an infinite loop 

395 # because of back-references. 

396 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

397 

398 taskDef: TaskDef 

399 """Data structure that identifies the task class and its config 

400 (`TaskDef`). 

401 """ 

402 

403 dimensions: DimensionGraph 

404 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

405 """ 

406 

407 initInputs: _DatasetDict 

408 """Dictionary containing information about datasets used to construct this 

409 task (`_DatasetDict`). 

410 """ 

411 

412 initOutputs: _DatasetDict 

413 """Dictionary containing information about datasets produced as a 

414 side-effect of constructing this task (`_DatasetDict`). 

415 """ 

416 

417 inputs: _DatasetDict 

418 """Dictionary containing information about datasets used as regular, 

419 graph-constraining inputs to this task (`_DatasetDict`). 

420 """ 

421 

422 outputs: _DatasetDict 

423 """Dictionary containing information about datasets produced by this task 

424 (`_DatasetDict`). 

425 """ 

426 

427 prerequisites: _DatasetDict 

428 """Dictionary containing information about input datasets that must be 

429 present in the repository before any Pipeline containing this task is run 

430 (`_DatasetDict`). 

431 """ 

432 

433 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

434 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

435 this task with that data ID. 

436 """ 

437 

438 def makeQuantumSet( 

439 self, 

440 unresolvedRefs: Optional[Set[DatasetRef]] = None, 

441 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

442 ) -> Set[Quantum]: 

443 """Create a `set` of `Quantum` from the information in ``self``. 

444 

445 Parameters 

446 ---------- 

447 unresolvedRefs : `set` [ `DatasetRef` ], optional 

448 Input dataset refs that have not been found. 

449 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional

450 If not `None`, fill datastore records in each generated `Quantum`.

451 

452 Returns 

453 ------- 

454 nodes : `set` of `Quantum` 

455 The `Quantum` elements corresponding to this task. 

456 """ 

457 if unresolvedRefs is None: 

458 unresolvedRefs = set() 

459 outputs = set() 

460 for q in self.quanta.values(): 

461 try: 

462 tmpQuanta = q.makeQuantum(datastore_records) 

463 outputs.add(tmpQuanta) 

464 except (NoWorkFound, FileNotFoundError) as exc: 

465 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

466 if unresolvedRefs.intersection(refs): 

467 # This means the node is known to be pruned

468 # later and should be left in even though some follow-up

469 # queries fail. This allows the pruning to start from this

470 # quantum with known issues, and prune other nodes it

471 # touches.

472 inputs = q.inputs.unpackMultiRefs() 

473 inputs.update(q.prerequisites.unpackMultiRefs()) 

474 tmpQuantum = Quantum( 

475 taskName=q.task.taskDef.taskName, 

476 taskClass=q.task.taskDef.taskClass, 

477 dataId=q.dataId, 

478 initInputs=q.task.initInputs.unpackSingleRefs(), 

479 inputs=inputs, 

480 outputs=q.outputs.unpackMultiRefs(), 

481 ) 

482 outputs.add(tmpQuantum) 

483 else: 

484 raise exc 

485 return outputs 

486 

487 

488@dataclass 

489class _PipelineScaffolding: 

490 """A helper data structure that organizes the information involved in 

491 constructing a `QuantumGraph` for a `Pipeline`. 

492 

493 Parameters 

494 ---------- 

495 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

496 Sequence of tasks from which a graph is to be constructed. Must 

497 have nested task classes already imported. 

498 universe : `DimensionUniverse` 

499 Universe of all possible dimensions. 

500 

501 Notes 

502 ----- 

503 The scaffolding data structure contains nested data structures for both 

504 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

505 data structures are shared between the pipeline-level structure (which 

506 aggregates all datasets and categorizes them from the perspective of the 

507 complete pipeline) and the individual tasks that use them as inputs and 

508 outputs. 

509 

510 `QuantumGraph` construction proceeds in four steps, with each corresponding 

511 to a different `_PipelineScaffolding` method: 

512 

513 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

514 the DatasetTypes used by the pipeline (delegating to 

515 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

516 nested `_TaskScaffolding` and `_DatasetDict` objects. 

517 

518 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

519 returns related tuples of all dimensions used to identify any regular 

520 input, output, and intermediate datasets (not prerequisites). We then 

521 iterate over these tuples of related dimensions, identifying the subsets 

522 that correspond to distinct data IDs for each task and dataset type, 

523 and then create `_QuantumScaffolding` objects. 

524 

525 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

526 dataset data IDs previously identified, transforming unresolved 

527 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

528 up prerequisite datasets for all quanta. 

529 

530 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

531 per-task `_QuantumScaffolding` objects. 

532 """ 

533 

534 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry): 

535 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

536 self.tasks = [] 

537 # Aggregate and categorize the DatasetTypes in the Pipeline. 

538 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

539 # Construct dictionaries that map those DatasetTypes to structures 

540 # that will (later) hold additional information about them.

541 for attr in ( 

542 "initInputs", 

543 "initIntermediates", 

544 "initOutputs", 

545 "inputs", 

546 "intermediates", 

547 "outputs", 

548 "prerequisites", 

549 ): 

550 setattr( 

551 self, 

552 attr, 

553 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

554 ) 

555 # Aggregate all dimensions for all non-init, non-prerequisite 

556 # DatasetTypes. These are the ones we'll include in the big join 

557 # query. 

558 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

559 # Construct scaffolding nodes for each Task, and add backreferences 

560 # to the Task from each DatasetScaffolding node. 

561 # Note that there's only one scaffolding node for each DatasetType, 

562 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

563 # reference it. 

564 if isinstance(pipeline, Pipeline): 

565 pipeline = pipeline.toExpandedPipeline() 

566 self.tasks = [ 

567 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

568 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

569 ] 

570 

571 def __repr__(self) -> str: 

572 # Default dataclass-injected __repr__ gets caught in an infinite loop 

573 # because of back-references. 

574 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

575 

576 tasks: List[_TaskScaffolding] 

577 """Scaffolding data structures for each task in the pipeline 

578 (`list` of `_TaskScaffolding`). 

579 """ 

580 

581 initInputs: _DatasetDict 

582 """Datasets consumed but not produced when constructing the tasks in this 

583 pipeline (`_DatasetDict`). 

584 """ 

585 

586 initIntermediates: _DatasetDict 

587 """Datasets that are both consumed and produced when constructing the tasks 

588 in this pipeline (`_DatasetDict`). 

589 """ 

590 

591 initOutputs: _DatasetDict 

592 """Datasets produced but not consumed when constructing the tasks in this 

593 pipeline (`_DatasetDict`). 

594 """ 

595 

596 inputs: _DatasetDict 

597 """Datasets that are consumed but not produced when running this pipeline 

598 (`_DatasetDict`). 

599 """ 

600 

601 intermediates: _DatasetDict 

602 """Datasets that are both produced and consumed when running this pipeline 

603 (`_DatasetDict`). 

604 """ 

605 

606 outputs: _DatasetDict 

607 """Datasets produced but not consumed when when running this pipeline 

608 (`_DatasetDict`). 

609 """ 

610 

611 prerequisites: _DatasetDict 

612 """Datasets that are consumed when running this pipeline and looked up 

613 per-Quantum when generating the graph (`_DatasetDict`). 

614 """ 

615 

616 dimensions: DimensionGraph 

617 """All dimensions used by any regular input, intermediate, or output 

618 (not prerequisite) dataset; the set of dimensions used in the "Big Join

619 Query" (`DimensionGraph`). 

620 

621 This is required to be a superset of all task quantum dimensions. 

622 """ 

623 

624 @contextmanager 

625 def connectDataIds( 

626 self, 

627 registry: Registry, 

628 collections: Any, 

629 userQuery: Optional[str], 

630 externalDataId: DataCoordinate, 

631 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

632 ) -> Iterator[DataCoordinateQueryResults]: 

633 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

634 

635 This method populates `_TaskScaffolding.dataIds` and 

636 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

637 

638 Parameters 

639 ---------- 

640 registry : `lsst.daf.butler.Registry` 

641 Registry for the data repository; used for all data ID queries. 

642 collections 

643 Expressions representing the collections to search for input 

644 datasets. May be any of the types accepted by 

645 `lsst.daf.butler.CollectionSearch.fromExpression`. 

646 userQuery : `str` or `None` 

647 User-provided expression to limit the data IDs processed. 

648 externalDataId : `DataCoordinate` 

649 Externally-provided data ID that should be used to restrict the 

650 results, just as if these constraints had been included via ``AND`` 

651 in ``userQuery``. This includes (at least) any instrument named 

652 in the pipeline definition. 

653 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

654 The query constraint variant that should be used to constrain the

655 query based on dataset existence, defaults to

656 `DatasetQueryConstraintVariant.ALL`. 

657 

658 Returns 

659 ------- 

660 commonDataIds : \ 

661 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

662 An interface to a database temporary table containing all data IDs 

663 that will appear in this `QuantumGraph`. Returned inside a 

664 context manager, which will drop the temporary table at the end of 

665 the `with` block in which this method is called. 

666 """ 

667 _LOG.debug("Building query for data IDs.") 

668 # Initialization datasets always have empty data IDs. 

669 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

670 for datasetType, refs in itertools.chain( 

671 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

672 ): 

673 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

674 # Run one big query for the data IDs for task dimensions and regular 

675 # inputs and outputs. We limit the query to only dimensions that are 

676 # associated with the input dataset types, but don't (yet) try to 

677 # obtain the dataset_ids for those inputs. 

678 _LOG.debug("Submitting data ID query and materializing results.") 

679 queryArgs: Dict[str, Any] = { 

680 "dimensions": self.dimensions, 

681 "where": userQuery, 

682 "dataId": externalDataId, 

683 } 

684 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

685 _LOG.debug("Constraining graph query using all datasets in pipeline.") 

686 queryArgs["datasets"] = list(self.inputs) 

687 queryArgs["collections"] = collections 

688 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

689 _LOG.debug("Not using dataset existence to constrain query.") 

690 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

691 constraint = set(datasetQueryConstraint) 

692 inputs = {k.name: k for k in self.inputs.keys()} 

693 if remainder := constraint.difference(inputs.keys()): 

694 raise ValueError( 

695 f"{remainder} dataset type(s) specified as a graph constraint, but" 

696 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

697 ) 

698 _LOG.debug("Constraining graph query using %s", constraint)

699 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

700 queryArgs["collections"] = collections 

701 else: 

702 raise ValueError( 

703 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

704 ) 

705 

706 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

707 _LOG.debug("Expanding data IDs.") 

708 commonDataIds = commonDataIds.expanded() 

709 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

710 # Iterate over query results, populating data IDs for datasets and 

711 # quanta and then connecting them to each other. 

712 n = -1 

713 for n, commonDataId in enumerate(commonDataIds): 

714 # Create DatasetRefs for all DatasetTypes from this result row, 

715 # noting that we might have created some already. 

716 # We remember both those that already existed and those that we 

717 # create now. 

718 refsForRow = {} 

719 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

720 for datasetType, refs in itertools.chain( 

721 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

722 ): 

723 datasetDataId: Optional[DataCoordinate] 

724 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

725 datasetDataId = commonDataId.subset(datasetType.dimensions) 

726 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

727 ref = refs.get(datasetDataId) 

728 if ref is None: 

729 ref = DatasetRef(datasetType, datasetDataId) 

730 refs[datasetDataId] = ref 

731 refsForRow[datasetType.name] = ref 

732 # Create _QuantumScaffolding objects for all tasks from this 

733 # result row, noting that we might have created some already. 

734 for task in self.tasks: 

735 quantumDataId = commonDataId.subset(task.dimensions) 

736 quantum = task.quanta.get(quantumDataId) 

737 if quantum is None: 

738 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

739 task.quanta[quantumDataId] = quantum 

740 # Whether this is a new quantum or an existing one, we can 

741 # now associate the DatasetRefs for this row with it. The 

742 # fact that a Quantum data ID and a dataset data ID both 

743 # came from the same result row is what tells us they 

744 # should be associated. 

745 # Many of these associations will be duplicates (because

746 # another query row that differed from this one only in

747 # irrelevant dimensions already added them), and we rely

748 # on the dictionary keys to skip them.

749 for datasetType in task.inputs: 

750 ref = refsForRow[datasetType.name] 

751 quantum.inputs[datasetType.name][ref.dataId] = ref 

752 for datasetType in task.outputs: 

753 ref = refsForRow[datasetType.name] 

754 quantum.outputs[datasetType.name][ref.dataId] = ref 

755 if n < 0: 

756 emptiness_explained = False 

757 for message in commonDataIds.explain_no_results(): 

758 _LOG.warning(message) 

759 emptiness_explained = True 

760 if not emptiness_explained: 

761 _LOG.warning( 

762 "To reproduce this query for debugging purposes, run " 

763 "Registry.queryDataIds with these arguments:" 

764 ) 

765 # We could just repr() the queryArgs dict to get something 

766 # the user could make sense of, but it's friendlier to 

767 # put these args in an easier-to-construct equivalent form 

768 # so they can read it more easily and copy and paste into 

769 # a Python terminal. 

770 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

771 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName()) 

772 if queryArgs["where"]: 

773 _LOG.warning(" where=%s,", repr(queryArgs["where"])) 

774 if "datasets" in queryArgs: 

775 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

776 if "collections" in queryArgs: 

777 _LOG.warning(" collections=%s,", list(queryArgs["collections"])) 

778 _LOG.debug("Finished processing %d rows from data ID query.", n) 

779 yield commonDataIds 

780 
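# Illustrative sketch (not part of the original module): the intended calling
# pattern for connectDataIds and resolveDatasetRefs, mirroring
# GraphBuilder.makeGraph at the bottom of this file. The query results are
# only usable inside the ``with`` block, because the backing temporary table
# is dropped when the block exits.
#
#     scaffolding = _PipelineScaffolding(pipeline, registry=registry)
#     with scaffolding.connectDataIds(registry, collections, userQuery, dataId) as commonDataIds:
#         scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
#         graph = scaffolding.makeQuantumGraph()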

781 def resolveDatasetRefs( 

782 self, 

783 registry: Registry, 

784 collections: Any, 

785 run: Optional[str], 

786 commonDataIds: DataCoordinateQueryResults, 

787 *, 

788 skipExistingIn: Any = None, 

789 clobberOutputs: bool = True, 

790 constrainedByAllDatasets: bool = True, 

791 ) -> None: 

792 """Perform follow up queries for each dataset data ID produced in 

793 `fillDataIds`. 

794 

795 This method populates `_DatasetScaffolding.refs` (except for those in 

796 `prerequisites`). 

797 

798 Parameters 

799 ---------- 

800 registry : `lsst.daf.butler.Registry` 

801 Registry for the data repository; used for all data ID queries. 

802 collections 

803 Expressions representing the collections to search for input 

804 datasets. May be any of the types accepted by 

805 `lsst.daf.butler.CollectionSearch.fromExpression`. 

806 run : `str`, optional 

807 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

808 output datasets, if it already exists. 

809 commonDataIds : \ 

810 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

811 Result of a previous call to `connectDataIds`. 

812 skipExistingIn 

813 Expressions representing the collections to search for existing 

814 output datasets that should be skipped. May be any of the types 

815 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

816 `None` or empty string/sequence disables skipping. 

817 clobberOutputs : `bool`, optional 

818 If `True` (default), allow quanta to be created even if outputs exist;

819 this requires the same behavior to be enabled when

820 executing. If ``skipExistingIn`` is not `None`, completed quanta 

821 (those with metadata, or all outputs if there is no metadata 

822 dataset configured) will be skipped rather than clobbered. 

823 constrainedByAllDatasets : `bool`, optional 

824 Indicates if the commonDataIds were generated with a constraint on 

825 all dataset types. 

826 

827 Raises 

828 ------ 

829 OutputExistsError 

830 Raised if an output dataset already exists in the output run 

831 and ``skipExistingIn`` does not include output run, or if only 

832 some outputs are present and ``clobberOutputs`` is `False`. 

833 """ 

834 skipCollections: Optional[CollectionSearch] = None 

835 skipExistingInRun = False 

836 if skipExistingIn: 

837 skipCollections = CollectionSearch.fromExpression(skipExistingIn) 

838 if run: 

839 # As an optimization, check the explicit list of names first.

840 skipExistingInRun = run in skipCollections.explicitNames() 

841 if not skipExistingInRun: 

842 # need to flatten it and check again 

843 skipExistingInRun = run in registry.queryCollections( 

844 skipExistingIn, 

845 collectionTypes=CollectionType.RUN, 

846 ) 

847 

848 # Look up [init] intermediate and output datasets in the output 

849 # collection, if there is an output collection. 

850 if run is not None or skipCollections is not None: 

851 for datasetType, refs in itertools.chain( 

852 self.initIntermediates.items(), 

853 self.initOutputs.items(), 

854 self.intermediates.items(), 

855 self.outputs.items(), 

856 ): 

857 _LOG.debug( 

858 "Resolving %d datasets for intermediate and/or output dataset %s.", 

859 len(refs), 

860 datasetType.name, 

861 ) 

862 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

863 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

864 

865 # look at RUN collection first 

866 if run is not None: 

867 resolvedRefQueryResults = subset.findDatasets( 

868 datasetType, collections=run, findFirst=True 

869 ) 

870 for resolvedRef in resolvedRefQueryResults: 

871 # TODO: we could easily support per-DatasetType 

872 # skipExisting and I could imagine that being useful - 

873 # it's probably required in order to support writing 

874 # initOutputs before QuantumGraph generation. 

875 assert resolvedRef.dataId in refs 

876 if not (skipExistingInRun or isInit or clobberOutputs): 

877 raise OutputExistsError( 

878 f"Output dataset {datasetType.name} already exists in " 

879 f"output RUN collection '{run}' with data ID" 

880 f" {resolvedRef.dataId}." 

881 ) 

882 

883 # Also check skipExistingIn; if the RUN collection is in

884 # it, that case is already handled above.

885 if skipCollections is not None: 

886 resolvedRefQueryResults = subset.findDatasets( 

887 datasetType, collections=skipCollections, findFirst=True 

888 ) 

889 for resolvedRef in resolvedRefQueryResults: 

890 assert resolvedRef.dataId in refs 

891 refs[resolvedRef.dataId] = resolvedRef 

892 

893 # Look up input and initInput datasets in the input collection(s). 

894 # Container to accumulate unfound refs, in case the common data IDs were

895 # not constrained on dataset type existence.

896 self.unfoundRefs = set() 

897 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

898 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

899 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets( 

900 datasetType, collections=collections, findFirst=True 

901 ) 

902 dataIdsNotFoundYet = set(refs.keys()) 

903 for resolvedRef in resolvedRefQueryResults: 

904 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

905 refs[resolvedRef.dataId] = resolvedRef 

906 if dataIdsNotFoundYet: 

907 if constrainedByAllDatasets: 

908 raise RuntimeError( 

909 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

910 f"'{datasetType.name}' was/were present in a previous " 

911 f"query, but could not be found now." 

912 f"This is either a logic bug in QuantumGraph generation " 

913 f"or the input collections have been modified since " 

914 f"QuantumGraph generation began." 

915 ) 

916 else: 

917 # If the common data IDs were not constrained using all the

918 # input dataset types, it is possible that some data IDs

919 # found don't correspond to existing datasets and will

920 # remain unresolved. Mark these for later pruning from

921 # the quantum graph.

922 for k in dataIdsNotFoundYet: 

923 self.unfoundRefs.add(refs[k]) 

924 

925 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

926 # replacing the unresolved refs there, and then look up prerequisites. 

927 for task in self.tasks: 

928 _LOG.debug( 

929 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

930 len(task.quanta), 

931 task.taskDef.label, 

932 ) 

933 # The way iterConnections is designed makes it impossible to 

934 # annotate precisely enough to satisfy MyPy here. 

935 lookupFunctions = { 

936 c.name: c.lookupFunction # type: ignore 

937 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

938 if c.lookupFunction is not None # type: ignore 

939 } 

940 dataIdsFailed = [] 

941 dataIdsSucceeded = [] 

942 for quantum in task.quanta.values(): 

943 # Process output datasets only if skipExistingIn is not None

944 # or there is a run to look for outputs in and clobberOutputs 

945 # is True. Note that if skipExistingIn is None, any output 

946 # datasets that already exist would have already caused an 

947 # exception to be raised. We never update the DatasetRefs in 

948 # the quantum because those should never be resolved. 

949 if skipCollections is not None or (run is not None and clobberOutputs): 

950 resolvedRefs = [] 

951 unresolvedRefs = [] 

952 haveMetadata = False 

953 for datasetType, originalRefs in quantum.outputs.items(): 

954 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

955 if ref.id is not None: 

956 resolvedRefs.append(ref) 

957 if datasetType.name == task.taskDef.metadataDatasetName: 

958 haveMetadata = True 

959 else: 

960 unresolvedRefs.append(ref) 

961 if resolvedRefs: 

962 if haveMetadata or not unresolvedRefs: 

963 dataIdsSucceeded.append(quantum.dataId) 

964 if skipCollections is not None: 

965 continue 

966 else: 

967 dataIdsFailed.append(quantum.dataId) 

968 if not clobberOutputs: 

969 raise OutputExistsError( 

970 f"Quantum {quantum.dataId} of task with label " 

971 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

972 f"({resolvedRefs}) " 

973 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

974 "and clobbering outputs was not enabled." 

975 ) 

976 # Update the input DatasetRefs to the resolved ones we already 

977 # searched for. 

978 for datasetType, input_refs in quantum.inputs.items(): 

979 for ref in task.inputs.extract(datasetType, input_refs.keys()): 

980 input_refs[ref.dataId] = ref 

981 # Look up prerequisite datasets in the input collection(s). 

982 # These may have dimensions that extend beyond those we queried 

983 # for originally, because we want to permit those data ID 

984 # values to differ across quanta and dataset types. 

985 for datasetType in task.prerequisites: 

986 lookupFunction = lookupFunctions.get(datasetType.name) 

987 if lookupFunction is not None: 

988 # PipelineTask has provided its own function to do the 

989 # lookup. This always takes precedence. 

990 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

991 elif ( 

992 datasetType.isCalibration() 

993 and datasetType.dimensions <= quantum.dataId.graph 

994 and quantum.dataId.graph.temporal 

995 ): 

996 # This is a master calibration lookup, which we have to 

997 # handle specially because the query system can't do a 

998 # temporal join on a non-dimension-based timespan yet. 

999 timespan = quantum.dataId.timespan 

1000 try: 

1001 prereq_refs = [ 

1002 registry.findDataset( 

1003 datasetType, quantum.dataId, collections=collections, timespan=timespan 

1004 ) 

1005 ] 

1006 except KeyError: 

1007 # This dataset type is not present in the registry, 

1008 # which just means there are no datasets here. 

1009 prereq_refs = [] 

1010 else: 

1011 # Most general case. 

1012 prereq_refs = list( 

1013 registry.queryDatasets( 

1014 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True 

1015 ).expanded() 

1016 ) 

1017 quantum.prerequisites[datasetType].update( 

1018 {ref.dataId: ref for ref in prereq_refs if ref is not None} 

1019 ) 

1020 # Actually remove any quanta that we decided to skip above. 

1021 if dataIdsSucceeded: 

1022 if skipCollections is not None: 

1023 _LOG.debug( 

1024 "Pruning successful %d quanta for task with label '%s' because all of their " 

1025 "outputs exist or metadata was written successfully.", 

1026 len(dataIdsSucceeded), 

1027 task.taskDef.label, 

1028 ) 

1029 for dataId in dataIdsSucceeded: 

1030 del task.quanta[dataId] 

1031 elif clobberOutputs: 

1032 _LOG.info( 

1033 "Found %d successful quanta for task with label '%s' " 

1034 "that will need to be clobbered during execution.", 

1035 len(dataIdsSucceeded), 

1036 task.taskDef.label, 

1037 ) 

1038 else: 

1039 raise AssertionError("OutputExistsError should have already been raised.") 

1040 if dataIdsFailed: 

1041 if clobberOutputs: 

1042 _LOG.info( 

1043 "Found %d failed/incomplete quanta for task with label '%s' " 

1044 "that will need to be clobbered during execution.", 

1045 len(dataIdsFailed), 

1046 task.taskDef.label, 

1047 ) 

1048 else: 

1049 raise AssertionError("OutputExistsError should have already been raised.") 

1050 

1051 def makeQuantumGraph( 

1052 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None 

1053 ) -> QuantumGraph: 

1054 """Create a `QuantumGraph` from the quanta already present in 

1055 the scaffolding data structure. 

1056 

1057 Parameters 

1058 ----------

1059 metadata : Optional Mapping of `str` to primitives 

1060 This is an optional parameter of extra data to carry with the 

1061 graph. Entries in this mapping should be able to be serialized in 

1062 JSON. 

1063 datastore : `Datastore`, optional 

1064 If not `None` then fill datastore records in each generated 

1065 Quantum. 

1066 

1067 Returns 

1068 ------- 

1069 graph : `QuantumGraph` 

1070 The full `QuantumGraph`. 

1071 """ 

1072 

1073 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1074 """Extract all DatasetRefs from the dictionaries""" 

1075 for ref_dict in dataset_dict.values(): 

1076 yield from ref_dict.values() 

1077 

1078 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1079 if datastore is not None: 

1080 datastore_records = datastore.export_records( 

1081 itertools.chain( 

1082 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites) 

1083 ) 

1084 ) 

1085 

1086 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

1087 for task in self.tasks: 

1088 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records) 

1089 graphInput[task.taskDef] = qset 

1090 

1091 graph = QuantumGraph( 

1092 graphInput, metadata=metadata, pruneRefs=self.unfoundRefs, universe=self.dimensions.universe 

1093 ) 

1094 return graph 

1095 

1096 

1097# ------------------------ 

1098# Exported definitions -- 

1099# ------------------------ 

1100 

1101 

1102class GraphBuilderError(Exception): 

1103 """Base class for exceptions generated by graph builder.""" 

1104 

1105 pass 

1106 

1107 

1108class OutputExistsError(GraphBuilderError): 

1109 """Exception generated when output datasets already exist.""" 

1110 

1111 pass 

1112 

1113 

1114class PrerequisiteMissingError(GraphBuilderError): 

1115 """Exception generated when a prerequisite dataset does not exist.""" 

1116 

1117 pass 

1118 

1119 

1120class GraphBuilder: 

1121 """GraphBuilder class is responsible for building task execution graph from 

1122 a Pipeline. 

1123 

1124 Parameters 

1125 ---------- 

1126 registry : `~lsst.daf.butler.Registry` 

1127 Data butler instance. 

1128 skipExistingIn 

1129 Expressions representing the collections to search for existing 

1130 output datasets that should be skipped. May be any of the types 

1131 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

1132 clobberOutputs : `bool`, optional 

1133 If `True` (default), allow quanta to be created even if partial outputs

1134 exist; this requires the same behavior to be enabled when

1135 executing. 

1136 datastore : `Datastore`, optional 

1137 If not `None` then fill datastore records in each generated Quantum. 

1138 """ 

1139 

1140 def __init__( 

1141 self, 

1142 registry: Registry, 

1143 skipExistingIn: Any = None, 

1144 clobberOutputs: bool = True, 

1145 datastore: Optional[Datastore] = None, 

1146 ): 

1147 self.registry = registry 

1148 self.dimensions = registry.dimensions 

1149 self.skipExistingIn = skipExistingIn 

1150 self.clobberOutputs = clobberOutputs 

1151 self.datastore = datastore 

1152 

1153 def makeGraph( 

1154 self, 

1155 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1156 collections: Any, 

1157 run: Optional[str], 

1158 userQuery: Optional[str], 

1159 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1160 metadata: Optional[Mapping[str, Any]] = None, 

1161 ) -> QuantumGraph: 

1162 """Create execution graph for a pipeline. 

1163 

1164 Parameters 

1165 ---------- 

1166 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1167 Pipeline definition, task names/classes and their configs. 

1168 collections 

1169 Expressions representing the collections to search for input 

1170 datasets. May be any of the types accepted by 

1171 `lsst.daf.butler.CollectionSearch.fromExpression`. 

1172 run : `str`, optional 

1173 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1174 output datasets, if it already exists. 

1175 userQuery : `str` or `None`

1176 String which defines a user-defined selection for the registry; should

1177 be empty or `None` if there are no restrictions on data selection.

1178 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1179 The query constraint variant that should be used to constrain the

1180 query based on dataset existence, defaults to

1181 `DatasetQueryConstraintVariant.ALL`. 

1182 metadata : Optional Mapping of `str` to primitives 

1183 This is an optional parameter of extra data to carry with the 

1184 graph. Entries in this mapping should be able to be serialized in 

1185 JSON. 

1186 

1187 Returns 

1188 ------- 

1189 graph : `QuantumGraph` 

1190 The full `QuantumGraph`.

1191 Raises 

1192 ------ 

1193 UserExpressionError 

1194 Raised when user expression cannot be parsed. 

1195 OutputExistsError 

1196 Raised when output datasets already exist. 

1197 Exception 

1198 Other exceptions types may be raised by underlying registry 

1199 classes. 

1200 """ 

1201 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1202 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1203 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1204 instrument_class: Optional[Any] = None 

1205 if isinstance(pipeline, Pipeline): 

1206 instrument_class_name = pipeline.getInstrument() 

1207 if instrument_class_name is not None: 

1208 instrument_class = doImportType(instrument_class_name) 

1209 pipeline = list(pipeline.toExpandedPipeline()) 

1210 if instrument_class is not None: 

1211 dataId = DataCoordinate.standardize( 

1212 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1213 ) 

1214 else: 

1215 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1216 with scaffolding.connectDataIds( 

1217 self.registry, collections, userQuery, dataId, datasetQueryConstraint 

1218 ) as commonDataIds: 

1219 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1220 scaffolding.resolveDatasetRefs( 

1221 self.registry, 

1222 collections, 

1223 run, 

1224 commonDataIds, 

1225 skipExistingIn=self.skipExistingIn, 

1226 clobberOutputs=self.clobberOutputs, 

1227 constrainedByAllDatasets=condition, 

1228 ) 

1229 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)
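# Illustrative sketch (not part of the original module): typical use of
# GraphBuilder against an existing data repository. The repository path,
# collection names, run name, and query string below are placeholders.
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo")  # hypothetical repository
#     builder = GraphBuilder(butler.registry, skipExistingIn=None, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,                                          # Pipeline or iterable of TaskDef
#         collections=["HSC/defaults"],                      # hypothetical input collections
#         run="u/someone/test-run",                          # hypothetical output RUN collection
#         userQuery="instrument = 'HSC' AND visit = 12345",  # hypothetical data ID constraint
#     )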