Coverage for python/lsst/pipe/base/graphBuilder.py: 15%

544 statements  

coverage.py v7.2.5, created at 2023-05-12 02:03 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap, defaultdict 

34from collections.abc import Collection, Iterable, Iterator, Mapping 

35from contextlib import contextmanager 

36from dataclasses import dataclass 

37from typing import Any, Optional 

38 

39from lsst.daf.butler import ( 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 NamedValueSet, 

50 Quantum, 

51 Registry, 

52) 

53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

55from lsst.daf.butler.registry.wildcards import CollectionWildcard 

56from lsst.utils import doImportType 

57 

58from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

59from ._status import NoWorkFound 

60 

61# ----------------------------- 

62# Imports for other modules -- 

63# ----------------------------- 

64from .connections import AdjustQuantumHelper, iterConnections 

65from .graph import QuantumGraph 

66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

67 

68# ---------------------------------- 

69# Local non-exported definitions -- 

70# ---------------------------------- 

71 

72_LOG = logging.getLogger(__name__) 

73 

74 

75@dataclass 

76class _RefHolder: 

77 """Placeholder for `DatasetRef` representing a future resolved reference. 

78 

79 Because unresolved DatasetRefs have been eliminated, `None` is now used to

80 represent a reference that has yet to be resolved. Information about its

81 corresponding dataset type and data coordinate is stored in the `_DatasetDict` mapping.

82 """ 

83 

84 dataset_type: DatasetType 

85 """Dataset type of the dataset to be created later. I need to store it here 

86 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared 

87 between different compatible dataset types.""" 

88 

89 ref: DatasetRef | None = None 

90 """Dataset reference, initially `None`, created when all datasets are 

91 resolved. 

92 """ 

93 

94 @property 

95 def resolved_ref(self) -> DatasetRef: 

96 """Access resolved reference, should only be called after the 

97 reference is set (`DatasetRef`).""" 

98 assert self.ref is not None, "Dataset reference is not set." 

99 return self.ref 

100 
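# A minimal sketch of the intended _RefHolder lifecycle (illustrative only;
# `dataset_type`, `data_id`, and "some_run" are placeholders): a holder starts
# out unresolved and is later assigned a resolved DatasetRef, after which
# `resolved_ref` becomes safe to use.
#
#     holder = _RefHolder(dataset_type)       # created while walking data IDs
#     assert holder.ref is None               # not resolved yet
#     holder.ref = DatasetRef(dataset_type, data_id, run="some_run", conform=False)
#     ref = holder.resolved_ref               # safe now; asserts if still None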

101 

102class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]): 

103 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

104 the known `DatasetRef` instances of that type. 

105 

106 Parameters 

107 ---------- 

108 args 

109 Positional arguments are forwarded to the `dict` constructor. 

110 universe : `DimensionUniverse` 

111 Universe of all possible dimensions. 

112 """ 

113 

114 def __init__(self, *args: Any, universe: DimensionUniverse): 

115 super().__init__(*args) 

116 self.universe = universe 

117 

118 @classmethod 

119 def fromDatasetTypes( 

120 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

121 ) -> _DatasetDict: 

122 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

123 

124 Parameters 

125 ---------- 

126 datasetTypes : `iterable` of `DatasetType` 

127 DatasetTypes to use as keys for the dict. Values will be empty 

128 dictionaries. 

129 universe : `DimensionUniverse` 

130 Universe of all possible dimensions. 

131 

132 Returns 

133 ------- 

134 dictionary : `_DatasetDict` 

135 A new `_DatasetDict` instance. 

136 """ 

137 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

138 

139 @classmethod 

140 def fromSubset( 

141 cls, 

142 datasetTypes: Collection[DatasetType], 

143 first: _DatasetDict, 

144 *rest: _DatasetDict, 

145 ) -> _DatasetDict: 

146 """Return a new dictionary by extracting items corresponding to the 

147 given keys from one or more existing dictionaries. 

148 

149 Parameters 

150 ---------- 

151 datasetTypes : `iterable` of `DatasetType` 

152 DatasetTypes to use as keys for the dict. Values will be obtained 

153 by lookups against ``first`` and ``rest``. 

154 first : `_DatasetDict` 

155 Another dictionary from which to extract values. 

156 rest 

157 Additional dictionaries from which to extract values. 

158 

159 Returns 

160 ------- 

161 dictionary : `_DatasetDict` 

162 A new dictionary instance. 

163 """ 

164 combined = ChainMap(first, *rest) 

165 

166 # Dataset types known to match immediately can be processed 

167 # without checks. 

168 matches = combined.keys() & set(datasetTypes) 

169 _dict = {k: combined[k] for k in matches} 

170 

171 if len(_dict) < len(datasetTypes): 

172 # Work out which ones are missing. 

173 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

174 

175 # Get the known names for comparison. 

176 combined_by_name = {k.name: k for k in combined} 

177 

178 missing = set() 

179 incompatible = {} 

180 for datasetType in missing_datasetTypes: 

181 # The dataset type is not found. It may not be listed 

182 # or it may be that it is there with the same name 

183 # but different definition. 

184 if datasetType.name in combined_by_name: 

185 # This implies some inconsistency in definitions 

186 # for connections. If there is support for storage 

187 # class conversion we can let it slide. 

188 # At this point we do not know 

189 # where the inconsistency is but trust that down 

190 # stream code will be more explicit about input 

191 # vs output incompatibilities. 

192 existing = combined_by_name[datasetType.name] 

193 convertible_to_existing = existing.is_compatible_with(datasetType) 

194 convertible_from_existing = datasetType.is_compatible_with(existing) 

195 if convertible_to_existing and convertible_from_existing: 

196 _LOG.debug( 

197 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

198 datasetType.name, 

199 datasetType.storageClass_name, 

200 existing.storageClass_name, 

201 ) 

202 _dict[datasetType] = combined[existing] 

203 elif convertible_to_existing or convertible_from_existing: 

204 # We'd need to refactor a fair amount to recognize 

205 # whether this is an error or not, so I'm not going to 

206 # bother until we need to do that for other reasons 

207 # (it won't be too long). 

208 _LOG.info( 

209 "Dataset type %s is present with multiple only partially-compatible storage " 

210 "classes %s and %s.", 

211 datasetType.name, 

212 datasetType.storageClass_name, 

213 existing.storageClass_name, 

214 ) 

215 _dict[datasetType] = combined[existing] 

216 else: 

217 incompatible[datasetType] = existing 

218 else: 

219 missing.add(datasetType) 

220 

221 if missing or incompatible: 

222 reasons = [] 

223 if missing: 

224 reasons.append( 

225 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known " 

226 f"types: [{', '.join(d.name for d in combined)}]." 

227 ) 

228 if incompatible: 

229 for x, y in incompatible.items(): 

230 reasons.append(f"{x} incompatible with {y}") 

231 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

232 

233 return cls(_dict, universe=first.universe) 

234 
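    # Illustrative sketch (hypothetical names): `fromSubset` matches by dataset
    # type name even when storage classes differ, as long as the definitions
    # are compatible, so a task-level variant of a dataset type with a
    # convertible storage class still picks up the pipeline-level entry.
    #
    #     subset = _DatasetDict.fromSubset([task_variant_type], pipeline_level_dict)
    #     holders = subset[task_variant_type]   # values shared with pipeline_level_dict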

235 @property 

236 def dimensions(self) -> DimensionGraph: 

237 """The union of all dimensions used by all dataset types in this 

238 dictionary, including implied dependencies (`DimensionGraph`). 

239 """ 

240 base = self.universe.empty 

241 if len(self) == 0: 

242 return base 

243 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

244 

245 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

246 """Unpack nested single-element `DatasetRef` dicts into a new 

247 mapping with `DatasetType` keys and `DatasetRef` values. 

248 

249 This method assumes that each nest contains exactly one item, as is the 

250 case for all "init" datasets. 

251 

252 Returns 

253 ------- 

254 dictionary : `NamedKeyDict` 

255 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

256 `DatasetType` instances and string names usable as keys. 

257 """ 

258 

259 def getOne(refs: dict[DataCoordinate, _RefHolder]) -> DatasetRef: 

260 (holder,) = refs.values() 

261 return holder.resolved_ref 

262 

263 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

264 

265 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

266 """Unpack nested multi-element `DatasetRef` dicts into a new 

267 mapping with `DatasetType` keys and `list` of `DatasetRef` values.

268 

269 Returns 

270 ------- 

271 dictionary : `NamedKeyDict` 

272 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

273 both `DatasetType` instances and string names usable as keys. 

274 """ 

275 return NamedKeyDict( 

276 { 

277 datasetType: list(holder.resolved_ref for holder in refs.values()) 

278 for datasetType, refs in self.items() 

279 } 

280 ) 

281 
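    # Illustrative sketch (placeholder names) of the two unpack methods above:
    # `unpackSingleRefs` assumes exactly one ref per dataset type (the "init"
    # dataset case), while `unpackMultiRefs` keeps every resolved ref.
    #
    #     single = dataset_dict.unpackSingleRefs()   # DatasetType -> DatasetRef
    #     multi = dataset_dict.unpackMultiRefs()     # DatasetType -> list[DatasetRef]
    #     refs = multi["calexp"]                     # string-name lookup also works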

282 def extract( 

283 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

284 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]: 

285 """Iterate over the contained `DatasetRef` instances that match the 

286 given `DatasetType` and data IDs. 

287 

288 Parameters 

289 ---------- 

290 datasetType : `DatasetType` 

291 Dataset type to match. 

292 dataIds : `Iterable` [ `DataCoordinate` ] 

293 Data IDs to match. 

294 

295 Returns 

296 ------- 

297 refs : `Iterator` [ `DatasetRef` ] 

298 DatasetRef instances for which ``ref.datasetType == datasetType`` 

299 and ``ref.dataId`` is in ``dataIds``. 

300 """ 

301 refs = self[datasetType] 

302 return ((dataId, refs[dataId].ref) for dataId in dataIds) 

303 

304 def isdisjoint(self, other: _DatasetDict) -> bool: 

305 """Test whether ``self`` and ``other`` have any datasets in common. 

306 

307 Datasets are considered in common if they have the same *parent* 

308 dataset type name and data ID; storage classes and components are not 

309 considered. 

310 """ 

311 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()} 

312 for k, v in other.items(): 

313 parent_name, _ = k.nameAndComponent() 

314 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()): 

315 return False 

316 return True 

317 

318 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

319 """Iterate over all DatasetRef instances held by this data structure, 

320 assuming that each `_RefHolder` already carries a resolved ref.

321 """ 

322 for holders_by_data_id in self.values(): 

323 for holder in holders_by_data_id.values(): 

324 yield holder.resolved_ref 

325 

326 

327class _QuantumScaffolding: 

328 """Helper class aggregating information about a `Quantum`, used when 

329 constructing a `QuantumGraph`. 

330 

331 See `_PipelineScaffolding` for a top-down description of the full 

332 scaffolding data structure. 

333 

334 Parameters 

335 ---------- 

336 task : _TaskScaffolding 

337 Back-reference to the helper object for the `PipelineTask` this quantum 

338 represents an execution of. 

339 dataId : `DataCoordinate` 

340 Data ID for this quantum. 

341 """ 

342 

343 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

344 self.task = task 

345 self.dataId = dataId 

346 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

347 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

348 self.prerequisites = _DatasetDict.fromDatasetTypes( 

349 task.prerequisites.keys(), universe=dataId.universe 

350 ) 

351 

352 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

353 

354 def __repr__(self) -> str: 

355 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

356 

357 task: _TaskScaffolding 

358 """Back-reference to the helper object for the `PipelineTask` this quantum 

359 represents an execution of. 

360 """ 

361 

362 dataId: DataCoordinate 

363 """Data ID for this quantum. 

364 """ 

365 

366 inputs: _DatasetDict 

367 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

368 

369 This is initialized to map each `DatasetType` to an empty dictionary at 

370 construction. Those nested dictionaries are populated (with data IDs as 

371 keys) with unresolved `DatasetRef` instances in 

372 `_PipelineScaffolding.connectDataIds`. 

373 """ 

374 

375 outputs: _DatasetDict 

376 """Nested dictionary containing `DatasetRef` outputs this quantum. 

377 """ 

378 

379 prerequisites: _DatasetDict 

380 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

381 quantum. 

382 """ 

383 

384 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

385 """Transform the scaffolding object into a true `Quantum` instance. 

386 

387 Parameters 

388 ---------- 

389 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

390 If not `None` then fill datastore records in each generated Quantum 

391 using the records from this structure. 

392 

393 Returns 

394 ------- 

395 quantum : `Quantum` 

396 An actual `Quantum` instance. 

397 """ 

398 allInputs = self.inputs.unpackMultiRefs() 

399 allInputs.update(self.prerequisites.unpackMultiRefs()) 

400 # Give the task's Connections class an opportunity to remove some 

401 # inputs, or complain if they are unacceptable. 

402 # This will raise if one of the check conditions is not met, which is 

403 # the intended behavior. 

404 # If it raises NoWorkFound, there is a bug in the QG algorithm

405 # or adjustQuantum is incorrectly trying to make a prerequisite

406 # input behave like a regular input; adjustQuantum should only raise 

407 # NoWorkFound if a regular input is missing, and it shouldn't be 

408 # possible for us to have generated ``self`` if that's true. 

409 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

410 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

411 initInputs = self.task.initInputs.unpackSingleRefs() 

412 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

413 if datastore_records is not None: 

414 quantum_records = {} 

415 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

416 input_refs += list(initInputs.values()) 

417 input_ids = set(ref.id for ref in input_refs if ref.id is not None) 

418 for datastore_name, records in datastore_records.items(): 

419 matching_records = records.subset(input_ids) 

420 if matching_records is not None: 

421 quantum_records[datastore_name] = matching_records 

422 return Quantum( 

423 taskName=self.task.taskDef.taskName, 

424 taskClass=self.task.taskDef.taskClass, 

425 dataId=self.dataId, 

426 initInputs=initInputs, 

427 inputs=helper.inputs, 

428 outputs=helper.outputs, 

429 datastore_records=quantum_records, 

430 ) 

431 

432 

433@dataclass 

434class _TaskScaffolding: 

435 """Helper class aggregating information about a `PipelineTask`, used when 

436 constructing a `QuantumGraph`. 

437 

438 See `_PipelineScaffolding` for a top-down description of the full 

439 scaffolding data structure. 

440 

441 Parameters 

442 ---------- 

443 taskDef : `TaskDef` 

444 Data structure that identifies the task class and its config. 

445 parent : `_PipelineScaffolding` 

446 The parent data structure that will hold the instance being 

447 constructed. 

448 datasetTypes : `TaskDatasetTypes` 

449 Data structure that categorizes the dataset types used by this task. 

450 """ 

451 

452 def __init__( 

453 self, 

454 taskDef: TaskDef, 

455 parent: _PipelineScaffolding, 

456 datasetTypes: TaskDatasetTypes, 

457 ): 

458 universe = parent.dimensions.universe 

459 self.taskDef = taskDef 

460 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

461 assert self.dimensions.issubset(parent.dimensions) 

462 # Initialize _DatasetDicts as subsets of the one or two 

463 # corresponding dicts in the parent _PipelineScaffolding. 

464 self.initInputs = _DatasetDict.fromSubset( 

465 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

466 ) 

467 self.initOutputs = _DatasetDict.fromSubset( 

468 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

469 ) 

470 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

471 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

472 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

473 self.dataIds: set[DataCoordinate] = set() 

474 self.quanta = {} 

475 

476 def __repr__(self) -> str: 

477 # Default dataclass-injected __repr__ gets caught in an infinite loop 

478 # because of back-references. 

479 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

480 

481 taskDef: TaskDef 

482 """Data structure that identifies the task class and its config 

483 (`TaskDef`). 

484 """ 

485 

486 dimensions: DimensionGraph 

487 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

488 """ 

489 

490 initInputs: _DatasetDict 

491 """Dictionary containing information about datasets used to construct this 

492 task (`_DatasetDict`). 

493 """ 

494 

495 initOutputs: _DatasetDict 

496 """Dictionary containing information about datasets produced as a 

497 side-effect of constructing this task (`_DatasetDict`). 

498 """ 

499 

500 inputs: _DatasetDict 

501 """Dictionary containing information about datasets used as regular, 

502 graph-constraining inputs to this task (`_DatasetDict`). 

503 """ 

504 

505 outputs: _DatasetDict 

506 """Dictionary containing information about datasets produced by this task 

507 (`_DatasetDict`). 

508 """ 

509 

510 prerequisites: _DatasetDict 

511 """Dictionary containing information about input datasets that must be 

512 present in the repository before any Pipeline containing this task is run 

513 (`_DatasetDict`). 

514 """ 

515 

516 quanta: dict[DataCoordinate, _QuantumScaffolding] 

517 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

518 this task with that data ID. 

519 """ 

520 

521 def makeQuantumSet( 

522 self, 

523 missing: _DatasetDict, 

524 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

525 ) -> set[Quantum]: 

526 """Create a `set` of `Quantum` from the information in ``self``. 

527 

528 Parameters 

529 ---------- 

530 missing : `_DatasetDict` 

531 Input datasets that have not been found. 

532 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional

533 Records from the datastore to export with the quanta.

534 

535 Returns 

536 ------- 

537 nodes : `set` of `Quantum` 

538 The `Quantum` elements corresponding to this task. 

539 """ 

540 outputs = set() 

541 for q in self.quanta.values(): 

542 try: 

543 tmpQuanta = q.makeQuantum(datastore_records) 

544 outputs.add(tmpQuanta) 

545 except (NoWorkFound, FileNotFoundError) as exc: 

546 if not missing.isdisjoint(q.inputs): 

547 # This is a node that is known to be pruned later and 

548 # should be left in even though some follow up queries 

549 # fail. This allows the pruning to start from this quantum 

550 # with known issues, and prune other nodes it touches. 

551 inputs = q.inputs.unpackMultiRefs() 

552 inputs.update(q.prerequisites.unpackMultiRefs()) 

553 tmpQuantum = Quantum( 

554 taskName=q.task.taskDef.taskName, 

555 taskClass=q.task.taskDef.taskClass, 

556 dataId=q.dataId, 

557 initInputs=q.task.initInputs.unpackSingleRefs(), 

558 inputs=inputs, 

559 outputs=q.outputs.unpackMultiRefs(), 

560 ) 

561 outputs.add(tmpQuantum) 

562 else: 

563 raise exc 

564 return outputs 

565 

566 

567class _DatasetIdMaker: 

568 """Helper class which generates random dataset UUIDs for unresolved 

569 datasets. 

570 """ 

571 

572 def __init__(self, run: str): 

573 self.run = run 

574 # Cache of dataset refs generated so far. 

575 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

576 

577 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef: 

578 # For components we need their parent dataset ID. 

579 if dataset_type.isComponent(): 

580 parent_type = dataset_type.makeCompositeDatasetType() 

581 # Parent should be resolved if this is an existing input, or it 

582 # should be in the cache already if it is an intermediate. 

583 key = parent_type, data_id 

584 if key not in self.resolved: 

585 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}") 

586 parent_ref = self.resolved[key] 

587 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved" 

588 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False) 

589 

590 key = dataset_type, data_id 

591 if (resolved := self.resolved.get(key)) is None: 

592 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False) 

593 self.resolved[key] = resolved 

594 return resolved 

595 

596 def resolveDict(self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder]) -> None: 

597 """Resolve all unresolved references in the provided dictionary.""" 

598 for data_id, holder in refs.items(): 

599 if holder.ref is None: 

600 holder.ref = self.resolveRef(holder.dataset_type, data_id) 

601 
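# Illustrative sketch (placeholder variables): _DatasetIdMaker hands out one
# resolved ref per (dataset type, data ID) pair and caches it, so repeated
# requests return the same ref and component refs reuse their parent's ID.
#
#     maker = _DatasetIdMaker(run="some_run")
#     ref_a = maker.resolveRef(dataset_type, data_id)
#     ref_b = maker.resolveRef(dataset_type, data_id)
#     assert ref_a is ref_b                    # served from the cache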

602 

603@dataclass 

604class _PipelineScaffolding: 

605 """A helper data structure that organizes the information involved in 

606 constructing a `QuantumGraph` for a `Pipeline`. 

607 

608 Parameters 

609 ---------- 

610 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

611 Sequence of tasks from which a graph is to be constructed. Must 

612 have nested task classes already imported. 

613 universe : `DimensionUniverse` 

614 Universe of all possible dimensions. 

615 

616 Notes 

617 ----- 

618 The scaffolding data structure contains nested data structures for both 

619 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

620 data structures are shared between the pipeline-level structure (which 

621 aggregates all datasets and categorizes them from the perspective of the 

622 complete pipeline) and the individual tasks that use them as inputs and 

623 outputs. 

624 

625 `QuantumGraph` construction proceeds in four steps, with each corresponding 

626 to a different `_PipelineScaffolding` method: 

627 

628 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

629 the DatasetTypes used by the pipeline (delegating to 

630 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

631 nested `_TaskScaffolding` and `_DatasetDict` objects. 

632 

633 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

634 returns related tuples of all dimensions used to identify any regular 

635 input, output, and intermediate datasets (not prerequisites). We then 

636 iterate over these tuples of related dimensions, identifying the subsets 

637 that correspond to distinct data IDs for each task and dataset type, 

638 and then create `_QuantumScaffolding` objects. 

639 

640 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

641 dataset data IDs previously identified, transforming unresolved 

642 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

643 up prerequisite datasets for all quanta. 

644 

645 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

646 per-task `_QuantumScaffolding` objects. 

647 """ 

648 
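    # A condensed sketch of how the four steps above are typically driven
    # (illustrative only; `pipeline`, `registry`, `collections`, `userQuery`,
    # `externalDataId`, and `run` are placeholders):
    #
    #     scaffolding = _PipelineScaffolding(pipeline, registry=registry)   # step 1
    #     with scaffolding.connectDataIds(                                  # step 2
    #         registry, collections, userQuery, externalDataId
    #     ) as commonDataIds:
    #         scaffolding.resolveDatasetRefs(                               # step 3
    #             registry, collections, run, commonDataIds
    #         )
    #     qgraph = scaffolding.makeQuantumGraph(registry)                   # step 4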

649 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry): 

650 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

651 self.tasks = [] 

652 # Aggregate and categorize the DatasetTypes in the Pipeline. 

653 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

654 # Construct dictionaries that map those DatasetTypes to structures 

655 # that will (later) hold additional information about them. 

656 for attr in ( 

657 "initInputs", 

658 "initIntermediates", 

659 "initOutputs", 

660 "inputs", 

661 "intermediates", 

662 "outputs", 

663 "prerequisites", 

664 ): 

665 setattr( 

666 self, 

667 attr, 

668 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

669 ) 

670 self.missing = _DatasetDict(universe=registry.dimensions) 

671 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

672 # Aggregate all dimensions for all non-init, non-prerequisite 

673 # DatasetTypes. These are the ones we'll include in the big join 

674 # query. 

675 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

676 # Construct scaffolding nodes for each Task, and add backreferences 

677 # to the Task from each DatasetScaffolding node. 

678 # Note that there's only one scaffolding node for each DatasetType, 

679 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

680 # reference it. 

681 if isinstance(pipeline, Pipeline): 

682 pipeline = pipeline.toExpandedPipeline() 

683 self.tasks = [ 

684 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

685 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

686 ] 

687 

688 def __repr__(self) -> str: 

689 # Default dataclass-injected __repr__ gets caught in an infinite loop 

690 # because of back-references. 

691 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

692 

693 tasks: list[_TaskScaffolding] 

694 """Scaffolding data structures for each task in the pipeline 

695 (`list` of `_TaskScaffolding`). 

696 """ 

697 

698 initInputs: _DatasetDict 

699 """Datasets consumed but not produced when constructing the tasks in this 

700 pipeline (`_DatasetDict`). 

701 """ 

702 

703 initIntermediates: _DatasetDict 

704 """Datasets that are both consumed and produced when constructing the tasks 

705 in this pipeline (`_DatasetDict`). 

706 """ 

707 

708 initOutputs: _DatasetDict 

709 """Datasets produced but not consumed when constructing the tasks in this 

710 pipeline (`_DatasetDict`). 

711 """ 

712 

713 inputs: _DatasetDict 

714 """Datasets that are consumed but not produced when running this pipeline 

715 (`_DatasetDict`). 

716 """ 

717 

718 intermediates: _DatasetDict 

719 """Datasets that are both produced and consumed when running this pipeline 

720 (`_DatasetDict`). 

721 """ 

722 

723 outputs: _DatasetDict 

724 """Datasets produced but not consumed when when running this pipeline 

725 (`_DatasetDict`). 

726 """ 

727 

728 prerequisites: _DatasetDict 

729 """Datasets that are consumed when running this pipeline and looked up 

730 per-Quantum when generating the graph (`_DatasetDict`). 

731 """ 

732 

733 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

734 """Datasets that should be used as constraints in the initial query, 

735 according to tasks (`NamedValueSet`). 

736 """ 

737 

738 dimensions: DimensionGraph 

739 """All dimensions used by any regular input, intermediate, or output 

740 (not prerequisite) dataset; the set of dimension used in the "Big Join 

741 Query" (`DimensionGraph`). 

742 

743 This is required to be a superset of all task quantum dimensions. 

744 """ 

745 

746 missing: _DatasetDict 

747 """Datasets whose existence was originally predicted but were not 

748 actually found. 

749 

750 Quanta that require these datasets as inputs will be pruned (recursively) 

751 when actually constructing a `QuantumGraph` object. 

752 

753 These are currently populated only when the "initial dataset query 

754 constraint" does not include all overall-input dataset types, and hence the 

755 initial data ID query can include data IDs that it should not. 

756 """ 

757 

758 globalInitOutputs: _DatasetDict | None = None 

759 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

760 """ 

761 

762 @contextmanager 

763 def connectDataIds( 

764 self, 

765 registry: Registry, 

766 collections: Any, 

767 userQuery: Optional[str], 

768 externalDataId: DataCoordinate, 

769 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

770 bind: Optional[Mapping[str, Any]] = None, 

771 ) -> Iterator[DataCoordinateQueryResults]: 

772 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

773 

774 This method populates `_TaskScaffolding.dataIds` and 

775 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

776 

777 Parameters 

778 ---------- 

779 registry : `lsst.daf.butler.Registry` 

780 Registry for the data repository; used for all data ID queries. 

781 collections 

782 Expressions representing the collections to search for input 

783 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

784 userQuery : `str` or `None` 

785 User-provided expression to limit the data IDs processed. 

786 externalDataId : `DataCoordinate` 

787 Externally-provided data ID that should be used to restrict the 

788 results, just as if these constraints had been included via ``AND`` 

789 in ``userQuery``. This includes (at least) any instrument named 

790 in the pipeline definition. 

791 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

792 The query constraint variant that should be used to constrain the

793 query based on dataset existence; defaults to

794 `DatasetQueryConstraintVariant.ALL`. 

795 bind : `Mapping`, optional 

796 Mapping containing literal values that should be injected into the 

797 ``userQuery`` expression, keyed by the identifiers they replace. 

798 

799 Returns 

800 ------- 

801 commonDataIds : \ 

802 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

803 An interface to a database temporary table containing all data IDs 

804 that will appear in this `QuantumGraph`. Returned inside a 

805 context manager, which will drop the temporary table at the end of 

806 the `with` block in which this method is called. 

807 """ 

808 _LOG.debug("Building query for data IDs.") 

809 # Initialization datasets always have empty data IDs. 

810 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

811 for datasetType, refs in itertools.chain( 

812 self.initInputs.items(), 

813 self.initIntermediates.items(), 

814 self.initOutputs.items(), 

815 ): 

816 refs[emptyDataId] = _RefHolder(datasetType) 

817 # Run one big query for the data IDs for task dimensions and regular 

818 # inputs and outputs. We limit the query to only dimensions that are 

819 # associated with the input dataset types, but don't (yet) try to 

820 # obtain the dataset_ids for those inputs. 

821 _LOG.debug( 

822 "Submitting data ID query over dimensions %s and materializing results.", 

823 list(self.dimensions.names), 

824 ) 

825 queryArgs: dict[str, Any] = { 

826 "dimensions": self.dimensions, 

827 "where": userQuery, 

828 "dataId": externalDataId, 

829 "bind": bind, 

830 } 

831 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

832 _LOG.debug( 

833 "Constraining graph query using default of %s.", 

834 list(self.defaultDatasetQueryConstraints.names), 

835 ) 

836 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

837 queryArgs["collections"] = collections 

838 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

839 _LOG.debug("Not using dataset existence to constrain query.") 

840 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

841 constraint = set(datasetQueryConstraint) 

842 inputs = {k.name: k for k in self.inputs.keys()} 

843 if remainder := constraint.difference(inputs.keys()): 

844 raise ValueError( 

845 f"{remainder} dataset type(s) specified as a graph constraint, but" 

846 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

847 ) 

848 _LOG.debug(f"Constraining graph query using {constraint}") 

849 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

850 queryArgs["collections"] = collections 

851 else: 

852 raise ValueError( 

853 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

854 ) 

855 

856 if "datasets" in queryArgs: 

857 for i, dataset_type in enumerate(queryArgs["datasets"]): 

858 if dataset_type.isComponent(): 

859 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

860 

861 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

862 _LOG.debug("Expanding data IDs.") 

863 commonDataIds = commonDataIds.expanded() 

864 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

865 # Iterate over query results, populating data IDs for datasets and 

866 # quanta and then connecting them to each other. 

867 n = -1 

868 for n, commonDataId in enumerate(commonDataIds): 

869 # Create DatasetRefs for all DatasetTypes from this result row, 

870 # noting that we might have created some already. 

871 # We remember both those that already existed and those that we 

872 # create now. 

873 refsForRow = {} 

874 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {} 

875 for datasetType, refs in itertools.chain( 

876 self.inputs.items(), 

877 self.intermediates.items(), 

878 self.outputs.items(), 

879 ): 

880 datasetDataId: Optional[DataCoordinate] 

881 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

882 datasetDataId = commonDataId.subset(datasetType.dimensions) 

883 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

884 ref_holder = refs.get(datasetDataId) 

885 if ref_holder is None: 

886 ref_holder = _RefHolder(datasetType) 

887 refs[datasetDataId] = ref_holder 

888 refsForRow[datasetType.name] = ref_holder 

889 # Create _QuantumScaffolding objects for all tasks from this 

890 # result row, noting that we might have created some already. 

891 for task in self.tasks: 

892 quantumDataId = commonDataId.subset(task.dimensions) 

893 quantum = task.quanta.get(quantumDataId) 

894 if quantum is None: 

895 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

896 task.quanta[quantumDataId] = quantum 

897 # Whether this is a new quantum or an existing one, we can 

898 # now associate the DatasetRefs for this row with it. The 

899 # fact that a Quantum data ID and a dataset data ID both 

900 # came from the same result row is what tells us they 

901 # should be associated. 

902 # Many of these associations will be duplicates (because

903 # another query row that differed from this one only in

904 # irrelevant dimensions already added them), and we use

905 # sets to skip them.

906 for datasetType in task.inputs: 

907 dataId = dataIdCacheForRow[datasetType.dimensions] 

908 ref_holder = refsForRow[datasetType.name] 

909 quantum.inputs[datasetType.name][dataId] = ref_holder 

910 for datasetType in task.outputs: 

911 dataId = dataIdCacheForRow[datasetType.dimensions] 

912 ref_holder = refsForRow[datasetType.name] 

913 quantum.outputs[datasetType.name][dataId] = ref_holder 

914 if n < 0: 

915 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

916 emptiness_explained = False 

917 for message in commonDataIds.explain_no_results(): 

918 _LOG.critical(message) 

919 emptiness_explained = True 

920 if not emptiness_explained: 

921 _LOG.critical( 

922 "To reproduce this query for debugging purposes, run " 

923 "Registry.queryDataIds with these arguments:" 

924 ) 

925 # We could just repr() the queryArgs dict to get something 

926 # the user could make sense of, but it's friendlier to 

927 # put these args in an easier-to-construct equivalent form 

928 # so they can read it more easily and copy and paste into 

929 # a Python terminal. 

930 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

931 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

932 if queryArgs["where"]: 

933 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

934 if "datasets" in queryArgs: 

935 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

936 if "collections" in queryArgs: 

937 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

938 _LOG.debug("Finished processing %d rows from data ID query.", n) 

939 yield commonDataIds 

940 
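    # Illustrative sketch (placeholder values): the critical-log block above is
    # intended to let a user reproduce the failing data ID query interactively,
    # along the lines of:
    #
    #     registry.queryDataIds(
    #         dimensions=["instrument", "visit", "detector"],
    #         dataId={"instrument": "HSC"},
    #         where="visit = 1228",
    #         datasets=["raw"],
    #         collections=["HSC/raw/all"],
    #     )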

941 def resolveDatasetRefs( 

942 self, 

943 registry: Registry, 

944 collections: Any, 

945 run: str, 

946 commonDataIds: DataCoordinateQueryResults, 

947 *, 

948 skipExistingIn: Any = None, 

949 clobberOutputs: bool = True, 

950 constrainedByAllDatasets: bool = True, 

951 ) -> None: 

952 """Perform follow up queries for each dataset data ID produced in 

953 `fillDataIds`. 

954 

955 This method populates `_DatasetScaffolding.refs` (except for those in 

956 `prerequisites`). 

957 

958 Parameters 

959 ---------- 

960 registry : `lsst.daf.butler.Registry` 

961 Registry for the data repository; used for all data ID queries. 

962 collections 

963 Expressions representing the collections to search for input 

964 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

965 run : `str` 

966 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

967 output datasets, if it already exists. 

968 commonDataIds : \ 

969 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

970 Result of a previous call to `connectDataIds`. 

971 skipExistingIn 

972 Expressions representing the collections to search for existing 

973 output datasets that should be skipped. See 

974 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

975 `None` or empty string/sequence disables skipping. 

976 clobberOutputs : `bool`, optional 

977 If `True` (default), allow quanta to be created even if outputs exist;

978 this requires the same behavior to be enabled when

979 executing. If ``skipExistingIn`` is not `None`, completed quanta 

980 (those with metadata, or all outputs if there is no metadata 

981 dataset configured) will be skipped rather than clobbered. 

982 constrainedByAllDatasets : `bool`, optional 

983 Indicates if the commonDataIds were generated with a constraint on 

984 all dataset types. 

985 

986 Raises 

987 ------ 

988 OutputExistsError 

989 Raised if an output dataset already exists in the output run 

990 and ``skipExistingIn`` does not include output run, or if only 

991 some outputs are present and ``clobberOutputs`` is `False`. 

992 """ 

993 # Run may be provided but it does not have to exist, in that case we 

994 # use it for resolving references but don't check it for existing refs. 

995 run_exists = False 

996 if run: 

997 try: 

998 run_exists = bool(registry.queryCollections(run)) 

999 except MissingCollectionError: 

1000 # An undocumented exception is raised if it does not exist.

1001 pass 

1002 

1003 skip_collections_wildcard: CollectionWildcard | None = None 

1004 skipExistingInRun = False 

1005 if skipExistingIn: 

1006 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

1007 if run_exists: 

1008 # as optimization check in the explicit list of names first 

1009 skipExistingInRun = run in skip_collections_wildcard.strings 

1010 if not skipExistingInRun: 

1011 # need to flatten it and check again 

1012 skipExistingInRun = run in registry.queryCollections( 

1013 skipExistingIn, 

1014 collectionTypes=CollectionType.RUN, 

1015 ) 

1016 

1017 idMaker = _DatasetIdMaker(run) 

1018 

1019 resolvedRefQueryResults: Iterable[DatasetRef] 

1020 

1021 # Updating constrainedByAllDatasets here is not ideal, but we have a 

1022 # few different code paths that each transfer different pieces of 

1023 # information about what dataset query constraints were applied here, 

1024 # and none of them has the complete picture until we get here. We're 

1025 # long overdue for a QG generation rewrite that will make this go away 

1026 # entirely anyway. 

1027 constrainedByAllDatasets = ( 

1028 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

1029 ) 

1030 

1031 # Look up [init] intermediate and output datasets in the output 

1032 # collection, if there is an output collection. 

1033 if run_exists or skip_collections_wildcard is not None: 

1034 for datasetType, refs in itertools.chain( 

1035 self.initIntermediates.items(), 

1036 self.initOutputs.items(), 

1037 self.intermediates.items(), 

1038 self.outputs.items(), 

1039 ): 

1040 _LOG.debug( 

1041 "Resolving %d datasets for intermediate and/or output dataset %s.", 

1042 len(refs), 

1043 datasetType.name, 

1044 ) 

1045 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

1046 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

1047 # TODO: this assert incorrectly bans component inputs; 

1048 # investigate on DM-33027. 

1049 # assert not datasetType.isComponent(), \ 

1050 # "Output datasets cannot be components." 

1051 # 

1052 # Instead we have to handle them manually to avoid a 

1053 # deprecation warning, but it is at least confusing and 

1054 # possibly a bug for components to appear here at all. 

1055 if datasetType.isComponent(): 

1056 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1057 component = datasetType.component() 

1058 else: 

1059 parent_dataset_type = datasetType 

1060 component = None 

1061 

1062 # look at RUN collection first 

1063 if run_exists: 

1064 try: 

1065 resolvedRefQueryResults = subset.findDatasets( 

1066 parent_dataset_type, collections=run, findFirst=True 

1067 ) 

1068 except MissingDatasetTypeError: 

1069 resolvedRefQueryResults = [] 

1070 for resolvedRef in resolvedRefQueryResults: 

1071 # TODO: we could easily support per-DatasetType 

1072 # skipExisting and I could imagine that being useful - 

1073 # it's probably required in order to support writing 

1074 # initOutputs before QuantumGraph generation. 

1075 assert resolvedRef.dataId in refs 

1076 if not (skipExistingInRun or isInit or clobberOutputs): 

1077 raise OutputExistsError( 

1078 f"Output dataset {datasetType.name} already exists in " 

1079 f"output RUN collection '{run}' with data ID" 

1080 f" {resolvedRef.dataId}." 

1081 ) 

1082 # To resolve all outputs we have to remember existing 

1083 # ones to avoid generating new dataset IDs for them. 

1084 refs[resolvedRef.dataId].ref = ( 

1085 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1086 ) 

1087 

1088 # Also check skipExistingIn; the case where the RUN collection

1089 # is included in it is handled above.

1090 if skip_collections_wildcard is not None: 

1091 try: 

1092 resolvedRefQueryResults = subset.findDatasets( 

1093 parent_dataset_type, 

1094 collections=skip_collections_wildcard, 

1095 findFirst=True, 

1096 ) 

1097 except MissingDatasetTypeError: 

1098 resolvedRefQueryResults = [] 

1099 for resolvedRef in resolvedRefQueryResults: 

1100 if resolvedRef.dataId not in refs: 

1101 continue 

1102 refs[resolvedRef.dataId].ref = ( 

1103 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1104 ) 

1105 

1106 # Look up input and initInput datasets in the input collection(s). We 

1107 # accumulate datasets in self.missing, if the common data IDs were not 

1108 # constrained on dataset type existence. 

1109 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1110 _LOG.debug( 

1111 "Resolving %d datasets for input dataset %s.", 

1112 len(refs), 

1113 datasetType.name, 

1114 ) 

1115 if datasetType.isComponent(): 

1116 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1117 component = datasetType.component() 

1118 else: 

1119 parent_dataset_type = datasetType 

1120 component = None 

1121 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {} 

1122 try: 

1123 resolvedRefQueryResults = commonDataIds.subset( 

1124 datasetType.dimensions, unique=True 

1125 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1126 except MissingDatasetTypeError: 

1127 resolvedRefQueryResults = [] 

1128 dataIdsNotFoundYet = set(refs.keys()) 

1129 for resolvedRef in resolvedRefQueryResults: 

1130 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1131 if resolvedRef.dataId not in refs: 

1132 continue 

1133 refs[resolvedRef.dataId].ref = ( 

1134 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1135 ) 

1136 if dataIdsNotFoundYet: 

1137 if constrainedByAllDatasets: 

1138 raise RuntimeError( 

1139 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1140 f"'{datasetType.name}' was/were present in a previous " 

1141 "query, but could not be found now. " 

1142 "This is either a logic bug in QuantumGraph generation " 

1143 "or the input collections have been modified since " 

1144 "QuantumGraph generation began." 

1145 ) 

1146 elif not datasetType.dimensions: 

1147 raise RuntimeError( 

1148 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1149 f"collections {collections}." 

1150 ) 

1151 else: 

1152 # If the common dataIds were not constrained using all the 

1153 # input dataset types, it is possible that some data ids 

1154 # found don't correspond to existing datasets. Mark these 

1155 # for later pruning from the quantum graph. 

1156 for k in dataIdsNotFoundYet: 

1157 missing_for_dataset_type[k] = refs[k] 

1158 if missing_for_dataset_type: 

1159 self.missing[datasetType] = missing_for_dataset_type 

1160 

1161 # Resolve the missing refs, just so they look like all of the others; 

1162 # in the end other code will make sure they never appear in the QG. 

1163 for dataset_type, refDict in self.missing.items(): 

1164 idMaker.resolveDict(dataset_type, refDict) 

1165 

1166 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1167 # replacing the unresolved refs there, and then look up prerequisites. 

1168 for task in self.tasks: 

1169 _LOG.debug( 

1170 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1171 len(task.quanta), 

1172 task.taskDef.label, 

1173 ) 

1174 # The way iterConnections is designed makes it impossible to 

1175 # annotate precisely enough to satisfy MyPy here. 

1176 lookupFunctions = { 

1177 c.name: c.lookupFunction # type: ignore 

1178 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1179 if c.lookupFunction is not None # type: ignore 

1180 } 

1181 dataIdsFailed = [] 

1182 dataIdsSucceeded = [] 

1183 for quantum in task.quanta.values(): 

1184 # Process output datasets only if skipExistingIn is not None

1185 # or there is a run to look for outputs in and clobberOutputs 

1186 # is True. Note that if skipExistingIn is None, any output 

1187 # datasets that already exist would have already caused an 

1188 # exception to be raised. 

1189 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1190 resolvedRefs = [] 

1191 unresolvedDataIds = [] 

1192 haveMetadata = False 

1193 for datasetType, originalRefs in quantum.outputs.items(): 

1194 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1195 if ref is not None: 

1196 resolvedRefs.append(ref) 

1197 originalRefs[dataId].ref = ref 

1198 if datasetType.name == task.taskDef.metadataDatasetName: 

1199 haveMetadata = True 

1200 else: 

1201 unresolvedDataIds.append((datasetType, dataId)) 

1202 if resolvedRefs: 

1203 if haveMetadata or not unresolvedDataIds: 

1204 dataIdsSucceeded.append(quantum.dataId) 

1205 if skip_collections_wildcard is not None: 

1206 continue 

1207 else: 

1208 dataIdsFailed.append(quantum.dataId) 

1209 if not clobberOutputs: 

1210 raise OutputExistsError( 

1211 f"Quantum {quantum.dataId} of task with label " 

1212 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1213 f"({resolvedRefs}) " 

1214 f"and others that don't ({unresolvedDataIds}), with no metadata output, " 

1215 "and clobbering outputs was not enabled." 

1216 ) 

1217 # Update the input DatasetRefs to the resolved ones we already 

1218 # searched for. 

1219 for datasetType, input_refs in quantum.inputs.items(): 

1220 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()): 

1221 input_refs[data_id].ref = ref 

1222 # Look up prerequisite datasets in the input collection(s). 

1223 # These may have dimensions that extend beyond those we queried 

1224 # for originally, because we want to permit those data ID 

1225 # values to differ across quanta and dataset types. 

1226 for datasetType in task.prerequisites: 

1227 if datasetType.isComponent(): 

1228 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1229 component = datasetType.component() 

1230 else: 

1231 parent_dataset_type = datasetType 

1232 component = None 

1233 lookupFunction = lookupFunctions.get(datasetType.name) 

1234 if lookupFunction is not None: 

1235 # PipelineTask has provided its own function to do the 

1236 # lookup. This always takes precedence. 

1237 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1238 elif ( 

1239 datasetType.isCalibration() 

1240 and datasetType.dimensions <= quantum.dataId.graph 

1241 and quantum.dataId.graph.temporal 

1242 ): 

1243 # This is a master calibration lookup, which we have to 

1244 # handle specially because the query system can't do a 

1245 # temporal join on a non-dimension-based timespan yet. 

1246 timespan = quantum.dataId.timespan 

1247 try: 

1248 prereq_ref = registry.findDataset( 

1249 parent_dataset_type, 

1250 quantum.dataId, 

1251 collections=collections, 

1252 timespan=timespan, 

1253 ) 

1254 if prereq_ref is not None: 

1255 if component is not None: 

1256 prereq_ref = prereq_ref.makeComponentRef(component) 

1257 prereq_refs = [prereq_ref] 

1258 else: 

1259 prereq_refs = [] 

1260 except (KeyError, MissingDatasetTypeError): 

1261 # This dataset type is not present in the registry, 

1262 # which just means there are no datasets here. 

1263 prereq_refs = [] 

1264 else: 

1265 # Most general case. 

1266 prereq_refs = [ 

1267 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1268 for prereq_ref in registry.queryDatasets( 

1269 parent_dataset_type, 

1270 collections=collections, 

1271 dataId=quantum.dataId, 

1272 findFirst=True, 

1273 ).expanded() 

1274 ] 

1275 

1276 for ref in prereq_refs: 

1277 if ref is not None: 

1278 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1279 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1280 

1281 # Resolve all quantum inputs and outputs. 

1282 for datasetDict in (quantum.inputs, quantum.outputs): 

1283 for dataset_type, refDict in datasetDict.items(): 

1284 idMaker.resolveDict(dataset_type, refDict) 

1285 

1286 # Resolve task initInputs and initOutputs. 

1287 for datasetDict in (task.initInputs, task.initOutputs): 

1288 for dataset_type, refDict in datasetDict.items(): 

1289 idMaker.resolveDict(dataset_type, refDict) 

1290 

1291 # Actually remove any quanta that we decided to skip above. 

1292 if dataIdsSucceeded: 

1293 if skip_collections_wildcard is not None: 

1294 _LOG.debug( 

1295 "Pruning successful %d quanta for task with label '%s' because all of their " 

1296 "outputs exist or metadata was written successfully.", 

1297 len(dataIdsSucceeded), 

1298 task.taskDef.label, 

1299 ) 

1300 for dataId in dataIdsSucceeded: 

1301 del task.quanta[dataId] 

1302 elif clobberOutputs: 

1303 _LOG.info( 

1304 "Found %d successful quanta for task with label '%s' " 

1305 "that will need to be clobbered during execution.", 

1306 len(dataIdsSucceeded), 

1307 task.taskDef.label, 

1308 ) 

1309 else: 

1310 raise AssertionError("OutputExistsError should have already been raised.") 

1311 if dataIdsFailed: 

1312 if clobberOutputs: 

1313 _LOG.info( 

1314 "Found %d failed/incomplete quanta for task with label '%s' " 

1315 "that will need to be clobbered during execution.", 

1316 len(dataIdsFailed), 

1317 task.taskDef.label, 

1318 ) 

1319 else: 

1320 raise AssertionError("OutputExistsError should have already been raised.") 

1321 

1322 # Collect initOutputs that do not belong to any task. 

1323 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1324 for task in self.tasks: 

1325 global_dataset_types -= set(task.initOutputs) 

1326 if global_dataset_types: 

1327 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1328 for dataset_type, refDict in self.globalInitOutputs.items(): 

1329 idMaker.resolveDict(dataset_type, refDict) 

1330 

1331 def makeQuantumGraph( 

1332 self, 

1333 registry: Registry, 

1334 metadata: Optional[Mapping[str, Any]] = None, 

1335 datastore: Optional[Datastore] = None, 

1336 ) -> QuantumGraph: 

1337 """Create a `QuantumGraph` from the quanta already present in 

1338 the scaffolding data structure. 

1339 

1340 Parameters 

1341 ----------

1342 registry : `lsst.daf.butler.Registry` 

1343 Registry for the data repository; used for all data ID queries. 

1344 metadata : `Mapping` of `str` to primitives, optional

1345 This is an optional parameter of extra data to carry with the 

1346 graph. Entries in this mapping should be able to be serialized in 

1347 JSON. 

1348 datastore : `Datastore`, optional 

1349 If not `None` then fill datastore records in each generated 

1350 Quantum. 

1351 

1352 Returns 

1353 ------- 

1354 graph : `QuantumGraph` 

1355 The full `QuantumGraph`. 

1356 """ 

1357 

1358 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1359 """Extract all DatasetRefs from the dictionaries""" 

1360 for ref_dict in dataset_dict.values(): 

1361 for holder in ref_dict.values(): 

1362 yield holder.resolved_ref 

1363 

1364 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1365 if datastore is not None: 

1366 datastore_records = datastore.export_records( 

1367 itertools.chain( 

1368 _make_refs(self.inputs), 

1369 _make_refs(self.initInputs), 

1370 _make_refs(self.prerequisites), 

1371 ) 

1372 ) 

1373 

1374 graphInput: dict[TaskDef, set[Quantum]] = {} 

1375 for task in self.tasks: 

1376 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records) 

1377 graphInput[task.taskDef] = qset 

1378 

1379 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks} 

1380 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks} 

1381 

1382 globalInitOutputs: list[DatasetRef] = [] 

1383 if self.globalInitOutputs is not None: 

1384 for refs_dict in self.globalInitOutputs.values(): 

1385 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values()) 

1386 

1387 graph = QuantumGraph( 

1388 graphInput, 

1389 metadata=metadata, 

1390 pruneRefs=list(self.missing.iter_resolved_refs()), 

1391 universe=self.dimensions.universe, 

1392 initInputs=taskInitInputs, 

1393 initOutputs=taskInitOutputs, 

1394 globalInitOutputs=globalInitOutputs, 

1395 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1396 ) 

1397 return graph 

1398 
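# A minimal usage sketch (assumed names, not part of this module): given a
# populated `_PipelineScaffolding` called `scaffolding` and a Butler called
# `butler`, the graph is assembled with the method above; the `datastore`
# argument only controls whether datastore records are attached to each
# Quantum.
#
#     qgraph = scaffolding.makeQuantumGraph(
#         registry=butler.registry,
#         metadata={"comment": "nightly run"},   # free-form, JSON-serializable
#         datastore=butler.datastore,
#     )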

1399 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1400 """Make a list of all dataset types used by a graph as defined in 

1401 registry. 

1402 """ 

1403 chain = [ 

1404 self.initInputs, 

1405 self.initIntermediates, 

1406 self.initOutputs, 

1407 self.inputs, 

1408 self.intermediates, 

1409 self.outputs, 

1410 self.prerequisites, 

1411 ] 

1412 if self.globalInitOutputs is not None: 

1413 chain.append(self.globalInitOutputs) 

1414 

1415 # Collect names of all dataset types. 

1416 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain)) 

1417 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1418 

1419 # Check for types that do not exist in the registry yet: 

1420 # - inputs must already be registered; 

1421 # - intermediates and outputs might not be registered yet, but there must 

1422 # not be more than one definition (e.g. differing in storage class); 

1423 # - prerequisites might not be registered either; treat them like outputs here. 

1424 for dstype in itertools.chain(self.initInputs, self.inputs): 

1425 if dstype.name not in dataset_types: 

1426 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1427 

1428 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1429 chain = [ 

1430 self.initIntermediates, 

1431 self.initOutputs, 

1432 self.intermediates, 

1433 self.outputs, 

1434 self.prerequisites, 

1435 ] 

1436 if self.globalInitOutputs is not None: 

1437 chain.append(self.globalInitOutputs) 

1438 for dstype in itertools.chain(*chain): 

1439 if dstype.name not in dataset_types: 

1440 new_outputs[dstype.name].add(dstype) 

1441 for name, dstypes in new_outputs.items(): 

1442 if len(dstypes) > 1: 

1443 raise ValueError( 

1444 "Pipeline contains multiple definitions for a dataset type " 

1445 f"which is not defined in registry yet: {dstypes}" 

1446 ) 

1447 elif len(dstypes) == 1: 

1448 dataset_types[name] = dstypes.pop() 

1449 

1450 return dataset_types.values() 

1451 
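# Illustrative sketch (hypothetical names, not part of this module) of the
# consistency rule enforced above: a dataset type that is not registered yet
# may appear in the pipeline with at most one definition.
#
#     from lsst.daf.butler import DatasetType
#
#     universe = registry.dimensions
#     a = DatasetType("coadd_tmp", ("tract", "patch", "band"), "ExposureF",
#                     universe=universe)
#     b = DatasetType("coadd_tmp", ("tract", "patch", "band"), "ArrowAstropy",
#                     universe=universe)
#     # If "coadd_tmp" is unknown to the registry and both `a` and `b` appear
#     # among the pipeline outputs, the ValueError above is raised; a single
#     # definition is simply added to the returned collection.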

1452 

1453# ------------------------ 

1454# Exported definitions -- 

1455# ------------------------ 

1456 

1457 

1458class GraphBuilderError(Exception): 

1459 """Base class for exceptions generated by graph builder.""" 

1460 

1461 pass 

1462 

1463 

1464class OutputExistsError(GraphBuilderError): 

1465 """Exception generated when output datasets already exist.""" 

1466 

1467 pass 

1468 

1469 

1470class PrerequisiteMissingError(GraphBuilderError): 

1471 """Exception generated when a prerequisite dataset does not exist.""" 

1472 

1473 pass 

1474 

1475 

1476class GraphBuilder: 

1477 """GraphBuilder class is responsible for building task execution graph from 

1478 a Pipeline. 

1479 

1480 Parameters 

1481 ---------- 

1482 registry : `~lsst.daf.butler.Registry` 

1483 Registry for the data repository. 

1484 skipExistingIn 

1485 Expressions representing the collections to search for existing 

1486 output datasets that should be skipped. See 

1487 :ref:`daf_butler_ordered_collection_searches`. 

1488 clobberOutputs : `bool`, optional 

1489 If `True` (default), allow quanta to be created even if partial outputs 

1490 exist; this requires the same behavior to be enabled when 

1491 executing. 

1492 datastore : `Datastore`, optional 

1493 If not `None` then fill datastore records in each generated Quantum. 

1494 """ 

1495 

1496 def __init__( 

1497 self, 

1498 registry: Registry, 

1499 skipExistingIn: Any = None, 

1500 clobberOutputs: bool = True, 

1501 datastore: Optional[Datastore] = None, 

1502 ): 

1503 self.registry = registry 

1504 self.dimensions = registry.dimensions 

1505 self.skipExistingIn = skipExistingIn 

1506 self.clobberOutputs = clobberOutputs 

1507 self.datastore = datastore 

1508 
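# Minimal construction sketch (assumed names, not part of this module):
# `butler` stands for an initialized `lsst.daf.butler.Butler`; the collection
# name passed to `skipExistingIn` is hypothetical.
#
#     builder = GraphBuilder(
#         butler.registry,
#         skipExistingIn=["u/someone/previous-run"],
#         clobberOutputs=True,
#         datastore=butler.datastore,
#     )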

1509 def makeGraph( 

1510 self, 

1511 pipeline: Pipeline | Iterable[TaskDef], 

1512 collections: Any, 

1513 run: str, 

1514 userQuery: Optional[str], 

1515 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1516 metadata: Optional[Mapping[str, Any]] = None, 

1517 bind: Optional[Mapping[str, Any]] = None, 

1518 ) -> QuantumGraph: 

1519 """Create execution graph for a pipeline. 

1520 

1521 Parameters 

1522 ---------- 

1523 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1524 Pipeline definition, task names/classes and their configs. 

1525 collections 

1526 Expressions representing the collections to search for input 

1527 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1528 run : `str` 

1529 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1530 output datasets. The collection does not have to exist; it will be 

1531 created when the graph is executed. 

1532 userQuery : `str` or `None` 

1533 String that defines a user selection for the registry; should be 

1534 empty or `None` if there are no restrictions on data selection. 

1535 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1536 The query constraint variant that should be used to constrain the 

1537 query based on dataset existence; defaults to 

1538 `DatasetQueryConstraintVariant.ALL`. 

1539 metadata : `Mapping` [ `str`, `Any` ], optional 

1540 Optional mapping of extra metadata to carry with the 

1541 graph. Entries in this mapping should be serializable to 

1542 JSON. 

1543 bind : `Mapping`, optional 

1544 Mapping containing literal values that should be injected into the 

1545 ``userQuery`` expression, keyed by the identifiers they replace. 

1546 

1547 Returns 

1548 ------- 

1549 graph : `QuantumGraph` 

1550 

1551 Raises 

1552 ------ 

1553 UserExpressionError 

1554 Raised when the user expression cannot be parsed. 

1555 OutputExistsError 

1556 Raised when output datasets already exist. 

1557 Exception 

1558 Other exception types may be raised by underlying registry 

1559 classes. 

1560 """ 

1561 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1562 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1563 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1564 instrument_class: Optional[Any] = None 

1565 if isinstance(pipeline, Pipeline): 

1566 instrument_class_name = pipeline.getInstrument() 

1567 if instrument_class_name is not None: 

1568 instrument_class = doImportType(instrument_class_name) 

1569 pipeline = list(pipeline.toExpandedPipeline()) 

1570 if instrument_class is not None: 

1571 dataId = DataCoordinate.standardize( 

1572 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1573 ) 

1574 else: 

1575 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1576 with scaffolding.connectDataIds( 

1577 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1578 ) as commonDataIds: 

1579 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1580 scaffolding.resolveDatasetRefs( 

1581 self.registry, 

1582 collections, 

1583 run, 

1584 commonDataIds, 

1585 skipExistingIn=self.skipExistingIn, 

1586 clobberOutputs=self.clobberOutputs, 

1587 constrainedByAllDatasets=condition, 

1588 ) 

1589 return scaffolding.makeQuantumGraph( 

1590 registry=self.registry, metadata=metadata, datastore=self.datastore 

1591 )
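# End-to-end sketch (hypothetical file, collection, and query values; not
# part of this module): build a graph for a pipeline definition and inspect
# the result.  `builder` is the GraphBuilder constructed earlier.
#
#     pipeline = Pipeline.from_uri("my_pipeline.yaml")
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/someone/output-run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )
#     print(f"Generated {len(qgraph)} quanta")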