Coverage for python/lsst/pipe/base/graphBuilder.py: 14%

516 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-03-16 02:06 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap, defaultdict 

34from contextlib import contextmanager 

35from dataclasses import dataclass 

36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union 

37 

38from lsst.daf.butler import ( 

39 CollectionType, 

40 DataCoordinate, 

41 DatasetIdGenEnum, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 NamedValueSet, 

50 Quantum, 

51 Registry, 

52) 

53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

55from lsst.daf.butler.registry.wildcards import CollectionWildcard 

56from lsst.utils import doImportType 

57 

58from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

59from ._status import NoWorkFound 

60 

61# ----------------------------- 

62# Imports for other modules -- 

63# ----------------------------- 

64from .connections import AdjustQuantumHelper, iterConnections 

65from .graph import QuantumGraph 

66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

67 

68# ---------------------------------- 

69# Local non-exported definitions -- 

70# ---------------------------------- 

71 

72_LOG = logging.getLogger(__name__) 

73 

74 

75class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

76 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

77 the known `DatasetRef` instances of that type. 

78 

79 Parameters 

80 ---------- 

81 args 

82 Positional arguments are forwarded to the `dict` constructor. 

83 universe : `DimensionUniverse` 

84 Universe of all possible dimensions. 

85 """ 

86 

87 def __init__(self, *args: Any, universe: DimensionUniverse): 

88 super().__init__(*args) 

89 self.universe = universe 

90 

91 @classmethod 

92 def fromDatasetTypes( 

93 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

94 ) -> _DatasetDict: 

95 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

96 

97 Parameters 

98 ---------- 

99 datasetTypes : `iterable` of `DatasetType` 

100 DatasetTypes to use as keys for the dict. Values will be empty 

101 dictionaries. 

102 universe : `DimensionUniverse` 

103 Universe of all possible dimensions. 

104 

105 Returns 

106 ------- 

107 dictionary : `_DatasetDict` 

108 A new `_DatasetDict` instance. 

109 """ 

110 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

111 

112 @classmethod 

113 def fromSubset( 

114 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

115 ) -> _DatasetDict: 

116 """Return a new dictionary by extracting items corresponding to the 

117 given keys from one or more existing dictionaries. 

118 

119 Parameters 

120 ---------- 

121 datasetTypes : `iterable` of `DatasetType` 

122 DatasetTypes to use as keys for the dict. Values will be obtained 

123 by lookups against ``first`` and ``rest``. 

124 first : `_DatasetDict` 

125 Another dictionary from which to extract values. 

126 rest 

127 Additional dictionaries from which to extract values. 

128 

129 Returns 

130 ------- 

131 dictionary : `_DatasetDict` 

132 A new dictionary instance. 

133 """ 

134 combined = ChainMap(first, *rest) 

135 

136 # Dataset types known to match immediately can be processed 

137 # without checks. 

138 matches = combined.keys() & set(datasetTypes) 

139 _dict = {k: combined[k] for k in matches} 

140 

141 if len(_dict) < len(datasetTypes): 

142 # Work out which ones are missing. 

143 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

144 

145 # Get the known names for comparison. 

146 combined_by_name = {k.name: k for k in combined} 

147 

148 missing = set() 

149 incompatible = {} 

150 for datasetType in missing_datasetTypes: 

151 # The dataset type is not found. It may not be listed 

152 # or it may be that it is there with the same name 

153 # but different definition. 

154 if datasetType.name in combined_by_name: 

155 # This implies some inconsistency in definitions 

156 # for connections. If there is support for storage 

157 # class conversion we can let it slide. 

158 # At this point we do not know 

159 # where the inconsistency is but trust that down 

160 # stream code will be more explicit about input 

161 # vs output incompatibilities. 

162 existing = combined_by_name[datasetType.name] 

163 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing): 

164 _LOG.warning( 

165 "Dataset type mismatch (%s != %s) but continuing since they are compatible", 

166 datasetType, 

167 existing, 

168 ) 

169 _dict[datasetType] = combined[existing] 

170 else: 

171 incompatible[datasetType] = existing 

172 else: 

173 missing.add(datasetType) 

174 

175 if missing or incompatible: 

176 reasons = [] 

177 if missing: 

178 reasons.append( 

179 "DatasetTypes {'.'.join(missing)} not present in list of known types: " 

180 + ", ".join(d.name for d in combined) 

181 ) 

182 if incompatible: 

183 for x, y in incompatible.items(): 

184 reasons.append(f"{x} incompatible with {y}") 

185 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

186 

187 return cls(_dict, universe=first.universe) 

188 

189 @property 

190 def dimensions(self) -> DimensionGraph: 

191 """The union of all dimensions used by all dataset types in this 

192 dictionary, including implied dependencies (`DimensionGraph`). 

193 """ 

194 base = self.universe.empty 

195 if len(self) == 0: 

196 return base 

197 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

198 

199 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

200 """Unpack nested single-element `DatasetRef` dicts into a new 

201 mapping with `DatasetType` keys and `DatasetRef` values. 

202 

203 This method assumes that each nest contains exactly one item, as is the 

204 case for all "init" datasets. 

205 

206 Returns 

207 ------- 

208 dictionary : `NamedKeyDict` 

209 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

210 `DatasetType` instances and string names usable as keys. 

211 """ 

212 

213 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

214 (ref,) = refs.values() 

215 return ref 

216 

217 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

218 

219 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

220 """Unpack nested multi-element `DatasetRef` dicts into a new 

221 mapping with `DatasetType` keys and `set` of `DatasetRef` values. 

222 

223 Returns 

224 ------- 

225 dictionary : `NamedKeyDict` 

226 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

227 both `DatasetType` instances and string names usable as keys. 

228 """ 

229 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

230 

231 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

232 """Iterate over the contained `DatasetRef` instances that match the 

233 given `DatasetType` and data IDs. 

234 

235 Parameters 

236 ---------- 

237 datasetType : `DatasetType` 

238 Dataset type to match. 

239 dataIds : `Iterable` [ `DataCoordinate` ] 

240 Data IDs to match. 

241 

242 Returns 

243 ------- 

244 refs : `Iterator` [ `DatasetRef` ] 

245 DatasetRef instances for which ``ref.datasetType == datasetType`` 

246 and ``ref.dataId`` is in ``dataIds``. 

247 """ 

248 refs = self[datasetType] 

249 return (refs[dataId] for dataId in dataIds) 

250 

251 

252class _QuantumScaffolding: 

253 """Helper class aggregating information about a `Quantum`, used when 

254 constructing a `QuantumGraph`. 

255 

256 See `_PipelineScaffolding` for a top-down description of the full 

257 scaffolding data structure. 

258 

259 Parameters 

260 ---------- 

261 task : _TaskScaffolding 

262 Back-reference to the helper object for the `PipelineTask` this quantum 

263 represents an execution of. 

264 dataId : `DataCoordinate` 

265 Data ID for this quantum. 

266 """ 

267 

268 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

269 self.task = task 

270 self.dataId = dataId 

271 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

272 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

273 self.prerequisites = _DatasetDict.fromDatasetTypes( 

274 task.prerequisites.keys(), universe=dataId.universe 

275 ) 

276 

277 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

278 

279 def __repr__(self) -> str: 

280 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

281 

282 task: _TaskScaffolding 

283 """Back-reference to the helper object for the `PipelineTask` this quantum 

284 represents an execution of. 

285 """ 

286 

287 dataId: DataCoordinate 

288 """Data ID for this quantum. 

289 """ 

290 

291 inputs: _DatasetDict 

292 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

293 

294 This is initialized to map each `DatasetType` to an empty dictionary at 

295 construction. Those nested dictionaries are populated (with data IDs as 

296 keys) with unresolved `DatasetRef` instances in 

297 `_PipelineScaffolding.connectDataIds`. 

298 """ 

299 

300 outputs: _DatasetDict 

301 """Nested dictionary containing `DatasetRef` outputs this quantum. 

302 """ 

303 

304 prerequisites: _DatasetDict 

305 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

306 quantum. 

307 """ 

308 

309 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

310 """Transform the scaffolding object into a true `Quantum` instance. 

311 

312 Parameters 

313 ---------- 

314 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

315 If not `None` then fill datastore records in each generated Quantum 

316 using the records from this structure. 

317 

318 Returns 

319 ------- 

320 quantum : `Quantum` 

321 An actual `Quantum` instance. 

322 """ 

323 allInputs = self.inputs.unpackMultiRefs() 

324 allInputs.update(self.prerequisites.unpackMultiRefs()) 

325 # Give the task's Connections class an opportunity to remove some 

326 # inputs, or complain if they are unacceptable. 

327 # This will raise if one of the check conditions is not met, which is 

328 # the intended behavior. 

329 # If it raises NotWorkFound, there is a bug in the QG algorithm 

330 # or the adjustQuantum is incorrectly trying to make a prerequisite 

331 # input behave like a regular input; adjustQuantum should only raise 

332 # NoWorkFound if a regular input is missing, and it shouldn't be 

333 # possible for us to have generated ``self`` if that's true. 

334 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

335 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

336 initInputs = self.task.initInputs.unpackSingleRefs() 

337 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

338 if datastore_records is not None: 

339 quantum_records = {} 

340 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

341 input_refs += list(initInputs.values()) 

342 input_ids = set(ref.id for ref in input_refs if ref.id is not None) 

343 for datastore_name, records in datastore_records.items(): 

344 matching_records = records.subset(input_ids) 

345 if matching_records is not None: 

346 quantum_records[datastore_name] = matching_records 

347 return Quantum( 

348 taskName=self.task.taskDef.taskName, 

349 taskClass=self.task.taskDef.taskClass, 

350 dataId=self.dataId, 

351 initInputs=initInputs, 

352 inputs=helper.inputs, 

353 outputs=helper.outputs, 

354 datastore_records=quantum_records, 

355 ) 

356 

357 

358@dataclass 

359class _TaskScaffolding: 

360 """Helper class aggregating information about a `PipelineTask`, used when 

361 constructing a `QuantumGraph`. 

362 

363 See `_PipelineScaffolding` for a top-down description of the full 

364 scaffolding data structure. 

365 

366 Parameters 

367 ---------- 

368 taskDef : `TaskDef` 

369 Data structure that identifies the task class and its config. 

370 parent : `_PipelineScaffolding` 

371 The parent data structure that will hold the instance being 

372 constructed. 

373 datasetTypes : `TaskDatasetTypes` 

374 Data structure that categorizes the dataset types used by this task. 

375 """ 

376 

377 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

378 universe = parent.dimensions.universe 

379 self.taskDef = taskDef 

380 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

381 assert self.dimensions.issubset(parent.dimensions) 

382 # Initialize _DatasetDicts as subsets of the one or two 

383 # corresponding dicts in the parent _PipelineScaffolding. 

384 self.initInputs = _DatasetDict.fromSubset( 

385 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

386 ) 

387 self.initOutputs = _DatasetDict.fromSubset( 

388 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

389 ) 

390 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

391 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

392 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

393 self.dataIds: Set[DataCoordinate] = set() 

394 self.quanta = {} 

395 

396 def __repr__(self) -> str: 

397 # Default dataclass-injected __repr__ gets caught in an infinite loop 

398 # because of back-references. 

399 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

400 

401 taskDef: TaskDef 

402 """Data structure that identifies the task class and its config 

403 (`TaskDef`). 

404 """ 

405 

406 dimensions: DimensionGraph 

407 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

408 """ 

409 

410 initInputs: _DatasetDict 

411 """Dictionary containing information about datasets used to construct this 

412 task (`_DatasetDict`). 

413 """ 

414 

415 initOutputs: _DatasetDict 

416 """Dictionary containing information about datasets produced as a 

417 side-effect of constructing this task (`_DatasetDict`). 

418 """ 

419 

420 inputs: _DatasetDict 

421 """Dictionary containing information about datasets used as regular, 

422 graph-constraining inputs to this task (`_DatasetDict`). 

423 """ 

424 

425 outputs: _DatasetDict 

426 """Dictionary containing information about datasets produced by this task 

427 (`_DatasetDict`). 

428 """ 

429 

430 prerequisites: _DatasetDict 

431 """Dictionary containing information about input datasets that must be 

432 present in the repository before any Pipeline containing this task is run 

433 (`_DatasetDict`). 

434 """ 

435 

436 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

437 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

438 this task with that data ID. 

439 """ 

440 

441 def makeQuantumSet( 

442 self, 

443 unresolvedRefs: Optional[Set[DatasetRef]] = None, 

444 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

445 ) -> Set[Quantum]: 

446 """Create a `set` of `Quantum` from the information in ``self``. 

447 

448 Parameters 

449 ---------- 

450 unresolvedRefs : `set` [ `DatasetRef` ], optional 

451 Input dataset refs that have not been found. 

452 datastore_records : `dict` 

453 

454 

455 Returns 

456 ------- 

457 nodes : `set` of `Quantum` 

458 The `Quantum` elements corresponding to this task. 

459 """ 

460 if unresolvedRefs is None: 

461 unresolvedRefs = set() 

462 outputs = set() 

463 for q in self.quanta.values(): 

464 try: 

465 tmpQuanta = q.makeQuantum(datastore_records) 

466 outputs.add(tmpQuanta) 

467 except (NoWorkFound, FileNotFoundError) as exc: 

468 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

469 if unresolvedRefs.intersection(refs): 

470 # This means it is a node that is Known to be pruned 

471 # later and should be left in even though some follow up 

472 # queries fail. This allows the pruning to start from this 

473 # quantum with known issues, and prune other nodes it 

474 # touches 

475 inputs = q.inputs.unpackMultiRefs() 

476 inputs.update(q.prerequisites.unpackMultiRefs()) 

477 tmpQuantum = Quantum( 

478 taskName=q.task.taskDef.taskName, 

479 taskClass=q.task.taskDef.taskClass, 

480 dataId=q.dataId, 

481 initInputs=q.task.initInputs.unpackSingleRefs(), 

482 inputs=inputs, 

483 outputs=q.outputs.unpackMultiRefs(), 

484 ) 

485 outputs.add(tmpQuantum) 

486 else: 

487 raise exc 

488 return outputs 

489 

490 

491class _DatasetIdMaker: 

492 """Helper class which generates random dataset UUIDs for unresolved 

493 datasets. 

494 """ 

495 

496 def __init__(self, registry: Registry, run: str): 

497 self.datasetIdFactory = registry.datasetIdFactory 

498 self.run = run 

499 # Dataset IDs generated so far 

500 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

501 

502 def resolveRef(self, ref: DatasetRef) -> DatasetRef: 

503 if ref.id is not None: 

504 return ref 

505 

506 # For components we need their parent dataset ID. 

507 if ref.isComponent(): 

508 parent_ref = ref.makeCompositeRef() 

509 # Some basic check - parent should be resolved if this is an 

510 # existing input, or it should be in the cache already if it is 

511 # an intermediate. 

512 if parent_ref.id is None: 

513 key = parent_ref.datasetType, parent_ref.dataId 

514 if key not in self.resolved: 

515 raise ValueError(f"Composite dataset is missing from cache: {parent_ref}") 

516 parent_ref = self.resolved[key] 

517 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved" 

518 return ref.resolved(parent_ref.id, parent_ref.run) 

519 

520 key = ref.datasetType, ref.dataId 

521 if (resolved := self.resolved.get(key)) is None: 

522 resolved = self.datasetIdFactory.resolveRef(ref, self.run, DatasetIdGenEnum.UNIQUE) 

523 self.resolved[key] = resolved 

524 return resolved 

525 

526 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]: 

527 """Resolve all unresolved references in the provided dictionary.""" 

528 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()} 

529 

530 

531@dataclass 

532class _PipelineScaffolding: 

533 """A helper data structure that organizes the information involved in 

534 constructing a `QuantumGraph` for a `Pipeline`. 

535 

536 Parameters 

537 ---------- 

538 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

539 Sequence of tasks from which a graph is to be constructed. Must 

540 have nested task classes already imported. 

541 universe : `DimensionUniverse` 

542 Universe of all possible dimensions. 

543 

544 Notes 

545 ----- 

546 The scaffolding data structure contains nested data structures for both 

547 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

548 data structures are shared between the pipeline-level structure (which 

549 aggregates all datasets and categorizes them from the perspective of the 

550 complete pipeline) and the individual tasks that use them as inputs and 

551 outputs. 

552 

553 `QuantumGraph` construction proceeds in four steps, with each corresponding 

554 to a different `_PipelineScaffolding` method: 

555 

556 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

557 the DatasetTypes used by the pipeline (delegating to 

558 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

559 nested `_TaskScaffolding` and `_DatasetDict` objects. 

560 

561 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

562 returns related tuples of all dimensions used to identify any regular 

563 input, output, and intermediate datasets (not prerequisites). We then 

564 iterate over these tuples of related dimensions, identifying the subsets 

565 that correspond to distinct data IDs for each task and dataset type, 

566 and then create `_QuantumScaffolding` objects. 

567 

568 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

569 dataset data IDs previously identified, transforming unresolved 

570 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

571 up prerequisite datasets for all quanta. 

572 

573 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

574 per-task `_QuantumScaffolding` objects. 

575 """ 

576 

577 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry): 

578 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

579 self.tasks = [] 

580 # Aggregate and categorize the DatasetTypes in the Pipeline. 

581 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

582 # Construct dictionaries that map those DatasetTypes to structures 

583 # that will (later) hold additional information about them. 

584 for attr in ( 

585 "initInputs", 

586 "initIntermediates", 

587 "initOutputs", 

588 "inputs", 

589 "intermediates", 

590 "outputs", 

591 "prerequisites", 

592 ): 

593 setattr( 

594 self, 

595 attr, 

596 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

597 ) 

598 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

599 # Aggregate all dimensions for all non-init, non-prerequisite 

600 # DatasetTypes. These are the ones we'll include in the big join 

601 # query. 

602 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

603 # Construct scaffolding nodes for each Task, and add backreferences 

604 # to the Task from each DatasetScaffolding node. 

605 # Note that there's only one scaffolding node for each DatasetType, 

606 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

607 # reference it. 

608 if isinstance(pipeline, Pipeline): 

609 pipeline = pipeline.toExpandedPipeline() 

610 self.tasks = [ 

611 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

612 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

613 ] 

614 

615 def __repr__(self) -> str: 

616 # Default dataclass-injected __repr__ gets caught in an infinite loop 

617 # because of back-references. 

618 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

619 

620 tasks: List[_TaskScaffolding] 

621 """Scaffolding data structures for each task in the pipeline 

622 (`list` of `_TaskScaffolding`). 

623 """ 

624 

625 initInputs: _DatasetDict 

626 """Datasets consumed but not produced when constructing the tasks in this 

627 pipeline (`_DatasetDict`). 

628 """ 

629 

630 initIntermediates: _DatasetDict 

631 """Datasets that are both consumed and produced when constructing the tasks 

632 in this pipeline (`_DatasetDict`). 

633 """ 

634 

635 initOutputs: _DatasetDict 

636 """Datasets produced but not consumed when constructing the tasks in this 

637 pipeline (`_DatasetDict`). 

638 """ 

639 

640 inputs: _DatasetDict 

641 """Datasets that are consumed but not produced when running this pipeline 

642 (`_DatasetDict`). 

643 """ 

644 

645 intermediates: _DatasetDict 

646 """Datasets that are both produced and consumed when running this pipeline 

647 (`_DatasetDict`). 

648 """ 

649 

650 outputs: _DatasetDict 

651 """Datasets produced but not consumed when when running this pipeline 

652 (`_DatasetDict`). 

653 """ 

654 

655 prerequisites: _DatasetDict 

656 """Datasets that are consumed when running this pipeline and looked up 

657 per-Quantum when generating the graph (`_DatasetDict`). 

658 """ 

659 

660 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

661 """Datasets that should be used as constraints in the initial query, 

662 according to tasks (`NamedValueSet`). 

663 """ 

664 

665 dimensions: DimensionGraph 

666 """All dimensions used by any regular input, intermediate, or output 

667 (not prerequisite) dataset; the set of dimension used in the "Big Join 

668 Query" (`DimensionGraph`). 

669 

670 This is required to be a superset of all task quantum dimensions. 

671 """ 

672 

673 globalInitOutputs: _DatasetDict | None = None 

674 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

675 """ 

676 

677 @contextmanager 

678 def connectDataIds( 

679 self, 

680 registry: Registry, 

681 collections: Any, 

682 userQuery: Optional[str], 

683 externalDataId: DataCoordinate, 

684 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

685 bind: Optional[Mapping[str, Any]] = None, 

686 ) -> Iterator[DataCoordinateQueryResults]: 

687 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

688 

689 This method populates `_TaskScaffolding.dataIds` and 

690 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

691 

692 Parameters 

693 ---------- 

694 registry : `lsst.daf.butler.Registry` 

695 Registry for the data repository; used for all data ID queries. 

696 collections 

697 Expressions representing the collections to search for input 

698 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

699 userQuery : `str` or `None` 

700 User-provided expression to limit the data IDs processed. 

701 externalDataId : `DataCoordinate` 

702 Externally-provided data ID that should be used to restrict the 

703 results, just as if these constraints had been included via ``AND`` 

704 in ``userQuery``. This includes (at least) any instrument named 

705 in the pipeline definition. 

706 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

707 The query constraint variant that should be used to constraint the 

708 query based on dataset existance, defaults to 

709 `DatasetQueryConstraintVariant.ALL`. 

710 bind : `Mapping`, optional 

711 Mapping containing literal values that should be injected into the 

712 ``userQuery`` expression, keyed by the identifiers they replace. 

713 

714 Returns 

715 ------- 

716 commonDataIds : \ 

717 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

718 An interface to a database temporary table containing all data IDs 

719 that will appear in this `QuantumGraph`. Returned inside a 

720 context manager, which will drop the temporary table at the end of 

721 the `with` block in which this method is called. 

722 """ 

723 _LOG.debug("Building query for data IDs.") 

724 # Initialization datasets always have empty data IDs. 

725 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

726 for datasetType, refs in itertools.chain( 

727 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

728 ): 

729 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

730 # Run one big query for the data IDs for task dimensions and regular 

731 # inputs and outputs. We limit the query to only dimensions that are 

732 # associated with the input dataset types, but don't (yet) try to 

733 # obtain the dataset_ids for those inputs. 

734 _LOG.debug( 

735 "Submitting data ID query over dimensions %s and materializing results.", 

736 list(self.dimensions.names), 

737 ) 

738 queryArgs: Dict[str, Any] = { 

739 "dimensions": self.dimensions, 

740 "where": userQuery, 

741 "dataId": externalDataId, 

742 "bind": bind, 

743 } 

744 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

745 _LOG.debug( 

746 "Constraining graph query using default of %s.", 

747 list(self.defaultDatasetQueryConstraints.names), 

748 ) 

749 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

750 queryArgs["collections"] = collections 

751 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

752 _LOG.debug("Not using dataset existence to constrain query.") 

753 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

754 constraint = set(datasetQueryConstraint) 

755 inputs = {k.name: k for k in self.inputs.keys()} 

756 if remainder := constraint.difference(inputs.keys()): 

757 raise ValueError( 

758 f"{remainder} dataset type(s) specified as a graph constraint, but" 

759 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

760 ) 

761 _LOG.debug(f"Constraining graph query using {constraint}") 

762 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

763 queryArgs["collections"] = collections 

764 else: 

765 raise ValueError( 

766 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

767 ) 

768 

769 if "datasets" in queryArgs: 

770 for i, dataset_type in enumerate(queryArgs["datasets"]): 

771 if dataset_type.isComponent(): 

772 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

773 

774 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

775 _LOG.debug("Expanding data IDs.") 

776 commonDataIds = commonDataIds.expanded() 

777 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

778 # Iterate over query results, populating data IDs for datasets and 

779 # quanta and then connecting them to each other. 

780 n = -1 

781 for n, commonDataId in enumerate(commonDataIds): 

782 # Create DatasetRefs for all DatasetTypes from this result row, 

783 # noting that we might have created some already. 

784 # We remember both those that already existed and those that we 

785 # create now. 

786 refsForRow = {} 

787 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

788 for datasetType, refs in itertools.chain( 

789 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

790 ): 

791 datasetDataId: Optional[DataCoordinate] 

792 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

793 datasetDataId = commonDataId.subset(datasetType.dimensions) 

794 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

795 ref = refs.get(datasetDataId) 

796 if ref is None: 

797 ref = DatasetRef(datasetType, datasetDataId) 

798 refs[datasetDataId] = ref 

799 refsForRow[datasetType.name] = ref 

800 # Create _QuantumScaffolding objects for all tasks from this 

801 # result row, noting that we might have created some already. 

802 for task in self.tasks: 

803 quantumDataId = commonDataId.subset(task.dimensions) 

804 quantum = task.quanta.get(quantumDataId) 

805 if quantum is None: 

806 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

807 task.quanta[quantumDataId] = quantum 

808 # Whether this is a new quantum or an existing one, we can 

809 # now associate the DatasetRefs for this row with it. The 

810 # fact that a Quantum data ID and a dataset data ID both 

811 # came from the same result row is what tells us they 

812 # should be associated. 

813 # Many of these associates will be duplicates (because 

814 # another query row that differed from this one only in 

815 # irrelevant dimensions already added them), and we use 

816 # sets to skip. 

817 for datasetType in task.inputs: 

818 ref = refsForRow[datasetType.name] 

819 quantum.inputs[datasetType.name][ref.dataId] = ref 

820 for datasetType in task.outputs: 

821 ref = refsForRow[datasetType.name] 

822 quantum.outputs[datasetType.name][ref.dataId] = ref 

823 if n < 0: 

824 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

825 emptiness_explained = False 

826 for message in commonDataIds.explain_no_results(): 

827 _LOG.critical(message) 

828 emptiness_explained = True 

829 if not emptiness_explained: 

830 _LOG.critical( 

831 "To reproduce this query for debugging purposes, run " 

832 "Registry.queryDataIds with these arguments:" 

833 ) 

834 # We could just repr() the queryArgs dict to get something 

835 # the user could make sense of, but it's friendlier to 

836 # put these args in an easier-to-construct equivalent form 

837 # so they can read it more easily and copy and paste into 

838 # a Python terminal. 

839 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

840 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

841 if queryArgs["where"]: 

842 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

843 if "datasets" in queryArgs: 

844 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

845 if "collections" in queryArgs: 

846 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

847 _LOG.debug("Finished processing %d rows from data ID query.", n) 

848 yield commonDataIds 

849 

850 def resolveDatasetRefs( 

851 self, 

852 registry: Registry, 

853 collections: Any, 

854 run: Optional[str], 

855 commonDataIds: DataCoordinateQueryResults, 

856 *, 

857 skipExistingIn: Any = None, 

858 clobberOutputs: bool = True, 

859 constrainedByAllDatasets: bool = True, 

860 resolveRefs: bool = False, 

861 ) -> None: 

862 """Perform follow up queries for each dataset data ID produced in 

863 `fillDataIds`. 

864 

865 This method populates `_DatasetScaffolding.refs` (except for those in 

866 `prerequisites`). 

867 

868 Parameters 

869 ---------- 

870 registry : `lsst.daf.butler.Registry` 

871 Registry for the data repository; used for all data ID queries. 

872 collections 

873 Expressions representing the collections to search for input 

874 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

875 run : `str`, optional 

876 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

877 output datasets, if it already exists. 

878 commonDataIds : \ 

879 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

880 Result of a previous call to `connectDataIds`. 

881 skipExistingIn 

882 Expressions representing the collections to search for existing 

883 output datasets that should be skipped. See 

884 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

885 `None` or empty string/sequence disables skipping. 

886 clobberOutputs : `bool`, optional 

887 If `True` (default), allow quanta to created even if outputs exist; 

888 this requires the same behavior behavior to be enabled when 

889 executing. If ``skipExistingIn`` is not `None`, completed quanta 

890 (those with metadata, or all outputs if there is no metadata 

891 dataset configured) will be skipped rather than clobbered. 

892 constrainedByAllDatasets : `bool`, optional 

893 Indicates if the commonDataIds were generated with a constraint on 

894 all dataset types. 

895 resolveRefs : `bool`, optional 

896 If `True` then resolve all input references and generate random 

897 dataset IDs for all output and intermediate datasets. True value 

898 requires ``run`` collection to be specified. 

899 

900 Raises 

901 ------ 

902 OutputExistsError 

903 Raised if an output dataset already exists in the output run 

904 and ``skipExistingIn`` does not include output run, or if only 

905 some outputs are present and ``clobberOutputs`` is `False`. 

906 """ 

907 # Run may be provided but it does not have to exist, in that case we 

908 # use it for resolving references but don't check it for existing refs. 

909 run_exists = False 

910 if run: 

911 try: 

912 run_exists = bool(registry.queryCollections(run)) 

913 except MissingCollectionError: 

914 # Undocumented exception is raise if it does not exist 

915 pass 

916 

917 skip_collections_wildcard: CollectionWildcard | None = None 

918 skipExistingInRun = False 

919 if skipExistingIn: 

920 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

921 if run_exists: 

922 # as optimization check in the explicit list of names first 

923 skipExistingInRun = run in skip_collections_wildcard.strings 

924 if not skipExistingInRun: 

925 # need to flatten it and check again 

926 skipExistingInRun = run in registry.queryCollections( 

927 skipExistingIn, 

928 collectionTypes=CollectionType.RUN, 

929 ) 

930 

931 idMaker: Optional[_DatasetIdMaker] = None 

932 if resolveRefs: 

933 assert run is not None, "run cannot be None when resolveRefs is True" 

934 idMaker = _DatasetIdMaker(registry, run) 

935 

936 resolvedRefQueryResults: Iterable[DatasetRef] 

937 

938 # Updating constrainedByAllDatasets here is not ideal, but we have a 

939 # few different code paths that each transfer different pieces of 

940 # information about what dataset query constraints were applied here, 

941 # and none of them has the complete picture until we get here. We're 

942 # long overdue for a QG generation rewrite that will make this go away 

943 # entirely anyway. 

944 constrainedByAllDatasets = ( 

945 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

946 ) 

947 

948 # Look up [init] intermediate and output datasets in the output 

949 # collection, if there is an output collection. 

950 if run_exists or skip_collections_wildcard is not None: 

951 for datasetType, refs in itertools.chain( 

952 self.initIntermediates.items(), 

953 self.initOutputs.items(), 

954 self.intermediates.items(), 

955 self.outputs.items(), 

956 ): 

957 _LOG.debug( 

958 "Resolving %d datasets for intermediate and/or output dataset %s.", 

959 len(refs), 

960 datasetType.name, 

961 ) 

962 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

963 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

964 # TODO: this assert incorrectly bans component inputs; 

965 # investigate on DM-33027. 

966 # assert not datasetType.isComponent(), \ 

967 # "Output datasets cannot be components." 

968 # 

969 # Instead we have to handle them manually to avoid a 

970 # deprecation warning, but it is at least confusing and 

971 # possibly a bug for components to appear here at all. 

972 if datasetType.isComponent(): 

973 parent_dataset_type = datasetType.makeCompositeDatasetType() 

974 component = datasetType.component() 

975 else: 

976 parent_dataset_type = datasetType 

977 component = None 

978 

979 # look at RUN collection first 

980 if run_exists: 

981 try: 

982 resolvedRefQueryResults = subset.findDatasets( 

983 parent_dataset_type, collections=run, findFirst=True 

984 ) 

985 except MissingDatasetTypeError: 

986 resolvedRefQueryResults = [] 

987 for resolvedRef in resolvedRefQueryResults: 

988 # TODO: we could easily support per-DatasetType 

989 # skipExisting and I could imagine that being useful - 

990 # it's probably required in order to support writing 

991 # initOutputs before QuantumGraph generation. 

992 assert resolvedRef.dataId in refs 

993 if not (skipExistingInRun or isInit or clobberOutputs): 

994 raise OutputExistsError( 

995 f"Output dataset {datasetType.name} already exists in " 

996 f"output RUN collection '{run}' with data ID" 

997 f" {resolvedRef.dataId}." 

998 ) 

999 # If we are going to resolve all outputs then we have 

1000 # to remember existing ones to avoid generating new 

1001 # dataset IDs for them. 

1002 if resolveRefs: 

1003 refs[resolvedRef.dataId] = ( 

1004 resolvedRef.makeComponentRef(component) 

1005 if component is not None 

1006 else resolvedRef 

1007 ) 

1008 

1009 # And check skipExistingIn too, if RUN collection is in 

1010 # it is handled above 

1011 if skip_collections_wildcard is not None: 

1012 try: 

1013 resolvedRefQueryResults = subset.findDatasets( 

1014 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True 

1015 ) 

1016 except MissingDatasetTypeError: 

1017 resolvedRefQueryResults = [] 

1018 for resolvedRef in resolvedRefQueryResults: 

1019 assert resolvedRef.dataId in refs 

1020 refs[resolvedRef.dataId] = ( 

1021 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1022 ) 

1023 

1024 # Look up input and initInput datasets in the input collection(s). 

1025 # container to accumulate unfound refs, if the common dataIs were not 

1026 # constrained on dataset type existence. 

1027 self.unfoundRefs = set() 

1028 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1029 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

1030 if datasetType.isComponent(): 

1031 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1032 component = datasetType.component() 

1033 else: 

1034 parent_dataset_type = datasetType 

1035 component = None 

1036 try: 

1037 resolvedRefQueryResults = commonDataIds.subset( 

1038 datasetType.dimensions, unique=True 

1039 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1040 except MissingDatasetTypeError: 

1041 resolvedRefQueryResults = [] 

1042 dataIdsNotFoundYet = set(refs.keys()) 

1043 for resolvedRef in resolvedRefQueryResults: 

1044 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1045 refs[resolvedRef.dataId] = ( 

1046 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1047 ) 

1048 if dataIdsNotFoundYet: 

1049 if constrainedByAllDatasets: 

1050 raise RuntimeError( 

1051 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1052 f"'{datasetType.name}' was/were present in a previous " 

1053 "query, but could not be found now. " 

1054 "This is either a logic bug in QuantumGraph generation " 

1055 "or the input collections have been modified since " 

1056 "QuantumGraph generation began." 

1057 ) 

1058 elif not datasetType.dimensions: 

1059 raise RuntimeError( 

1060 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1061 f"collections {collections}." 

1062 ) 

1063 else: 

1064 # if the common dataIds were not constrained using all the 

1065 # input dataset types, it is possible that some data ids 

1066 # found dont correspond to existing dataset types and they 

1067 # will be un-resolved. Mark these for later pruning from 

1068 # the quantum graph. 

1069 for k in dataIdsNotFoundYet: 

1070 self.unfoundRefs.add(refs[k]) 

1071 

1072 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1073 # replacing the unresolved refs there, and then look up prerequisites. 

1074 for task in self.tasks: 

1075 _LOG.debug( 

1076 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1077 len(task.quanta), 

1078 task.taskDef.label, 

1079 ) 

1080 # The way iterConnections is designed makes it impossible to 

1081 # annotate precisely enough to satisfy MyPy here. 

1082 lookupFunctions = { 

1083 c.name: c.lookupFunction # type: ignore 

1084 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1085 if c.lookupFunction is not None # type: ignore 

1086 } 

1087 dataIdsFailed = [] 

1088 dataIdsSucceeded = [] 

1089 for quantum in task.quanta.values(): 

1090 # Process outputs datasets only if skipExistingIn is not None 

1091 # or there is a run to look for outputs in and clobberOutputs 

1092 # is True. Note that if skipExistingIn is None, any output 

1093 # datasets that already exist would have already caused an 

1094 # exception to be raised. We never update the DatasetRefs in 

1095 # the quantum because those should never be resolved. 

1096 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1097 resolvedRefs = [] 

1098 unresolvedRefs = [] 

1099 haveMetadata = False 

1100 for datasetType, originalRefs in quantum.outputs.items(): 

1101 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1102 if ref.id is not None: 

1103 resolvedRefs.append(ref) 

1104 if datasetType.name == task.taskDef.metadataDatasetName: 

1105 haveMetadata = True 

1106 else: 

1107 unresolvedRefs.append(ref) 

1108 if resolvedRefs: 

1109 if haveMetadata or not unresolvedRefs: 

1110 dataIdsSucceeded.append(quantum.dataId) 

1111 if skip_collections_wildcard is not None: 

1112 continue 

1113 else: 

1114 dataIdsFailed.append(quantum.dataId) 

1115 if not clobberOutputs: 

1116 raise OutputExistsError( 

1117 f"Quantum {quantum.dataId} of task with label " 

1118 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1119 f"({resolvedRefs}) " 

1120 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

1121 "and clobbering outputs was not enabled." 

1122 ) 

1123 # Update the input DatasetRefs to the resolved ones we already 

1124 # searched for. 

1125 for datasetType, input_refs in quantum.inputs.items(): 

1126 for ref in task.inputs.extract(datasetType, input_refs.keys()): 

1127 input_refs[ref.dataId] = ref 

1128 # Look up prerequisite datasets in the input collection(s). 

1129 # These may have dimensions that extend beyond those we queried 

1130 # for originally, because we want to permit those data ID 

1131 # values to differ across quanta and dataset types. 

1132 for datasetType in task.prerequisites: 

1133 if datasetType.isComponent(): 

1134 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1135 component = datasetType.component() 

1136 else: 

1137 parent_dataset_type = datasetType 

1138 component = None 

1139 lookupFunction = lookupFunctions.get(datasetType.name) 

1140 if lookupFunction is not None: 

1141 # PipelineTask has provided its own function to do the 

1142 # lookup. This always takes precedence. 

1143 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1144 elif ( 

1145 datasetType.isCalibration() 

1146 and datasetType.dimensions <= quantum.dataId.graph 

1147 and quantum.dataId.graph.temporal 

1148 ): 

1149 # This is a master calibration lookup, which we have to 

1150 # handle specially because the query system can't do a 

1151 # temporal join on a non-dimension-based timespan yet. 

1152 timespan = quantum.dataId.timespan 

1153 try: 

1154 prereq_ref = registry.findDataset( 

1155 parent_dataset_type, 

1156 quantum.dataId, 

1157 collections=collections, 

1158 timespan=timespan, 

1159 ) 

1160 if prereq_ref is not None: 

1161 if component is not None: 

1162 prereq_ref = prereq_ref.makeComponentRef(component) 

1163 prereq_refs = [prereq_ref] 

1164 else: 

1165 prereq_refs = [] 

1166 except (KeyError, MissingDatasetTypeError): 

1167 # This dataset type is not present in the registry, 

1168 # which just means there are no datasets here. 

1169 prereq_refs = [] 

1170 else: 

1171 # Most general case. 

1172 prereq_refs = [ 

1173 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1174 for prereq_ref in registry.queryDatasets( 

1175 parent_dataset_type, 

1176 collections=collections, 

1177 dataId=quantum.dataId, 

1178 findFirst=True, 

1179 ).expanded() 

1180 ] 

1181 prereq_refs_map = {ref.dataId: ref for ref in prereq_refs if ref is not None} 

1182 quantum.prerequisites[datasetType].update(prereq_refs_map) 

1183 task.prerequisites[datasetType].update(prereq_refs_map) 

1184 

1185 # Resolve all quantum inputs and outputs. 

1186 if idMaker: 

1187 for datasetDict in (quantum.inputs, quantum.outputs): 

1188 for refDict in datasetDict.values(): 

1189 refDict.update(idMaker.resolveDict(refDict)) 

1190 

1191 # Resolve task initInputs and initOutputs. 

1192 if idMaker: 

1193 for datasetDict in (task.initInputs, task.initOutputs): 

1194 for refDict in datasetDict.values(): 

1195 refDict.update(idMaker.resolveDict(refDict)) 

1196 

1197 # Actually remove any quanta that we decided to skip above. 

1198 if dataIdsSucceeded: 

1199 if skip_collections_wildcard is not None: 

1200 _LOG.debug( 

1201 "Pruning successful %d quanta for task with label '%s' because all of their " 

1202 "outputs exist or metadata was written successfully.", 

1203 len(dataIdsSucceeded), 

1204 task.taskDef.label, 

1205 ) 

1206 for dataId in dataIdsSucceeded: 

1207 del task.quanta[dataId] 

1208 elif clobberOutputs: 

1209 _LOG.info( 

1210 "Found %d successful quanta for task with label '%s' " 

1211 "that will need to be clobbered during execution.", 

1212 len(dataIdsSucceeded), 

1213 task.taskDef.label, 

1214 ) 

1215 else: 

1216 raise AssertionError("OutputExistsError should have already been raised.") 

1217 if dataIdsFailed: 

1218 if clobberOutputs: 

1219 _LOG.info( 

1220 "Found %d failed/incomplete quanta for task with label '%s' " 

1221 "that will need to be clobbered during execution.", 

1222 len(dataIdsFailed), 

1223 task.taskDef.label, 

1224 ) 

1225 else: 

1226 raise AssertionError("OutputExistsError should have already been raised.") 

1227 

1228 # Collect initOutputs that do not belong to any task. 

1229 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1230 for task in self.tasks: 

1231 global_dataset_types -= set(task.initOutputs) 

1232 if global_dataset_types: 

1233 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1234 if idMaker is not None: 

1235 for refDict in self.globalInitOutputs.values(): 

1236 refDict.update(idMaker.resolveDict(refDict)) 

1237 

1238 def makeQuantumGraph( 

1239 self, 

1240 registry: Registry, 

1241 metadata: Optional[Mapping[str, Any]] = None, 

1242 datastore: Optional[Datastore] = None, 

1243 ) -> QuantumGraph: 

1244 """Create a `QuantumGraph` from the quanta already present in 

1245 the scaffolding data structure. 

1246 

1247 Parameters 

1248 --------- 

1249 registry : `lsst.daf.butler.Registry` 

1250 Registry for the data repository; used for all data ID queries. 

1251 metadata : Optional Mapping of `str` to primitives 

1252 This is an optional parameter of extra data to carry with the 

1253 graph. Entries in this mapping should be able to be serialized in 

1254 JSON. 

1255 datastore : `Datastore`, optional 

1256 If not `None` then fill datastore records in each generated 

1257 Quantum. 

1258 

1259 Returns 

1260 ------- 

1261 graph : `QuantumGraph` 

1262 The full `QuantumGraph`. 

1263 """ 

1264 

1265 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1266 """Extract all DatasetRefs from the dictionaries""" 

1267 for ref_dict in dataset_dict.values(): 

1268 yield from ref_dict.values() 

1269 

1270 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1271 if datastore is not None: 

1272 datastore_records = datastore.export_records( 

1273 itertools.chain( 

1274 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites) 

1275 ) 

1276 ) 

1277 

1278 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

1279 for task in self.tasks: 

1280 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records) 

1281 graphInput[task.taskDef] = qset 

1282 

1283 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks} 

1284 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks} 

1285 

1286 globalInitOutputs: list[DatasetRef] = [] 

1287 if self.globalInitOutputs is not None: 

1288 for refs_dict in self.globalInitOutputs.values(): 

1289 globalInitOutputs.extend(refs_dict.values()) 

1290 

1291 graph = QuantumGraph( 

1292 graphInput, 

1293 metadata=metadata, 

1294 pruneRefs=self.unfoundRefs, 

1295 universe=self.dimensions.universe, 

1296 initInputs=taskInitInputs, 

1297 initOutputs=taskInitOutputs, 

1298 globalInitOutputs=globalInitOutputs, 

1299 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1300 ) 

1301 return graph 

1302 

1303 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1304 """Make a list of all dataset types used by a graph as defined in 

1305 registry. 

1306 """ 

1307 chain = [ 

1308 self.initInputs, 

1309 self.initIntermediates, 

1310 self.initOutputs, 

1311 self.inputs, 

1312 self.intermediates, 

1313 self.outputs, 

1314 self.prerequisites, 

1315 ] 

1316 if self.globalInitOutputs is not None: 

1317 chain.append(self.globalInitOutputs) 

1318 

1319 # Collect names of all dataset types. 

1320 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain)) 

1321 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1322 

1323 # Check for types that do not exist in registry yet: 

1324 # - inputs must exist 

1325 # - intermediates and outputs may not exist, but there must not be 

1326 # more than one definition (e.g. differing in storage class) 

1327 # - prerequisites may not exist, treat it the same as outputs here 

1328 for dstype in itertools.chain(self.initInputs, self.inputs): 

1329 if dstype.name not in dataset_types: 

1330 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1331 

1332 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1333 chain = [ 

1334 self.initIntermediates, 

1335 self.initOutputs, 

1336 self.intermediates, 

1337 self.outputs, 

1338 self.prerequisites, 

1339 ] 

1340 if self.globalInitOutputs is not None: 

1341 chain.append(self.globalInitOutputs) 

1342 for dstype in itertools.chain(*chain): 

1343 if dstype.name not in dataset_types: 

1344 new_outputs[dstype.name].add(dstype) 

1345 for name, dstypes in new_outputs.items(): 

1346 if len(dstypes) > 1: 

1347 raise ValueError( 

1348 "Pipeline contains multiple definitions for a dataset type " 

1349 f"which is not defined in registry yet: {dstypes}" 

1350 ) 

1351 elif len(dstypes) == 1: 

1352 dataset_types[name] = dstypes.pop() 

1353 

1354 return dataset_types.values() 

1355 

1356 

1357# ------------------------ 

1358# Exported definitions -- 

1359# ------------------------ 

1360 

1361 

1362class GraphBuilderError(Exception): 

1363 """Base class for exceptions generated by graph builder.""" 

1364 

1365 pass 

1366 

1367 

1368class OutputExistsError(GraphBuilderError): 

1369 """Exception generated when output datasets already exist.""" 

1370 

1371 pass 

1372 

1373 

1374class PrerequisiteMissingError(GraphBuilderError): 

1375 """Exception generated when a prerequisite dataset does not exist.""" 

1376 

1377 pass 

1378 

1379 

1380class GraphBuilder: 

1381 """GraphBuilder class is responsible for building task execution graph from 

1382 a Pipeline. 

1383 

1384 Parameters 

1385 ---------- 

1386 registry : `~lsst.daf.butler.Registry` 

1387 Data butler instance. 

1388 skipExistingIn 

1389 Expressions representing the collections to search for existing 

1390 output datasets that should be skipped. See 

1391 :ref:`daf_butler_ordered_collection_searches`. 

1392 clobberOutputs : `bool`, optional 

1393 If `True` (default), allow quanta to created even if partial outputs 

1394 exist; this requires the same behavior behavior to be enabled when 

1395 executing. 

1396 datastore : `Datastore`, optional 

1397 If not `None` then fill datastore records in each generated Quantum. 

1398 """ 

1399 

1400 def __init__( 

1401 self, 

1402 registry: Registry, 

1403 skipExistingIn: Any = None, 

1404 clobberOutputs: bool = True, 

1405 datastore: Optional[Datastore] = None, 

1406 ): 

1407 self.registry = registry 

1408 self.dimensions = registry.dimensions 

1409 self.skipExistingIn = skipExistingIn 

1410 self.clobberOutputs = clobberOutputs 

1411 self.datastore = datastore 

1412 

1413 def makeGraph( 

1414 self, 

1415 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1416 collections: Any, 

1417 run: Optional[str], 

1418 userQuery: Optional[str], 

1419 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1420 metadata: Optional[Mapping[str, Any]] = None, 

1421 resolveRefs: bool = False, 

1422 bind: Optional[Mapping[str, Any]] = None, 

1423 ) -> QuantumGraph: 

1424 """Create execution graph for a pipeline. 

1425 

1426 Parameters 

1427 ---------- 

1428 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1429 Pipeline definition, task names/classes and their configs. 

1430 collections 

1431 Expressions representing the collections to search for input 

1432 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1433 run : `str`, optional 

1434 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1435 output datasets. Collection does not have to exist and it will be 

1436 created when graph is executed. 

1437 userQuery : `str` 

1438 String which defines user-defined selection for registry, should be 

1439 empty or `None` if there is no restrictions on data selection. 

1440 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1441 The query constraint variant that should be used to constraint the 

1442 query based on dataset existance, defaults to 

1443 `DatasetQueryConstraintVariant.ALL`. 

1444 metadata : Optional Mapping of `str` to primitives 

1445 This is an optional parameter of extra data to carry with the 

1446 graph. Entries in this mapping should be able to be serialized in 

1447 JSON. 

1448 resolveRefs : `bool`, optional 

1449 If `True` then resolve all input references and generate random 

1450 dataset IDs for all output and intermediate datasets. True value 

1451 requires ``run`` collection to be specified. 

1452 bind : `Mapping`, optional 

1453 Mapping containing literal values that should be injected into the 

1454 ``userQuery`` expression, keyed by the identifiers they replace. 

1455 

1456 Returns 

1457 ------- 

1458 graph : `QuantumGraph` 

1459 

1460 Raises 

1461 ------ 

1462 UserExpressionError 

1463 Raised when user expression cannot be parsed. 

1464 OutputExistsError 

1465 Raised when output datasets already exist. 

1466 Exception 

1467 Other exceptions types may be raised by underlying registry 

1468 classes. 

1469 """ 

1470 if resolveRefs and run is None: 

1471 raise ValueError("`resolveRefs` requires `run` parameter.") 

1472 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1473 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1474 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1475 instrument_class: Optional[Any] = None 

1476 if isinstance(pipeline, Pipeline): 

1477 instrument_class_name = pipeline.getInstrument() 

1478 if instrument_class_name is not None: 

1479 instrument_class = doImportType(instrument_class_name) 

1480 pipeline = list(pipeline.toExpandedPipeline()) 

1481 if instrument_class is not None: 

1482 dataId = DataCoordinate.standardize( 

1483 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1484 ) 

1485 else: 

1486 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1487 with scaffolding.connectDataIds( 

1488 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1489 ) as commonDataIds: 

1490 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1491 scaffolding.resolveDatasetRefs( 

1492 self.registry, 

1493 collections, 

1494 run, 

1495 commonDataIds, 

1496 skipExistingIn=self.skipExistingIn, 

1497 clobberOutputs=self.clobberOutputs, 

1498 constrainedByAllDatasets=condition, 

1499 resolveRefs=resolveRefs, 

1500 ) 

1501 return scaffolding.makeQuantumGraph( 

1502 registry=self.registry, metadata=metadata, datastore=self.datastore 

1503 )