Coverage for python/lsst/pipe/base/graphBuilder.py: 16%

464 statements  

coverage.py v6.5.0, created at 2024-03-20 00:42 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap 

34from contextlib import contextmanager 

35from dataclasses import dataclass 

36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union 

37 

38from lsst.daf.butler import ( 

39 CollectionType, 

40 DataCoordinate, 

41 DatasetIdGenEnum, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 NamedValueSet, 

50 Quantum, 

51 Registry, 

52) 

53from lsst.daf.butler.registry import MissingDatasetTypeError 

54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

55from lsst.daf.butler.registry.wildcards import CollectionWildcard 

56from lsst.utils import doImportType 

57 

58from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

59from ._status import NoWorkFound 

60 

61# ----------------------------- 

62# Imports for other modules -- 

63# ----------------------------- 

64from .connections import AdjustQuantumHelper, iterConnections 

65from .graph import QuantumGraph 

66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

67 

68# ---------------------------------- 

69# Local non-exported definitions -- 

70# ---------------------------------- 

71 

72_LOG = logging.getLogger(__name__) 

73 

74 

75class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

76 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

77 the known `DatasetRef` instances of that type. 

78 

79 Parameters 

80 ---------- 

81 args 

82 Positional arguments are forwarded to the `dict` constructor. 

83 universe : `DimensionUniverse` 

84 Universe of all possible dimensions. 

85 """ 

86 

87 def __init__(self, *args: Any, universe: DimensionUniverse): 

88 super().__init__(*args) 

89 self.universe = universe 

90 

91 @classmethod 

92 def fromDatasetTypes( 

93 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

94 ) -> _DatasetDict: 

95 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

96 

97 Parameters 

98 ---------- 

99 datasetTypes : `iterable` of `DatasetType` 

100 DatasetTypes to use as keys for the dict. Values will be empty 

101 dictionaries. 

102 universe : `DimensionUniverse` 

103 Universe of all possible dimensions. 

104 

105 Returns 

106 ------- 

107 dictionary : `_DatasetDict` 

108 A new `_DatasetDict` instance. 

109 """ 

110 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

111 

112 @classmethod 

113 def fromSubset( 

114 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

115 ) -> _DatasetDict: 

116 """Return a new dictionary by extracting items corresponding to the 

117 given keys from one or more existing dictionaries. 

118 

119 Parameters 

120 ---------- 

121 datasetTypes : `iterable` of `DatasetType` 

122 DatasetTypes to use as keys for the dict. Values will be obtained 

123 by lookups against ``first`` and ``rest``. 

124 first : `_DatasetDict` 

125 Another dictionary from which to extract values. 

126 rest 

127 Additional dictionaries from which to extract values. 

128 

129 Returns 

130 ------- 

131 dictionary : `_DatasetDict` 

132 A new dictionary instance. 

133 """ 

134 combined = ChainMap(first, *rest) 

135 

136 # Dataset types known to match immediately can be processed 

137 # without checks. 

138 matches = combined.keys() & set(datasetTypes) 

139 _dict = {k: combined[k] for k in matches} 

140 

141 if len(_dict) < len(datasetTypes): 

142 # Work out which ones are missing. 

143 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

144 

145 # Get the known names for comparison. 

146 combined_by_name = {k.name: k for k in combined} 

147 

148 missing = set() 

149 incompatible = {} 

150 for datasetType in missing_datasetTypes: 

151 # The dataset type is not found. It may not be listed 

152 # or it may be that it is there with the same name 

153 # but different definition. 

154 if datasetType.name in combined_by_name: 

155 # This implies some inconsistency in definitions 

156 # for connections. If there is support for storage 

157 # class conversion we can let it slide. 

158 # At this point we do not know 

159 # where the inconsistency is but trust that down 

160 # stream code will be more explicit about input 

161 # vs output incompatibilities. 

162 existing = combined_by_name[datasetType.name] 

163 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing): 

164 _LOG.warning( 

165 "Dataset type mismatch (%s != %s) but continuing since they are compatible", 

166 datasetType, 

167 existing, 

168 ) 

169 _dict[datasetType] = combined[existing] 

170 else: 

171 incompatible[datasetType] = existing 

172 else: 

173 missing.add(datasetType) 

174 

175 if missing or incompatible: 

176 reasons = [] 

177 if missing: 

178 reasons.append( 

179 "DatasetTypes {'.'.join(missing)} not present in list of known types: " 

180 + ", ".join(d.name for d in combined) 

181 ) 

182 if incompatible: 

183 for x, y in incompatible.items(): 

184 reasons.append(f"{x} incompatible with {y}") 

185 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

186 

187 return cls(_dict, universe=first.universe) 

188 

189 @property 

190 def dimensions(self) -> DimensionGraph: 

191 """The union of all dimensions used by all dataset types in this 

192 dictionary, including implied dependencies (`DimensionGraph`). 

193 """ 

194 base = self.universe.empty 

195 if len(self) == 0: 

196 return base 

197 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

198 

199 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

200 """Unpack nested single-element `DatasetRef` dicts into a new 

201 mapping with `DatasetType` keys and `DatasetRef` values. 

202 

203 This method assumes that each nested dictionary contains exactly one

204 item, as is the case for all "init" datasets.

205 

206 Returns 

207 ------- 

208 dictionary : `NamedKeyDict` 

209 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

210 `DatasetType` instances and string names usable as keys. 

211 """ 

212 

213 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

214 (ref,) = refs.values() 

215 return ref 

216 

217 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

218 

219 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

220 """Unpack nested multi-element `DatasetRef` dicts into a new 

221 mapping with `DatasetType` keys and `list` of `DatasetRef` values.

222 

223 Returns 

224 ------- 

225 dictionary : `NamedKeyDict` 

226 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

227 both `DatasetType` instances and string names usable as keys. 

228 """ 

229 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

230 

231 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

232 """Iterate over the contained `DatasetRef` instances that match the 

233 given `DatasetType` and data IDs. 

234 

235 Parameters 

236 ---------- 

237 datasetType : `DatasetType` 

238 Dataset type to match. 

239 dataIds : `Iterable` [ `DataCoordinate` ] 

240 Data IDs to match. 

241 

242 Returns 

243 ------- 

244 refs : `Iterator` [ `DatasetRef` ] 

245 DatasetRef instances for which ``ref.datasetType == datasetType`` 

246 and ``ref.dataId`` is in ``dataIds``. 

247 """ 

248 refs = self[datasetType] 

249 return (refs[dataId] for dataId in dataIds) 

250 
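# Example (editor's hedged sketch; the dataset type and data ID names below
# are illustrative, not real ones): a _DatasetDict nests refs first by
# dataset type and then by data ID, and the unpack*/extract helpers flatten
# that nesting:
#
#     dd = _DatasetDict.fromDatasetTypes([calexp_type], universe=universe)
#     dd[calexp_type][data_id] = DatasetRef(calexp_type, data_id)
#     dd.unpackMultiRefs()[calexp_type]         # -> [DatasetRef(...)]
#     list(dd.extract(calexp_type, [data_id]))  # -> [DatasetRef(...)]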

251 

252class _QuantumScaffolding: 

253 """Helper class aggregating information about a `Quantum`, used when 

254 constructing a `QuantumGraph`. 

255 

256 See `_PipelineScaffolding` for a top-down description of the full 

257 scaffolding data structure. 

258 

259 Parameters 

260 ---------- 

261 task : _TaskScaffolding 

262 Back-reference to the helper object for the `PipelineTask` this quantum 

263 represents an execution of. 

264 dataId : `DataCoordinate` 

265 Data ID for this quantum. 

266 """ 

267 

268 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

269 self.task = task 

270 self.dataId = dataId 

271 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

272 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

273 self.prerequisites = _DatasetDict.fromDatasetTypes( 

274 task.prerequisites.keys(), universe=dataId.universe 

275 ) 

276 

277 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

278 

279 def __repr__(self) -> str: 

280 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

281 

282 task: _TaskScaffolding 

283 """Back-reference to the helper object for the `PipelineTask` this quantum 

284 represents an execution of. 

285 """ 

286 

287 dataId: DataCoordinate 

288 """Data ID for this quantum. 

289 """ 

290 

291 inputs: _DatasetDict 

292 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

293 

294 This is initialized to map each `DatasetType` to an empty dictionary at 

295 construction. Those nested dictionaries are populated (with data IDs as 

296 keys) with unresolved `DatasetRef` instances in 

297 `_PipelineScaffolding.connectDataIds`. 

298 """ 

299 

300 outputs: _DatasetDict 

301 """Nested dictionary containing `DatasetRef` outputs this quantum. 

302 """ 

303 

304 prerequisites: _DatasetDict 

305 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

306 quantum. 

307 """ 

308 

309 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

310 """Transform the scaffolding object into a true `Quantum` instance. 

311 

312 Parameters 

313 ---------- 

314 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

315 If not `None` then fill datastore records in each generated Quantum 

316 using the records from this structure. 

317 

318 Returns 

319 ------- 

320 quantum : `Quantum` 

321 An actual `Quantum` instance. 

322 """ 

323 allInputs = self.inputs.unpackMultiRefs() 

324 allInputs.update(self.prerequisites.unpackMultiRefs()) 

325 # Give the task's Connections class an opportunity to remove some 

326 # inputs, or complain if they are unacceptable. 

327 # This will raise if one of the check conditions is not met, which is 

328 # the intended behavior. 

329 # If it raises NoWorkFound, there is a bug in the QG algorithm 

330 # or the adjustQuantum is incorrectly trying to make a prerequisite 

331 # input behave like a regular input; adjustQuantum should only raise 

332 # NoWorkFound if a regular input is missing, and it shouldn't be 

333 # possible for us to have generated ``self`` if that's true. 

334 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

335 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

336 initInputs = self.task.initInputs.unpackSingleRefs() 

337 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

338 if datastore_records is not None: 

339 quantum_records = {} 

340 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

341 input_refs += list(initInputs.values()) 

342 input_ids = set(ref.id for ref in input_refs if ref.id is not None) 

343 for datastore_name, records in datastore_records.items(): 

344 matching_records = records.subset(input_ids) 

345 if matching_records is not None: 

346 quantum_records[datastore_name] = matching_records 

347 return Quantum( 

348 taskName=self.task.taskDef.taskName, 

349 taskClass=self.task.taskDef.taskClass, 

350 dataId=self.dataId, 

351 initInputs=initInputs, 

352 inputs=helper.inputs, 

353 outputs=helper.outputs, 

354 datastore_records=quantum_records, 

355 ) 

356 
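# Example (editor's hedged sketch): once _PipelineScaffolding has populated
# a _QuantumScaffolding's inputs and outputs, converting it is a single
# call; datastore records are optional and only attached when provided:
#
#     quantum = quantum_scaffolding.makeQuantum()
#     quantum = quantum_scaffolding.makeQuantum(datastore_records=records)
#
# The returned Quantum carries the (possibly adjusted) inputs and outputs
# produced by AdjustQuantumHelper.adjust_in_place above.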

357 

358@dataclass 

359class _TaskScaffolding: 

360 """Helper class aggregating information about a `PipelineTask`, used when 

361 constructing a `QuantumGraph`. 

362 

363 See `_PipelineScaffolding` for a top-down description of the full 

364 scaffolding data structure. 

365 

366 Parameters 

367 ---------- 

368 taskDef : `TaskDef` 

369 Data structure that identifies the task class and its config. 

370 parent : `_PipelineScaffolding` 

371 The parent data structure that will hold the instance being 

372 constructed. 

373 datasetTypes : `TaskDatasetTypes` 

374 Data structure that categorizes the dataset types used by this task. 

375 """ 

376 

377 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

378 universe = parent.dimensions.universe 

379 self.taskDef = taskDef 

380 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

381 assert self.dimensions.issubset(parent.dimensions) 

382 # Initialize _DatasetDicts as subsets of the one or two 

383 # corresponding dicts in the parent _PipelineScaffolding. 

384 self.initInputs = _DatasetDict.fromSubset( 

385 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

386 ) 

387 self.initOutputs = _DatasetDict.fromSubset( 

388 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

389 ) 

390 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

391 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

392 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

393 self.dataIds: Set[DataCoordinate] = set() 

394 self.quanta = {} 

395 

396 def __repr__(self) -> str: 

397 # Default dataclass-injected __repr__ gets caught in an infinite loop 

398 # because of back-references. 

399 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

400 

401 taskDef: TaskDef 

402 """Data structure that identifies the task class and its config 

403 (`TaskDef`). 

404 """ 

405 

406 dimensions: DimensionGraph 

407 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

408 """ 

409 

410 initInputs: _DatasetDict 

411 """Dictionary containing information about datasets used to construct this 

412 task (`_DatasetDict`). 

413 """ 

414 

415 initOutputs: _DatasetDict 

416 """Dictionary containing information about datasets produced as a 

417 side-effect of constructing this task (`_DatasetDict`). 

418 """ 

419 

420 inputs: _DatasetDict 

421 """Dictionary containing information about datasets used as regular, 

422 graph-constraining inputs to this task (`_DatasetDict`). 

423 """ 

424 

425 outputs: _DatasetDict 

426 """Dictionary containing information about datasets produced by this task 

427 (`_DatasetDict`). 

428 """ 

429 

430 prerequisites: _DatasetDict 

431 """Dictionary containing information about input datasets that must be 

432 present in the repository before any Pipeline containing this task is run 

433 (`_DatasetDict`). 

434 """ 

435 

436 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

437 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

438 this task with that data ID. 

439 """ 

440 

441 def makeQuantumSet( 

442 self, 

443 unresolvedRefs: Optional[Set[DatasetRef]] = None, 

444 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

445 ) -> Set[Quantum]: 

446 """Create a `set` of `Quantum` from the information in ``self``. 

447 

448 Parameters 

449 ---------- 

450 unresolvedRefs : `set` [ `DatasetRef` ], optional 

451 Input dataset refs that have not been found. 

452 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional

453 If not `None`, use the records from this structure to attach datastore records to each generated `Quantum`.

454 

455 Returns 

456 ------- 

457 nodes : `set` of `Quantum` 

458 The `Quantum` elements corresponding to this task. 

459 """ 

460 if unresolvedRefs is None: 

461 unresolvedRefs = set() 

462 outputs = set() 

463 for q in self.quanta.values(): 

464 try: 

465 tmpQuanta = q.makeQuantum(datastore_records) 

466 outputs.add(tmpQuanta) 

467 except (NoWorkFound, FileNotFoundError) as exc: 

468 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

469 if unresolvedRefs.intersection(refs): 

470 # This means it is a node that is known to be pruned 

471 # later and should be left in even though some follow-up 

472 # queries fail. This allows the pruning to start from this 

473 # quantum with known issues, and prune other nodes it 

474 # touches. 

475 inputs = q.inputs.unpackMultiRefs() 

476 inputs.update(q.prerequisites.unpackMultiRefs()) 

477 tmpQuantum = Quantum( 

478 taskName=q.task.taskDef.taskName, 

479 taskClass=q.task.taskDef.taskClass, 

480 dataId=q.dataId, 

481 initInputs=q.task.initInputs.unpackSingleRefs(), 

482 inputs=inputs, 

483 outputs=q.outputs.unpackMultiRefs(), 

484 ) 

485 outputs.add(tmpQuantum) 

486 else: 

487 raise exc 

488 return outputs 

489 

490 

491class _DatasetIdMaker: 

492 """Helper class which generates random dataset UUIDs for unresolved 

493 datasets. 

494 """ 

495 

496 def __init__(self, registry: Registry, run: str): 

497 self.datasetIdFactory = registry.datasetIdFactory 

498 self.run = run 

499 # Dataset IDs generated so far 

500 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

501 

502 def resolveRef(self, ref: DatasetRef) -> DatasetRef: 

503 if ref.id is not None: 

504 return ref 

505 key = ref.datasetType, ref.dataId 

506 if (resolved := self.resolved.get(key)) is None: 

507 datasetId = self.datasetIdFactory.makeDatasetId( 

508 self.run, ref.datasetType, ref.dataId, DatasetIdGenEnum.UNIQUE 

509 ) 

510 resolved = ref.resolved(datasetId, self.run) 

511 self.resolved[key] = resolved 

512 return resolved 

513 

514 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]: 

515 """Resolve all unresolved references in the provided dictionary.""" 

516 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()} 

517 
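# Example (editor's hedged sketch; the run name is illustrative): the maker
# caches one resolved ref per (dataset type, data ID) pair, so the same
# unresolved ref always resolves to the same dataset ID:
#
#     id_maker = _DatasetIdMaker(registry, run="u/someone/run")
#     resolved = id_maker.resolveRef(unresolved_ref)
#     assert id_maker.resolveRef(unresolved_ref) == resolved  # cached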

518 

519@dataclass 

520class _PipelineScaffolding: 

521 """A helper data structure that organizes the information involved in 

522 constructing a `QuantumGraph` for a `Pipeline`. 

523 

524 Parameters 

525 ---------- 

526 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

527 Sequence of tasks from which a graph is to be constructed. Must 

528 have nested task classes already imported. 

529 universe : `DimensionUniverse` 

530 Universe of all possible dimensions. 

531 

532 Notes 

533 ----- 

534 The scaffolding data structure contains nested data structures for both 

535 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

536 data structures are shared between the pipeline-level structure (which 

537 aggregates all datasets and categorizes them from the perspective of the 

538 complete pipeline) and the individual tasks that use them as inputs and 

539 outputs. 

540 

541 `QuantumGraph` construction proceeds in four steps, with each corresponding 

542 to a different `_PipelineScaffolding` method: 

543 

544 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

545 the DatasetTypes used by the pipeline (delegating to 

546 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

547 nested `_TaskScaffolding` and `_DatasetDict` objects. 

548 

549 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

550 returns related tuples of all dimensions used to identify any regular 

551 input, output, and intermediate datasets (not prerequisites). We then 

552 iterate over these tuples of related dimensions, identifying the subsets 

553 that correspond to distinct data IDs for each task and dataset type, 

554 and then create `_QuantumScaffolding` objects. 

555 

556 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

557 dataset data IDs previously identified, transforming unresolved 

558 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

559 up prerequisite datasets for all quanta. 

560 

561 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

562 per-task `_QuantumScaffolding` objects. 

563 """ 

564 
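# The four steps above correspond, roughly, to this driver flow (editor's
# hedged sketch; GraphBuilder.makeGraph below is the real orchestration):
#
#     scaffolding = _PipelineScaffolding(pipeline, registry=registry)   # 1
#     with scaffolding.connectDataIds(
#         registry, collections, userQuery, dataId
#     ) as commonDataIds:                                                # 2
#         scaffolding.resolveDatasetRefs(
#             registry, collections, run, commonDataIds
#         )                                                              # 3
#         qgraph = scaffolding.makeQuantumGraph()                        # 4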

565 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry): 

566 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

567 self.tasks = [] 

568 # Aggregate and categorize the DatasetTypes in the Pipeline. 

569 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

570 # Construct dictionaries that map those DatasetTypes to structures 

571 # that will (later) hold additional information about them. 

572 for attr in ( 

573 "initInputs", 

574 "initIntermediates", 

575 "initOutputs", 

576 "inputs", 

577 "intermediates", 

578 "outputs", 

579 "prerequisites", 

580 ): 

581 setattr( 

582 self, 

583 attr, 

584 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

585 ) 

586 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

587 # Aggregate all dimensions for all non-init, non-prerequisite 

588 # DatasetTypes. These are the ones we'll include in the big join 

589 # query. 

590 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

591 # Construct scaffolding nodes for each Task, and add backreferences 

592 # to the Task from each DatasetScaffolding node. 

593 # Note that there's only one scaffolding node for each DatasetType, 

594 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

595 # reference it. 

596 if isinstance(pipeline, Pipeline): 

597 pipeline = pipeline.toExpandedPipeline() 

598 self.tasks = [ 

599 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

600 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

601 ] 

602 

603 def __repr__(self) -> str: 

604 # Default dataclass-injected __repr__ gets caught in an infinite loop 

605 # because of back-references. 

606 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

607 

608 tasks: List[_TaskScaffolding] 

609 """Scaffolding data structures for each task in the pipeline 

610 (`list` of `_TaskScaffolding`). 

611 """ 

612 

613 initInputs: _DatasetDict 

614 """Datasets consumed but not produced when constructing the tasks in this 

615 pipeline (`_DatasetDict`). 

616 """ 

617 

618 initIntermediates: _DatasetDict 

619 """Datasets that are both consumed and produced when constructing the tasks 

620 in this pipeline (`_DatasetDict`). 

621 """ 

622 

623 initOutputs: _DatasetDict 

624 """Datasets produced but not consumed when constructing the tasks in this 

625 pipeline (`_DatasetDict`). 

626 """ 

627 

628 inputs: _DatasetDict 

629 """Datasets that are consumed but not produced when running this pipeline 

630 (`_DatasetDict`). 

631 """ 

632 

633 intermediates: _DatasetDict 

634 """Datasets that are both produced and consumed when running this pipeline 

635 (`_DatasetDict`). 

636 """ 

637 

638 outputs: _DatasetDict 

639 """Datasets produced but not consumed when when running this pipeline 

640 (`_DatasetDict`). 

641 """ 

642 

643 prerequisites: _DatasetDict 

644 """Datasets that are consumed when running this pipeline and looked up 

645 per-Quantum when generating the graph (`_DatasetDict`). 

646 """ 

647 

648 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

649 """Datasets that should be used as constraints in the initial query, 

650 according to tasks (`NamedValueSet`). 

651 """ 

652 

653 dimensions: DimensionGraph 

654 """All dimensions used by any regular input, intermediate, or output 

655 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

656 Query" (`DimensionGraph`). 

657 

658 This is required to be a superset of all task quantum dimensions. 

659 """ 

660 

661 @contextmanager 

662 def connectDataIds( 

663 self, 

664 registry: Registry, 

665 collections: Any, 

666 userQuery: Optional[str], 

667 externalDataId: DataCoordinate, 

668 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

669 bind: Optional[Mapping[str, Any]] = None, 

670 ) -> Iterator[DataCoordinateQueryResults]: 

671 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

672 

673 This method populates `_TaskScaffolding.dataIds` and 

674 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

675 

676 Parameters 

677 ---------- 

678 registry : `lsst.daf.butler.Registry` 

679 Registry for the data repository; used for all data ID queries. 

680 collections 

681 Expressions representing the collections to search for input 

682 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

683 userQuery : `str` or `None` 

684 User-provided expression to limit the data IDs processed. 

685 externalDataId : `DataCoordinate` 

686 Externally-provided data ID that should be used to restrict the 

687 results, just as if these constraints had been included via ``AND`` 

688 in ``userQuery``. This includes (at least) any instrument named 

689 in the pipeline definition. 

690 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

691 The query constraint variant that should be used to constrain the 

692 query based on dataset existence; defaults to 

693 `DatasetQueryConstraintVariant.ALL`. 

694 bind : `Mapping`, optional 

695 Mapping containing literal values that should be injected into the 

696 ``userQuery`` expression, keyed by the identifiers they replace. 

697 

698 Returns 

699 ------- 

700 commonDataIds : \ 

701 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

702 An interface to a database temporary table containing all data IDs 

703 that will appear in this `QuantumGraph`. Returned inside a 

704 context manager, which will drop the temporary table at the end of 

705 the `with` block in which this method is called. 

706 """ 

707 _LOG.debug("Building query for data IDs.") 

708 # Initialization datasets always have empty data IDs. 

709 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

710 for datasetType, refs in itertools.chain( 

711 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

712 ): 

713 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

714 # Run one big query for the data IDs for task dimensions and regular 

715 # inputs and outputs. We limit the query to only dimensions that are 

716 # associated with the input dataset types, but don't (yet) try to 

717 # obtain the dataset_ids for those inputs. 

718 _LOG.debug( 

719 "Submitting data ID query over dimensions %s and materializing results.", 

720 list(self.dimensions.names), 

721 ) 

722 queryArgs: Dict[str, Any] = { 

723 "dimensions": self.dimensions, 

724 "where": userQuery, 

725 "dataId": externalDataId, 

726 "bind": bind, 

727 } 

728 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

729 _LOG.debug( 

730 "Constraining graph query using default of %s.", 

731 list(self.defaultDatasetQueryConstraints.names), 

732 ) 

733 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

734 queryArgs["collections"] = collections 

735 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

736 _LOG.debug("Not using dataset existence to constrain query.") 

737 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

738 constraint = set(datasetQueryConstraint) 

739 inputs = {k.name: k for k in self.inputs.keys()} 

740 if remainder := constraint.difference(inputs.keys()): 

741 raise ValueError( 

742 f"{remainder} dataset type(s) specified as a graph constraint, but" 

743 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

744 ) 

745 _LOG.debug(f"Constraining graph query using {constraint}") 

746 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

747 queryArgs["collections"] = collections 

748 else: 

749 raise ValueError( 

750 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

751 ) 

752 

753 if "datasets" in queryArgs: 

754 for i, dataset_type in enumerate(queryArgs["datasets"]): 

755 if dataset_type.isComponent(): 

756 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

757 

758 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

759 _LOG.debug("Expanding data IDs.") 

760 commonDataIds = commonDataIds.expanded() 

761 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

762 # Iterate over query results, populating data IDs for datasets and 

763 # quanta and then connecting them to each other. 

764 n = -1 

765 for n, commonDataId in enumerate(commonDataIds): 

766 # Create DatasetRefs for all DatasetTypes from this result row, 

767 # noting that we might have created some already. 

768 # We remember both those that already existed and those that we 

769 # create now. 

770 refsForRow = {} 

771 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

772 for datasetType, refs in itertools.chain( 

773 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

774 ): 

775 datasetDataId: Optional[DataCoordinate] 

776 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

777 datasetDataId = commonDataId.subset(datasetType.dimensions) 

778 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

779 ref = refs.get(datasetDataId) 

780 if ref is None: 

781 ref = DatasetRef(datasetType, datasetDataId) 

782 refs[datasetDataId] = ref 

783 refsForRow[datasetType.name] = ref 

784 # Create _QuantumScaffolding objects for all tasks from this 

785 # result row, noting that we might have created some already. 

786 for task in self.tasks: 

787 quantumDataId = commonDataId.subset(task.dimensions) 

788 quantum = task.quanta.get(quantumDataId) 

789 if quantum is None: 

790 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

791 task.quanta[quantumDataId] = quantum 

792 # Whether this is a new quantum or an existing one, we can 

793 # now associate the DatasetRefs for this row with it. The 

794 # fact that a Quantum data ID and a dataset data ID both 

795 # came from the same result row is what tells us they 

796 # should be associated. 

797 # Many of these associations will be duplicates (because 

798 # another query row that differed from this one only in 

799 # irrelevant dimensions already added them), and keying the 

800 # nested dictionaries by data ID lets us skip them. 

801 for datasetType in task.inputs: 

802 ref = refsForRow[datasetType.name] 

803 quantum.inputs[datasetType.name][ref.dataId] = ref 

804 for datasetType in task.outputs: 

805 ref = refsForRow[datasetType.name] 

806 quantum.outputs[datasetType.name][ref.dataId] = ref 

807 if n < 0: 

808 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

809 emptiness_explained = False 

810 for message in commonDataIds.explain_no_results(): 

811 _LOG.critical(message) 

812 emptiness_explained = True 

813 if not emptiness_explained: 

814 _LOG.critical( 

815 "To reproduce this query for debugging purposes, run " 

816 "Registry.queryDataIds with these arguments:" 

817 ) 

818 # We could just repr() the queryArgs dict to get something 

819 # the user could make sense of, but it's friendlier to 

820 # put these args in an easier-to-construct equivalent form 

821 # so they can read it more easily and copy and paste into 

822 # a Python terminal. 

823 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

824 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

825 if queryArgs["where"]: 

826 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

827 if "datasets" in queryArgs: 

828 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

829 if "collections" in queryArgs: 

830 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

831 _LOG.debug("Finished processing %d rows from data ID query.", n) 

832 yield commonDataIds 

833 

834 def resolveDatasetRefs( 

835 self, 

836 registry: Registry, 

837 collections: Any, 

838 run: Optional[str], 

839 commonDataIds: DataCoordinateQueryResults, 

840 *, 

841 skipExistingIn: Any = None, 

842 clobberOutputs: bool = True, 

843 constrainedByAllDatasets: bool = True, 

844 resolveRefs: bool = False, 

845 ) -> None: 

846 """Perform follow up queries for each dataset data ID produced in 

847 `fillDataIds`. 

848 

849 This method populates `_DatasetScaffolding.refs` (except for those in 

850 `prerequisites`). 

851 

852 Parameters 

853 ---------- 

854 registry : `lsst.daf.butler.Registry` 

855 Registry for the data repository; used for all data ID queries. 

856 collections 

857 Expressions representing the collections to search for input 

858 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

859 run : `str`, optional 

860 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

861 output datasets, if it already exists. 

862 commonDataIds : \ 

863 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

864 Result of a previous call to `connectDataIds`. 

865 skipExistingIn 

866 Expressions representing the collections to search for existing 

867 output datasets that should be skipped. See 

868 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

869 `None` or empty string/sequence disables skipping. 

870 clobberOutputs : `bool`, optional 

871 If `True` (default), allow quanta to be created even if outputs exist; 

872 this requires the same behavior to be enabled when 

873 executing. If ``skipExistingIn`` is not `None`, completed quanta 

874 (those with metadata, or all outputs if there is no metadata 

875 dataset configured) will be skipped rather than clobbered. 

876 constrainedByAllDatasets : `bool`, optional 

877 Indicates if the commonDataIds were generated with a constraint on 

878 all dataset types. 

879 resolveRefs : `bool`, optional 

880 If `True`, resolve all input references and generate random 

881 dataset IDs for all output and intermediate datasets. A true value 

882 requires the ``run`` collection to be specified. 

883 

884 Raises 

885 ------ 

886 OutputExistsError 

887 Raised if an output dataset already exists in the output run 

888 and ``skipExistingIn`` does not include output run, or if only 

889 some outputs are present and ``clobberOutputs`` is `False`. 

890 """ 

891 skip_collections_wildcard: CollectionWildcard | None = None 

892 skipExistingInRun = False 

893 if skipExistingIn: 

894 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

895 if run: 

896 # As an optimization, check the explicit list of names first. 

897 skipExistingInRun = run in skip_collections_wildcard.strings 

898 if not skipExistingInRun: 

899 # need to flatten it and check again 

900 skipExistingInRun = run in registry.queryCollections( 

901 skipExistingIn, 

902 collectionTypes=CollectionType.RUN, 

903 ) 

904 

905 idMaker: Optional[_DatasetIdMaker] = None 

906 if resolveRefs: 

907 assert run is not None, "run cannot be None when resolveRefs is True" 

908 idMaker = _DatasetIdMaker(registry, run) 

909 

910 resolvedRefQueryResults: Iterable[DatasetRef] 

911 

912 # Updating constrainedByAllDatasets here is not ideal, but we have a 

913 # few different code paths that each transfer different pieces of 

914 # information about what dataset query constraints were applied here, 

915 # and none of them has the complete picture until we get here. We're 

916 # long overdue for a QG generation rewrite that will make this go away 

917 # entirely anyway. 

918 constrainedByAllDatasets = ( 

919 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

920 ) 

921 

922 # Look up [init] intermediate and output datasets in the output 

923 # collection, if there is an output collection. 

924 if run is not None or skip_collections_wildcard is not None: 

925 for datasetType, refs in itertools.chain( 

926 self.initIntermediates.items(), 

927 self.initOutputs.items(), 

928 self.intermediates.items(), 

929 self.outputs.items(), 

930 ): 

931 _LOG.debug( 

932 "Resolving %d datasets for intermediate and/or output dataset %s.", 

933 len(refs), 

934 datasetType.name, 

935 ) 

936 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

937 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

938 # TODO: this assert incorrectly bans component inputs; 

939 # investigate on DM-33027. 

940 # assert not datasetType.isComponent(), \ 

941 # "Output datasets cannot be components." 

942 # 

943 # Instead we have to handle them manually to avoid a 

944 # deprecation warning, but it is at least confusing and 

945 # possibly a bug for components to appear here at all. 

946 if datasetType.isComponent(): 

947 parent_dataset_type = datasetType.makeCompositeDatasetType() 

948 component = datasetType.component() 

949 else: 

950 parent_dataset_type = datasetType 

951 component = None 

952 

953 # look at RUN collection first 

954 if run is not None: 

955 try: 

956 resolvedRefQueryResults = subset.findDatasets( 

957 parent_dataset_type, collections=run, findFirst=True 

958 ) 

959 except MissingDatasetTypeError: 

960 resolvedRefQueryResults = [] 

961 for resolvedRef in resolvedRefQueryResults: 

962 # TODO: we could easily support per-DatasetType 

963 # skipExisting and I could imagine that being useful - 

964 # it's probably required in order to support writing 

965 # initOutputs before QuantumGraph generation. 

966 assert resolvedRef.dataId in refs 

967 if not (skipExistingInRun or isInit or clobberOutputs): 

968 raise OutputExistsError( 

969 f"Output dataset {datasetType.name} already exists in " 

970 f"output RUN collection '{run}' with data ID" 

971 f" {resolvedRef.dataId}." 

972 ) 

973 # If we are going to resolve all outputs then we have 

974 # to remember existing ones to avoid generating new 

975 # dataset IDs for them. 

976 if resolveRefs: 

977 refs[resolvedRef.dataId] = ( 

978 resolvedRef.makeComponentRef(component) 

979 if component is not None 

980 else resolvedRef 

981 ) 

982 

983 # And check skipExistingIn too, if RUN collection is in 

984 # it is handled above 

985 if skip_collections_wildcard is not None: 

986 try: 

987 resolvedRefQueryResults = subset.findDatasets( 

988 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True 

989 ) 

990 except MissingDatasetTypeError: 

991 resolvedRefQueryResults = [] 

992 for resolvedRef in resolvedRefQueryResults: 

993 assert resolvedRef.dataId in refs 

994 refs[resolvedRef.dataId] = ( 

995 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

996 ) 

997 

998 # Look up input and initInput datasets in the input collection(s). 

999 # Container to accumulate unfound refs, used when the common data IDs 

1000 # were not constrained on dataset type existence. 

1001 self.unfoundRefs = set() 

1002 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1003 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

1004 if datasetType.isComponent(): 

1005 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1006 component = datasetType.component() 

1007 else: 

1008 parent_dataset_type = datasetType 

1009 component = None 

1010 try: 

1011 resolvedRefQueryResults = commonDataIds.subset( 

1012 datasetType.dimensions, unique=True 

1013 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1014 except MissingDatasetTypeError: 

1015 resolvedRefQueryResults = [] 

1016 dataIdsNotFoundYet = set(refs.keys()) 

1017 for resolvedRef in resolvedRefQueryResults: 

1018 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1019 refs[resolvedRef.dataId] = ( 

1020 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1021 ) 

1022 if dataIdsNotFoundYet: 

1023 if constrainedByAllDatasets: 

1024 raise RuntimeError( 

1025 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1026 f"'{datasetType.name}' was/were present in a previous " 

1027 f"query, but could not be found now." 

1028 f"This is either a logic bug in QuantumGraph generation " 

1029 f"or the input collections have been modified since " 

1030 f"QuantumGraph generation began." 

1031 ) 

1032 elif not datasetType.dimensions: 

1033 raise RuntimeError( 

1034 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1035 f"collections {collections}." 

1036 ) 

1037 else: 

1038 # if the common dataIds were not constrained using all the 

1039 # input dataset types, it is possible that some data ids 

1040 # found don't correspond to existing dataset types and they 

1041 # will be unresolved. Mark these for later pruning from 

1042 # the quantum graph. 

1043 for k in dataIdsNotFoundYet: 

1044 self.unfoundRefs.add(refs[k]) 

1045 

1046 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1047 # replacing the unresolved refs there, and then look up prerequisites. 

1048 for task in self.tasks: 

1049 _LOG.debug( 

1050 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1051 len(task.quanta), 

1052 task.taskDef.label, 

1053 ) 

1054 # The way iterConnections is designed makes it impossible to 

1055 # annotate precisely enough to satisfy MyPy here. 

1056 lookupFunctions = { 

1057 c.name: c.lookupFunction # type: ignore 

1058 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1059 if c.lookupFunction is not None # type: ignore 

1060 } 
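# Editor's note (hedged sketch): based on how lookupFunction is invoked
# further below, a custom lookup supplied by a prerequisiteInputs
# connection is called roughly like this (the function name is
# illustrative):
#
#     refs = my_lookup(datasetType, registry, quantum.dataId, collections)
#
# and it should return an iterable of DatasetRef for that quantum.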

1061 dataIdsFailed = [] 

1062 dataIdsSucceeded = [] 

1063 for quantum in task.quanta.values(): 

1064 # Process outputs datasets only if skipExistingIn is not None 

1065 # or there is a run to look for outputs in and clobberOutputs 

1066 # is True. Note that if skipExistingIn is None, any output 

1067 # datasets that already exist would have already caused an 

1068 # exception to be raised. We never update the DatasetRefs in 

1069 # the quantum because those should never be resolved. 

1070 if skip_collections_wildcard is not None or (run is not None and clobberOutputs): 

1071 resolvedRefs = [] 

1072 unresolvedRefs = [] 

1073 haveMetadata = False 

1074 for datasetType, originalRefs in quantum.outputs.items(): 

1075 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1076 if ref.id is not None: 

1077 resolvedRefs.append(ref) 

1078 if datasetType.name == task.taskDef.metadataDatasetName: 

1079 haveMetadata = True 

1080 else: 

1081 unresolvedRefs.append(ref) 

1082 if resolvedRefs: 

1083 if haveMetadata or not unresolvedRefs: 

1084 dataIdsSucceeded.append(quantum.dataId) 

1085 if skip_collections_wildcard is not None: 

1086 continue 

1087 else: 

1088 dataIdsFailed.append(quantum.dataId) 

1089 if not clobberOutputs: 

1090 raise OutputExistsError( 

1091 f"Quantum {quantum.dataId} of task with label " 

1092 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1093 f"({resolvedRefs}) " 

1094 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

1095 "and clobbering outputs was not enabled." 

1096 ) 

1097 # Update the input DatasetRefs to the resolved ones we already 

1098 # searched for. 

1099 for datasetType, input_refs in quantum.inputs.items(): 

1100 for ref in task.inputs.extract(datasetType, input_refs.keys()): 

1101 input_refs[ref.dataId] = ref 

1102 # Look up prerequisite datasets in the input collection(s). 

1103 # These may have dimensions that extend beyond those we queried 

1104 # for originally, because we want to permit those data ID 

1105 # values to differ across quanta and dataset types. 

1106 for datasetType in task.prerequisites: 

1107 if datasetType.isComponent(): 

1108 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1109 component = datasetType.component() 

1110 else: 

1111 parent_dataset_type = datasetType 

1112 component = None 

1113 lookupFunction = lookupFunctions.get(datasetType.name) 

1114 if lookupFunction is not None: 

1115 # PipelineTask has provided its own function to do the 

1116 # lookup. This always takes precedence. 

1117 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1118 elif ( 

1119 datasetType.isCalibration() 

1120 and datasetType.dimensions <= quantum.dataId.graph 

1121 and quantum.dataId.graph.temporal 

1122 ): 

1123 # This is a master calibration lookup, which we have to 

1124 # handle specially because the query system can't do a 

1125 # temporal join on a non-dimension-based timespan yet. 

1126 timespan = quantum.dataId.timespan 

1127 try: 

1128 prereq_ref = registry.findDataset( 

1129 parent_dataset_type, 

1130 quantum.dataId, 

1131 collections=collections, 

1132 timespan=timespan, 

1133 ) 

1134 if prereq_ref is not None: 

1135 if component is not None: 

1136 prereq_ref = prereq_ref.makeComponentRef(component) 

1137 prereq_refs = [prereq_ref] 

1138 else: 

1139 prereq_refs = [] 

1140 except (KeyError, MissingDatasetTypeError): 

1141 # This dataset type is not present in the registry, 

1142 # which just means there are no datasets here. 

1143 prereq_refs = [] 

1144 else: 

1145 # Most general case. 

1146 prereq_refs = [ 

1147 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1148 for prereq_ref in registry.queryDatasets( 

1149 parent_dataset_type, 

1150 collections=collections, 

1151 dataId=quantum.dataId, 

1152 findFirst=True, 

1153 ).expanded() 

1154 ] 

1155 quantum.prerequisites[datasetType].update( 

1156 {ref.dataId: ref for ref in prereq_refs if ref is not None} 

1157 ) 

1158 

1159 # Resolve all quantum inputs and outputs. 

1160 if idMaker: 

1161 for datasetDict in (quantum.inputs, quantum.outputs): 

1162 for refDict in datasetDict.values(): 

1163 refDict.update(idMaker.resolveDict(refDict)) 

1164 

1165 # Resolve task initInputs and initOutputs. 

1166 if idMaker: 

1167 for datasetDict in (task.initInputs, task.initOutputs): 

1168 for refDict in datasetDict.values(): 

1169 refDict.update(idMaker.resolveDict(refDict)) 

1170 

1171 # Actually remove any quanta that we decided to skip above. 

1172 if dataIdsSucceeded: 

1173 if skip_collections_wildcard is not None: 

1174 _LOG.debug( 

1175 "Pruning successful %d quanta for task with label '%s' because all of their " 

1176 "outputs exist or metadata was written successfully.", 

1177 len(dataIdsSucceeded), 

1178 task.taskDef.label, 

1179 ) 

1180 for dataId in dataIdsSucceeded: 

1181 del task.quanta[dataId] 

1182 elif clobberOutputs: 

1183 _LOG.info( 

1184 "Found %d successful quanta for task with label '%s' " 

1185 "that will need to be clobbered during execution.", 

1186 len(dataIdsSucceeded), 

1187 task.taskDef.label, 

1188 ) 

1189 else: 

1190 raise AssertionError("OutputExistsError should have already been raised.") 

1191 if dataIdsFailed: 

1192 if clobberOutputs: 

1193 _LOG.info( 

1194 "Found %d failed/incomplete quanta for task with label '%s' " 

1195 "that will need to be clobbered during execution.", 

1196 len(dataIdsFailed), 

1197 task.taskDef.label, 

1198 ) 

1199 else: 

1200 raise AssertionError("OutputExistsError should have already been raised.") 

1201 

1202 def makeQuantumGraph( 

1203 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None 

1204 ) -> QuantumGraph: 

1205 """Create a `QuantumGraph` from the quanta already present in 

1206 the scaffolding data structure. 

1207 

1208 Parameters 

1209 ---------- 

1210 metadata : `Mapping` [ `str`, `Any` ], optional 

1211 This is an optional parameter of extra data to carry with the 

1212 graph. Entries in this mapping should be able to be serialized in 

1213 JSON. 

1214 datastore : `Datastore`, optional 

1215 If not `None` then fill datastore records in each generated 

1216 Quantum. 

1217 

1218 Returns 

1219 ------- 

1220 graph : `QuantumGraph` 

1221 The full `QuantumGraph`. 

1222 """ 

1223 

1224 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1225 """Extract all DatasetRefs from the dictionaries""" 

1226 for ref_dict in dataset_dict.values(): 

1227 yield from ref_dict.values() 

1228 

1229 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1230 if datastore is not None: 

1231 datastore_records = datastore.export_records( 

1232 itertools.chain( 

1233 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites) 

1234 ) 

1235 ) 

1236 

1237 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

1238 for task in self.tasks: 

1239 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records) 

1240 graphInput[task.taskDef] = qset 

1241 

1242 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks} 

1243 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks} 

1244 

1245 graph = QuantumGraph( 

1246 graphInput, 

1247 metadata=metadata, 

1248 pruneRefs=self.unfoundRefs, 

1249 universe=self.dimensions.universe, 

1250 initInputs=taskInitInputs, 

1251 initOutputs=taskInitOutputs, 

1252 ) 

1253 return graph 

1254 

1255 

1256# ------------------------ 

1257# Exported definitions -- 

1258# ------------------------ 

1259 

1260 

1261class GraphBuilderError(Exception): 

1262 """Base class for exceptions generated by graph builder.""" 

1263 

1264 pass 

1265 

1266 

1267class OutputExistsError(GraphBuilderError): 

1268 """Exception generated when output datasets already exist.""" 

1269 

1270 pass 

1271 

1272 

1273class PrerequisiteMissingError(GraphBuilderError): 

1274 """Exception generated when a prerequisite dataset does not exist.""" 

1275 

1276 pass 

1277 

1278 

1279class GraphBuilder: 

1280 """GraphBuilder class is responsible for building task execution graph from 

1281 a Pipeline. 

1282 

1283 Parameters 

1284 ---------- 

1285 registry : `~lsst.daf.butler.Registry` 

1286 Registry for the data repository. 

1287 skipExistingIn 

1288 Expressions representing the collections to search for existing 

1289 output datasets that should be skipped. See 

1290 :ref:`daf_butler_ordered_collection_searches`. 

1291 clobberOutputs : `bool`, optional 

1292 If `True` (default), allow quanta to be created even if partial outputs 

1293 exist; this requires the same behavior to be enabled when 

1294 executing. 

1295 datastore : `Datastore`, optional 

1296 If not `None` then fill datastore records in each generated Quantum. 

1297 """ 

1298 

1299 def __init__( 

1300 self, 

1301 registry: Registry, 

1302 skipExistingIn: Any = None, 

1303 clobberOutputs: bool = True, 

1304 datastore: Optional[Datastore] = None, 

1305 ): 

1306 self.registry = registry 

1307 self.dimensions = registry.dimensions 

1308 self.skipExistingIn = skipExistingIn 

1309 self.clobberOutputs = clobberOutputs 

1310 self.datastore = datastore 

1311 

1312 def makeGraph( 

1313 self, 

1314 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1315 collections: Any, 

1316 run: Optional[str], 

1317 userQuery: Optional[str], 

1318 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1319 metadata: Optional[Mapping[str, Any]] = None, 

1320 resolveRefs: bool = False, 

1321 bind: Optional[Mapping[str, Any]] = None, 

1322 ) -> QuantumGraph: 

1323 """Create execution graph for a pipeline. 

1324 

1325 Parameters 

1326 ---------- 

1327 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1328 Pipeline definition, task names/classes and their configs. 

1329 collections 

1330 Expressions representing the collections to search for input 

1331 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1332 run : `str`, optional 

1333 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1334 output datasets, if it already exists. 

1335 userQuery : `str` or `None` 

1336 String which defines a user-provided selection for the registry; should be 

1337 empty or `None` if there are no restrictions on data selection. 

1338 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1339 The query constraint variant that should be used to constrain the 

1340 query based on dataset existence; defaults to 

1341 `DatasetQueryConstraintVariant.ALL`. 

1342 metadata : `Mapping` [ `str`, `Any` ], optional 

1343 This is an optional parameter of extra data to carry with the 

1344 graph. Entries in this mapping should be able to be serialized in 

1345 JSON. 

1346 resolveRefs : `bool`, optional 

1347 If `True`, resolve all input references and generate random 

1348 dataset IDs for all output and intermediate datasets. A true value 

1349 requires the ``run`` collection to be specified. 

1350 bind : `Mapping`, optional 

1351 Mapping containing literal values that should be injected into the 

1352 ``userQuery`` expression, keyed by the identifiers they replace. 

1353 

1354 Returns 

1355 ------- 

1356 graph : `QuantumGraph` 

1357 

1358 Raises 

1359 ------ 

1360 UserExpressionError 

1361 Raised when user expression cannot be parsed. 

1362 OutputExistsError 

1363 Raised when output datasets already exist. 

1364 Exception 

1365 Other exceptions types may be raised by underlying registry 

1366 classes. 

1367 """ 

1368 if resolveRefs and run is None: 

1369 raise ValueError("`resolveRefs` requires `run` parameter.") 

1370 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1371 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1372 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1373 instrument_class: Optional[Any] = None 

1374 if isinstance(pipeline, Pipeline): 

1375 instrument_class_name = pipeline.getInstrument() 

1376 if instrument_class_name is not None: 

1377 instrument_class = doImportType(instrument_class_name) 

1378 pipeline = list(pipeline.toExpandedPipeline()) 

1379 if instrument_class is not None: 

1380 dataId = DataCoordinate.standardize( 

1381 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1382 ) 

1383 else: 

1384 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1385 with scaffolding.connectDataIds( 

1386 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1387 ) as commonDataIds: 

1388 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1389 scaffolding.resolveDatasetRefs( 

1390 self.registry, 

1391 collections, 

1392 run, 

1393 commonDataIds, 

1394 skipExistingIn=self.skipExistingIn, 

1395 clobberOutputs=self.clobberOutputs, 

1396 constrainedByAllDatasets=condition, 

1397 resolveRefs=resolveRefs, 

1398 ) 

1399 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)
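

# Example (editor's hedged sketch; collection, run, and query values are
# illustrative, and a `butler` with `.registry` and `.datastore` attributes
# is assumed): typical use builds a graph directly from a data repository:
#
#     builder = GraphBuilder(butler.registry, datastore=butler.datastore)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/someone/output-run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )
#
# The resulting QuantumGraph can then be saved or executed by downstream
# tooling.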