Coverage for python/lsst/pipe/base/graphBuilder.py: 14%

521 statements  

coverage.py v6.5.0, created at 2023-04-12 09:13 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap, defaultdict 

34from contextlib import contextmanager 

35from dataclasses import dataclass 

36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union 

37 

38from lsst.daf.butler import ( 

39 CollectionType, 

40 DataCoordinate, 

41 DatasetIdGenEnum, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 NamedValueSet, 

50 Quantum, 

51 Registry, 

52) 

53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

55from lsst.daf.butler.registry.wildcards import CollectionWildcard 

56from lsst.utils import doImportType 

57 

58from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

59from ._status import NoWorkFound 

60 

61# ----------------------------- 

62# Imports for other modules -- 

63# ----------------------------- 

64from .connections import AdjustQuantumHelper, iterConnections 

65from .graph import QuantumGraph 

66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

67 

68# ---------------------------------- 

69# Local non-exported definitions -- 

70# ---------------------------------- 

71 

72_LOG = logging.getLogger(__name__) 

73 

74 

75class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

76 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

77 the known `DatasetRef` instances of that type. 

78 

79 Parameters 

80 ---------- 

81 args 

82 Positional arguments are forwarded to the `dict` constructor. 

83 universe : `DimensionUniverse` 

84 Universe of all possible dimensions. 

85 """ 

86 

87 def __init__(self, *args: Any, universe: DimensionUniverse): 

88 super().__init__(*args) 

89 self.universe = universe 

90 

91 @classmethod 

92 def fromDatasetTypes( 

93 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

94 ) -> _DatasetDict: 

95 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

96 

97 Parameters 

98 ---------- 

99 datasetTypes : `iterable` of `DatasetType` 

100 DatasetTypes to use as keys for the dict. Values will be empty 

101 dictionaries. 

102 universe : `DimensionUniverse` 

103 Universe of all possible dimensions. 

104 

105 Returns 

106 ------- 

107 dictionary : `_DatasetDict` 

108 A new `_DatasetDict` instance. 

109 """ 

110 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

111 

112 @classmethod 

113 def fromSubset( 

114 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

115 ) -> _DatasetDict: 

116 """Return a new dictionary by extracting items corresponding to the 

117 given keys from one or more existing dictionaries. 

118 

119 Parameters 

120 ---------- 

121 datasetTypes : `iterable` of `DatasetType` 

122 DatasetTypes to use as keys for the dict. Values will be obtained 

123 by lookups against ``first`` and ``rest``. 

124 first : `_DatasetDict` 

125 Another dictionary from which to extract values. 

126 rest 

127 Additional dictionaries from which to extract values. 

128 

129 Returns 

130 ------- 

131 dictionary : `_DatasetDict` 

132 A new dictionary instance. 

133 """ 

134 combined = ChainMap(first, *rest) 

135 

136 # Dataset types known to match immediately can be processed 

137 # without checks. 

138 matches = combined.keys() & set(datasetTypes) 

139 _dict = {k: combined[k] for k in matches} 

140 

141 if len(_dict) < len(datasetTypes): 

142 # Work out which ones are missing. 

143 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

144 

145 # Get the known names for comparison. 

146 combined_by_name = {k.name: k for k in combined} 

147 

148 missing = set() 

149 incompatible = {} 

150 for datasetType in missing_datasetTypes: 

151 # The dataset type is not found. It may not be listed 

152 # or it may be that it is there with the same name 

153 # but different definition. 

154 if datasetType.name in combined_by_name: 

155 # This implies some inconsistency in definitions 

156 # for connections. If there is support for storage 

157 # class conversion we can let it slide. 

158 # At this point we do not know 

159 # where the inconsistency is but trust that down 

160 # stream code will be more explicit about input 

161 # vs output incompatibilities. 

162 existing = combined_by_name[datasetType.name] 

163 convertible_to_existing = existing.is_compatible_with(datasetType) 

164 convertible_from_existing = datasetType.is_compatible_with(existing) 

165 if convertible_to_existing and convertible_from_existing: 

166 _LOG.debug( 

167 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

168 datasetType.name, 

169 datasetType.storageClass_name, 

170 existing.storageClass_name, 

171 ) 

172 _dict[datasetType] = combined[existing] 

173 elif convertible_to_existing or convertible_from_existing: 

174 # We'd need to refactor a fair amount to recognize 

175 # whether this is an error or not, so I'm not going to 

176 # bother until we need to do that for other reasons 

177 # (it won't be too long). 

178 _LOG.info( 

179 "Dataset type %s is present with multiple only partially-compatible storage " 

180 "classes %s and %s.", 

181 datasetType.name, 

182 datasetType.storageClass_name, 

183 existing.storageClass_name, 

184 ) 

185 _dict[datasetType] = combined[existing] 

186 else: 

187 incompatible[datasetType] = existing 

188 else: 

189 missing.add(datasetType) 

190 

191 if missing or incompatible: 

192 reasons = [] 

193 if missing: 

194 reasons.append( 

195 "DatasetTypes {'.'.join(missing)} not present in list of known types: " 

196 + ", ".join(d.name for d in combined) 

197 ) 

198 if incompatible: 

199 for x, y in incompatible.items(): 

200 reasons.append(f"{x} incompatible with {y}") 

201 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

202 

203 return cls(_dict, universe=first.universe) 

204 

205 @property 

206 def dimensions(self) -> DimensionGraph: 

207 """The union of all dimensions used by all dataset types in this 

208 dictionary, including implied dependencies (`DimensionGraph`). 

209 """ 

210 base = self.universe.empty 

211 if len(self) == 0: 

212 return base 

213 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

214 

215 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

216 """Unpack nested single-element `DatasetRef` dicts into a new 

217 mapping with `DatasetType` keys and `DatasetRef` values. 

218 

219 This method assumes that each nested dictionary contains exactly one 

220 item, as is the case for all "init" datasets. 

221 

222 Returns 

223 ------- 

224 dictionary : `NamedKeyDict` 

225 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

226 `DatasetType` instances and string names usable as keys. 

227 """ 

228 

229 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

230 (ref,) = refs.values() 

231 return ref 

232 

233 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

234 

235 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

236 """Unpack nested multi-element `DatasetRef` dicts into a new 

237 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

238 

239 Returns 

240 ------- 

241 dictionary : `NamedKeyDict` 

242 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

243 both `DatasetType` instances and string names usable as keys. 

244 """ 

245 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

246 

247 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

248 """Iterate over the contained `DatasetRef` instances that match the 

249 given `DatasetType` and data IDs. 

250 

251 Parameters 

252 ---------- 

253 datasetType : `DatasetType` 

254 Dataset type to match. 

255 dataIds : `Iterable` [ `DataCoordinate` ] 

256 Data IDs to match. 

257 

258 Returns 

259 ------- 

260 refs : `Iterator` [ `DatasetRef` ] 

261 DatasetRef instances for which ``ref.datasetType == datasetType`` 

262 and ``ref.dataId`` is in ``dataIds``. 

263 """ 

264 refs = self[datasetType] 

265 return (refs[dataId] for dataId in dataIds) 

266 
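# A minimal usage sketch for the _DatasetDict class above (not part of the
# original module). It assumes the default DimensionUniverse and a hypothetical
# dataset type with empty dimensions, mirroring how "init" datasets are handled
# later in this module, and contrasts unpackSingleRefs() with unpackMultiRefs().
#
#     from lsst.daf.butler import (
#         DataCoordinate, DatasetRef, DatasetType, DimensionUniverse
#     )
#
#     universe = DimensionUniverse()
#     init_type = DatasetType("example_init", universe.empty, "StructuredDataDict")
#     empty_id = DataCoordinate.makeEmpty(universe)
#     dataset_dict = _DatasetDict.fromDatasetTypes([init_type], universe=universe)
#     dataset_dict[init_type][empty_id] = DatasetRef(init_type, empty_id)
#     dataset_dict.unpackSingleRefs()  # {init_type: DatasetRef(...)}
#     dataset_dict.unpackMultiRefs()   # {init_type: [DatasetRef(...)]}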

267 

268class _QuantumScaffolding: 

269 """Helper class aggregating information about a `Quantum`, used when 

270 constructing a `QuantumGraph`. 

271 

272 See `_PipelineScaffolding` for a top-down description of the full 

273 scaffolding data structure. 

274 

275 Parameters 

276 ---------- 

277 task : _TaskScaffolding 

278 Back-reference to the helper object for the `PipelineTask` this quantum 

279 represents an execution of. 

280 dataId : `DataCoordinate` 

281 Data ID for this quantum. 

282 """ 

283 

284 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

285 self.task = task 

286 self.dataId = dataId 

287 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

288 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

289 self.prerequisites = _DatasetDict.fromDatasetTypes( 

290 task.prerequisites.keys(), universe=dataId.universe 

291 ) 

292 

293 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

294 

295 def __repr__(self) -> str: 

296 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

297 

298 task: _TaskScaffolding 

299 """Back-reference to the helper object for the `PipelineTask` this quantum 

300 represents an execution of. 

301 """ 

302 

303 dataId: DataCoordinate 

304 """Data ID for this quantum. 

305 """ 

306 

307 inputs: _DatasetDict 

308 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

309 

310 This is initialized to map each `DatasetType` to an empty dictionary at 

311 construction. Those nested dictionaries are populated (with data IDs as 

312 keys) with unresolved `DatasetRef` instances in 

313 `_PipelineScaffolding.connectDataIds`. 

314 """ 

315 

316 outputs: _DatasetDict 

317 """Nested dictionary containing `DatasetRef` outputs this quantum. 

318 """ 

319 

320 prerequisites: _DatasetDict 

321 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

322 quantum. 

323 """ 

324 

325 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

326 """Transform the scaffolding object into a true `Quantum` instance. 

327 

328 Parameters 

329 ---------- 

330 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

331 If not `None` then fill datastore records in each generated Quantum 

332 using the records from this structure. 

333 

334 Returns 

335 ------- 

336 quantum : `Quantum` 

337 An actual `Quantum` instance. 

338 """ 

339 allInputs = self.inputs.unpackMultiRefs() 

340 allInputs.update(self.prerequisites.unpackMultiRefs()) 

341 # Give the task's Connections class an opportunity to remove some 

342 # inputs, or complain if they are unacceptable. 

343 # This will raise if one of the check conditions is not met, which is 

344 # the intended behavior. 

345 # If it raises NoWorkFound, there is a bug in the QG algorithm 

346 # or the adjustQuantum is incorrectly trying to make a prerequisite 

347 # input behave like a regular input; adjustQuantum should only raise 

348 # NoWorkFound if a regular input is missing, and it shouldn't be 

349 # possible for us to have generated ``self`` if that's true. 

350 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

351 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

352 initInputs = self.task.initInputs.unpackSingleRefs() 

353 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

354 if datastore_records is not None: 

355 quantum_records = {} 

356 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

357 input_refs += list(initInputs.values()) 

358 input_ids = set(ref.id for ref in input_refs if ref.id is not None) 

359 for datastore_name, records in datastore_records.items(): 

360 matching_records = records.subset(input_ids) 

361 if matching_records is not None: 

362 quantum_records[datastore_name] = matching_records 

363 return Quantum( 

364 taskName=self.task.taskDef.taskName, 

365 taskClass=self.task.taskDef.taskClass, 

366 dataId=self.dataId, 

367 initInputs=initInputs, 

368 inputs=helper.inputs, 

369 outputs=helper.outputs, 

370 datastore_records=quantum_records, 

371 ) 

372 

373 

374@dataclass 

375class _TaskScaffolding: 

376 """Helper class aggregating information about a `PipelineTask`, used when 

377 constructing a `QuantumGraph`. 

378 

379 See `_PipelineScaffolding` for a top-down description of the full 

380 scaffolding data structure. 

381 

382 Parameters 

383 ---------- 

384 taskDef : `TaskDef` 

385 Data structure that identifies the task class and its config. 

386 parent : `_PipelineScaffolding` 

387 The parent data structure that will hold the instance being 

388 constructed. 

389 datasetTypes : `TaskDatasetTypes` 

390 Data structure that categorizes the dataset types used by this task. 

391 """ 

392 

393 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

394 universe = parent.dimensions.universe 

395 self.taskDef = taskDef 

396 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

397 assert self.dimensions.issubset(parent.dimensions) 

398 # Initialize _DatasetDicts as subsets of the one or two 

399 # corresponding dicts in the parent _PipelineScaffolding. 

400 self.initInputs = _DatasetDict.fromSubset( 

401 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

402 ) 

403 self.initOutputs = _DatasetDict.fromSubset( 

404 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

405 ) 

406 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

407 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

408 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

409 self.dataIds: Set[DataCoordinate] = set() 

410 self.quanta = {} 

411 

412 def __repr__(self) -> str: 

413 # Default dataclass-injected __repr__ gets caught in an infinite loop 

414 # because of back-references. 

415 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

416 

417 taskDef: TaskDef 

418 """Data structure that identifies the task class and its config 

419 (`TaskDef`). 

420 """ 

421 

422 dimensions: DimensionGraph 

423 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

424 """ 

425 

426 initInputs: _DatasetDict 

427 """Dictionary containing information about datasets used to construct this 

428 task (`_DatasetDict`). 

429 """ 

430 

431 initOutputs: _DatasetDict 

432 """Dictionary containing information about datasets produced as a 

433 side-effect of constructing this task (`_DatasetDict`). 

434 """ 

435 

436 inputs: _DatasetDict 

437 """Dictionary containing information about datasets used as regular, 

438 graph-constraining inputs to this task (`_DatasetDict`). 

439 """ 

440 

441 outputs: _DatasetDict 

442 """Dictionary containing information about datasets produced by this task 

443 (`_DatasetDict`). 

444 """ 

445 

446 prerequisites: _DatasetDict 

447 """Dictionary containing information about input datasets that must be 

448 present in the repository before any Pipeline containing this task is run 

449 (`_DatasetDict`). 

450 """ 

451 

452 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

453 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

454 this task with that data ID. 

455 """ 

456 

457 def makeQuantumSet( 

458 self, 

459 unresolvedRefs: Optional[Set[DatasetRef]] = None, 

460 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

461 ) -> Set[Quantum]: 

462 """Create a `set` of `Quantum` from the information in ``self``. 

463 

464 Parameters 

465 ---------- 

466 unresolvedRefs : `set` [ `DatasetRef` ], optional 

467 Input dataset refs that have not been found. 

468 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

469 If not `None` then fill datastore records in each generated 

470 `Quantum` using the records from this structure. 

471 Returns 

472 ------- 

473 nodes : `set` of `Quantum` 

474 The `Quantum` elements corresponding to this task. 

475 """ 

476 if unresolvedRefs is None: 

477 unresolvedRefs = set() 

478 outputs = set() 

479 for q in self.quanta.values(): 

480 try: 

481 tmpQuanta = q.makeQuantum(datastore_records) 

482 outputs.add(tmpQuanta) 

483 except (NoWorkFound, FileNotFoundError) as exc: 

484 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

485 if unresolvedRefs.intersection(refs): 

486 # This means it is a node that is known to be pruned 

487 # later and should be left in even though some follow-up 

488 # queries fail. This allows the pruning to start from this 

489 # quantum with known issues, and prune other nodes it 

490 # touches. 

491 inputs = q.inputs.unpackMultiRefs() 

492 inputs.update(q.prerequisites.unpackMultiRefs()) 

493 tmpQuantum = Quantum( 

494 taskName=q.task.taskDef.taskName, 

495 taskClass=q.task.taskDef.taskClass, 

496 dataId=q.dataId, 

497 initInputs=q.task.initInputs.unpackSingleRefs(), 

498 inputs=inputs, 

499 outputs=q.outputs.unpackMultiRefs(), 

500 ) 

501 outputs.add(tmpQuantum) 

502 else: 

503 raise exc 

504 return outputs 

505 

506 

507class _DatasetIdMaker: 

508 """Helper class which generates random dataset UUIDs for unresolved 

509 datasets. 

510 """ 

511 

512 def __init__(self, registry: Registry, run: str): 

513 self.datasetIdFactory = registry.datasetIdFactory 

514 self.run = run 

515 # Dataset IDs generated so far 

516 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

517 

518 def resolveRef(self, ref: DatasetRef) -> DatasetRef: 

519 if ref.id is not None: 

520 return ref 

521 

522 # For components we need their parent dataset ID. 

523 if ref.isComponent(): 

524 parent_ref = ref.makeCompositeRef() 

525 # Some basic check - parent should be resolved if this is an 

526 # existing input, or it should be in the cache already if it is 

527 # an intermediate. 

528 if parent_ref.id is None: 

529 key = parent_ref.datasetType, parent_ref.dataId 

530 if key not in self.resolved: 

531 raise ValueError(f"Composite dataset is missing from cache: {parent_ref}") 

532 parent_ref = self.resolved[key] 

533 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved" 

534 return ref.resolved(parent_ref.id, parent_ref.run) 

535 

536 key = ref.datasetType, ref.dataId 

537 if (resolved := self.resolved.get(key)) is None: 

538 resolved = self.datasetIdFactory.resolveRef(ref, self.run, DatasetIdGenEnum.UNIQUE) 

539 self.resolved[key] = resolved 

540 return resolved 

541 

542 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]: 

543 """Resolve all unresolved references in the provided dictionary.""" 

544 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()} 

545 
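# Illustrative sketch (not part of the original module): resolving the nested
# ref dictionaries of a scaffolding structure with _DatasetIdMaker. The
# ``registry``, ``scaffolding``, and run name below are hypothetical stand-ins;
# resolveRef() caches by (DatasetType, DataCoordinate), so repeated refs for the
# same dataset receive the same generated ID.
#
#     id_maker = _DatasetIdMaker(registry, run="u/someone/example-run")
#     for dataset_type, refs in scaffolding.outputs.items():
#         refs.update(id_maker.resolveDict(refs))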

546 

547@dataclass 

548class _PipelineScaffolding: 

549 """A helper data structure that organizes the information involved in 

550 constructing a `QuantumGraph` for a `Pipeline`. 

551 

552 Parameters 

553 ---------- 

554 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

555 Sequence of tasks from which a graph is to be constructed. Must 

556 have nested task classes already imported. 

557 universe : `DimensionUniverse` 

558 Universe of all possible dimensions. 

559 

560 Notes 

561 ----- 

562 The scaffolding data structure contains nested data structures for both 

563 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

564 data structures are shared between the pipeline-level structure (which 

565 aggregates all datasets and categorizes them from the perspective of the 

566 complete pipeline) and the individual tasks that use them as inputs and 

567 outputs. 

568 

569 `QuantumGraph` construction proceeds in four steps, with each corresponding 

570 to a different `_PipelineScaffolding` method: 

571 

572 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

573 the DatasetTypes used by the pipeline (delegating to 

574 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

575 nested `_TaskScaffolding` and `_DatasetDict` objects. 

576 

577 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

578 returns related tuples of all dimensions used to identify any regular 

579 input, output, and intermediate datasets (not prerequisites). We then 

580 iterate over these tuples of related dimensions, identifying the subsets 

581 that correspond to distinct data IDs for each task and dataset type, 

582 and then create `_QuantumScaffolding` objects. 

583 

584 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

585 dataset data IDs previously identified, transforming unresolved 

586 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

587 up prerequisite datasets for all quanta. 

588 

589 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

590 per-task `_QuantumScaffolding` objects; see the sketch after this docstring. 

591 """ 

592 

593 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry): 

594 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

595 self.tasks = [] 

596 # Aggregate and categorize the DatasetTypes in the Pipeline. 

597 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

598 # Construct dictionaries that map those DatasetTypes to structures 

599 # that will (later) hold additional information about them. 

600 for attr in ( 

601 "initInputs", 

602 "initIntermediates", 

603 "initOutputs", 

604 "inputs", 

605 "intermediates", 

606 "outputs", 

607 "prerequisites", 

608 ): 

609 setattr( 

610 self, 

611 attr, 

612 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

613 ) 

614 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

615 # Aggregate all dimensions for all non-init, non-prerequisite 

616 # DatasetTypes. These are the ones we'll include in the big join 

617 # query. 

618 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

619 # Construct scaffolding nodes for each Task, and add backreferences 

620 # to the Task from each DatasetScaffolding node. 

621 # Note that there's only one scaffolding node for each DatasetType, 

622 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

623 # reference it. 

624 if isinstance(pipeline, Pipeline): 

625 pipeline = pipeline.toExpandedPipeline() 

626 self.tasks = [ 

627 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

628 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

629 ] 

630 

631 def __repr__(self) -> str: 

632 # Default dataclass-injected __repr__ gets caught in an infinite loop 

633 # because of back-references. 

634 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

635 

636 tasks: List[_TaskScaffolding] 

637 """Scaffolding data structures for each task in the pipeline 

638 (`list` of `_TaskScaffolding`). 

639 """ 

640 

641 initInputs: _DatasetDict 

642 """Datasets consumed but not produced when constructing the tasks in this 

643 pipeline (`_DatasetDict`). 

644 """ 

645 

646 initIntermediates: _DatasetDict 

647 """Datasets that are both consumed and produced when constructing the tasks 

648 in this pipeline (`_DatasetDict`). 

649 """ 

650 

651 initOutputs: _DatasetDict 

652 """Datasets produced but not consumed when constructing the tasks in this 

653 pipeline (`_DatasetDict`). 

654 """ 

655 

656 inputs: _DatasetDict 

657 """Datasets that are consumed but not produced when running this pipeline 

658 (`_DatasetDict`). 

659 """ 

660 

661 intermediates: _DatasetDict 

662 """Datasets that are both produced and consumed when running this pipeline 

663 (`_DatasetDict`). 

664 """ 

665 

666 outputs: _DatasetDict 

667 """Datasets produced but not consumed when when running this pipeline 

668 (`_DatasetDict`). 

669 """ 

670 

671 prerequisites: _DatasetDict 

672 """Datasets that are consumed when running this pipeline and looked up 

673 per-Quantum when generating the graph (`_DatasetDict`). 

674 """ 

675 

676 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

677 """Datasets that should be used as constraints in the initial query, 

678 according to tasks (`NamedValueSet`). 

679 """ 

680 

681 dimensions: DimensionGraph 

682 """All dimensions used by any regular input, intermediate, or output 

683 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

684 Query" (`DimensionGraph`). 

685 

686 This is required to be a superset of all task quantum dimensions. 

687 """ 

688 

689 globalInitOutputs: _DatasetDict | None = None 

690 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

691 """ 

692 

693 @contextmanager 

694 def connectDataIds( 

695 self, 

696 registry: Registry, 

697 collections: Any, 

698 userQuery: Optional[str], 

699 externalDataId: DataCoordinate, 

700 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

701 bind: Optional[Mapping[str, Any]] = None, 

702 ) -> Iterator[DataCoordinateQueryResults]: 

703 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

704 

705 This method populates `_TaskScaffolding.dataIds` and 

706 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

707 

708 Parameters 

709 ---------- 

710 registry : `lsst.daf.butler.Registry` 

711 Registry for the data repository; used for all data ID queries. 

712 collections 

713 Expressions representing the collections to search for input 

714 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

715 userQuery : `str` or `None` 

716 User-provided expression to limit the data IDs processed. 

717 externalDataId : `DataCoordinate` 

718 Externally-provided data ID that should be used to restrict the 

719 results, just as if these constraints had been included via ``AND`` 

720 in ``userQuery``. This includes (at least) any instrument named 

721 in the pipeline definition. 

722 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

723 The query constraint variant that should be used to constrain the 

724 query based on dataset existence, defaults to 

725 `DatasetQueryConstraintVariant.ALL`. 

726 bind : `Mapping`, optional 

727 Mapping containing literal values that should be injected into the 

728 ``userQuery`` expression, keyed by the identifiers they replace. 

729 

730 Returns 

731 ------- 

732 commonDataIds : \ 

733 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

734 An interface to a database temporary table containing all data IDs 

735 that will appear in this `QuantumGraph`. Returned inside a 

736 context manager, which will drop the temporary table at the end of 

737 the `with` block in which this method is called. 

738 """ 

739 _LOG.debug("Building query for data IDs.") 

740 # Initialization datasets always have empty data IDs. 

741 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

742 for datasetType, refs in itertools.chain( 

743 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

744 ): 

745 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

746 # Run one big query for the data IDs for task dimensions and regular 

747 # inputs and outputs. We limit the query to only dimensions that are 

748 # associated with the input dataset types, but don't (yet) try to 

749 # obtain the dataset_ids for those inputs. 

750 _LOG.debug( 

751 "Submitting data ID query over dimensions %s and materializing results.", 

752 list(self.dimensions.names), 

753 ) 

754 queryArgs: Dict[str, Any] = { 

755 "dimensions": self.dimensions, 

756 "where": userQuery, 

757 "dataId": externalDataId, 

758 "bind": bind, 

759 } 

760 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

761 _LOG.debug( 

762 "Constraining graph query using default of %s.", 

763 list(self.defaultDatasetQueryConstraints.names), 

764 ) 

765 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

766 queryArgs["collections"] = collections 

767 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

768 _LOG.debug("Not using dataset existence to constrain query.") 

769 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

770 constraint = set(datasetQueryConstraint) 

771 inputs = {k.name: k for k in self.inputs.keys()} 

772 if remainder := constraint.difference(inputs.keys()): 

773 raise ValueError( 

774 f"{remainder} dataset type(s) specified as a graph constraint, but" 

775 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

776 ) 

777 _LOG.debug(f"Constraining graph query using {constraint}") 

778 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

779 queryArgs["collections"] = collections 

780 else: 

781 raise ValueError( 

782 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

783 ) 

784 

785 if "datasets" in queryArgs: 

786 for i, dataset_type in enumerate(queryArgs["datasets"]): 

787 if dataset_type.isComponent(): 

788 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

789 

790 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

791 _LOG.debug("Expanding data IDs.") 

792 commonDataIds = commonDataIds.expanded() 

793 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

794 # Iterate over query results, populating data IDs for datasets and 

795 # quanta and then connecting them to each other. 

796 n = -1 

797 for n, commonDataId in enumerate(commonDataIds): 

798 # Create DatasetRefs for all DatasetTypes from this result row, 

799 # noting that we might have created some already. 

800 # We remember both those that already existed and those that we 

801 # create now. 

802 refsForRow = {} 

803 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

804 for datasetType, refs in itertools.chain( 

805 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

806 ): 

807 datasetDataId: Optional[DataCoordinate] 

808 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

809 datasetDataId = commonDataId.subset(datasetType.dimensions) 

810 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

811 ref = refs.get(datasetDataId) 

812 if ref is None: 

813 ref = DatasetRef(datasetType, datasetDataId) 

814 refs[datasetDataId] = ref 

815 refsForRow[datasetType.name] = ref 

816 # Create _QuantumScaffolding objects for all tasks from this 

817 # result row, noting that we might have created some already. 

818 for task in self.tasks: 

819 quantumDataId = commonDataId.subset(task.dimensions) 

820 quantum = task.quanta.get(quantumDataId) 

821 if quantum is None: 

822 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

823 task.quanta[quantumDataId] = quantum 

824 # Whether this is a new quantum or an existing one, we can 

825 # now associate the DatasetRefs for this row with it. The 

826 # fact that a Quantum data ID and a dataset data ID both 

827 # came from the same result row is what tells us they 

828 # should be associated. 

829 # Many of these associations will be duplicates (because 

830 # another query row that differed from this one only in 

831 # irrelevant dimensions already added them), and we use 

832 # sets to skip them. 

833 for datasetType in task.inputs: 

834 ref = refsForRow[datasetType.name] 

835 quantum.inputs[datasetType.name][ref.dataId] = ref 

836 for datasetType in task.outputs: 

837 ref = refsForRow[datasetType.name] 

838 quantum.outputs[datasetType.name][ref.dataId] = ref 

839 if n < 0: 

840 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

841 emptiness_explained = False 

842 for message in commonDataIds.explain_no_results(): 

843 _LOG.critical(message) 

844 emptiness_explained = True 

845 if not emptiness_explained: 

846 _LOG.critical( 

847 "To reproduce this query for debugging purposes, run " 

848 "Registry.queryDataIds with these arguments:" 

849 ) 

850 # We could just repr() the queryArgs dict to get something 

851 # the user could make sense of, but it's friendlier to 

852 # put these args in an easier-to-construct equivalent form 

853 # so they can read it more easily and copy and paste into 

854 # a Python terminal. 

855 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

856 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

857 if queryArgs["where"]: 

858 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

859 if "datasets" in queryArgs: 

860 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

861 if "collections" in queryArgs: 

862 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

863 _LOG.debug("Finished processing %d rows from data ID query.", n) 

864 yield commonDataIds 

865 
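# Illustrative sketch (not part of the original source): the query that
# connectDataIds assembles above is equivalent to a direct Registry.queryDataIds
# call like the one below; the ``where`` string, ``collections``, and the
# in-scope ``scaffolding``/``externalDataId`` names are hypothetical stand-ins.
#
#     with registry.queryDataIds(
#         dimensions=scaffolding.dimensions,
#         dataId=externalDataId,
#         where="instrument = 'HSC' AND visit = 12345",
#         datasets=list(scaffolding.defaultDatasetQueryConstraints),
#         collections=collections,
#     ).materialize() as common_data_ids:
#         common_data_ids = common_data_ids.expanded()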

866 def resolveDatasetRefs( 

867 self, 

868 registry: Registry, 

869 collections: Any, 

870 run: Optional[str], 

871 commonDataIds: DataCoordinateQueryResults, 

872 *, 

873 skipExistingIn: Any = None, 

874 clobberOutputs: bool = True, 

875 constrainedByAllDatasets: bool = True, 

876 resolveRefs: bool = False, 

877 ) -> None: 

878 """Perform follow up queries for each dataset data ID produced in 

879 `fillDataIds`. 

880 

881 This method populates `_DatasetScaffolding.refs` (except for those in 

882 `prerequisites`). 

883 

884 Parameters 

885 ---------- 

886 registry : `lsst.daf.butler.Registry` 

887 Registry for the data repository; used for all data ID queries. 

888 collections 

889 Expressions representing the collections to search for input 

890 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

891 run : `str`, optional 

892 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

893 output datasets, if it already exists. 

894 commonDataIds : \ 

895 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

896 Result of a previous call to `connectDataIds`. 

897 skipExistingIn 

898 Expressions representing the collections to search for existing 

899 output datasets that should be skipped. See 

900 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

901 `None` or empty string/sequence disables skipping. 

902 clobberOutputs : `bool`, optional 

903 If `True` (default), allow quanta to be created even if outputs exist; 

904 this requires the same behavior to be enabled when 

905 executing. If ``skipExistingIn`` is not `None`, completed quanta 

906 (those with metadata, or all outputs if there is no metadata 

907 dataset configured) will be skipped rather than clobbered. 

908 constrainedByAllDatasets : `bool`, optional 

909 Indicates if the commonDataIds were generated with a constraint on 

910 all dataset types. 

911 resolveRefs : `bool`, optional 

912 If `True` then resolve all input references and generate random 

913 dataset IDs for all output and intermediate datasets. A true value 

914 requires the ``run`` collection to be specified. 

915 

916 Raises 

917 ------ 

918 OutputExistsError 

919 Raised if an output dataset already exists in the output run 

920 and ``skipExistingIn`` does not include output run, or if only 

921 some outputs are present and ``clobberOutputs`` is `False`. 

922 """ 

923 # Run may be provided but it does not have to exist; in that case we 

924 # use it for resolving references but don't check it for existing refs. 

925 run_exists = False 

926 if run: 

927 try: 

928 run_exists = bool(registry.queryCollections(run)) 

929 except MissingCollectionError: 

930 # An undocumented exception is raised if it does not exist. 

931 pass 

932 

933 skip_collections_wildcard: CollectionWildcard | None = None 

934 skipExistingInRun = False 

935 if skipExistingIn: 

936 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

937 if run_exists: 

938 # As an optimization, check the explicit list of names first. 

939 skipExistingInRun = run in skip_collections_wildcard.strings 

940 if not skipExistingInRun: 

941 # need to flatten it and check again 

942 skipExistingInRun = run in registry.queryCollections( 

943 skipExistingIn, 

944 collectionTypes=CollectionType.RUN, 

945 ) 

946 

947 idMaker: Optional[_DatasetIdMaker] = None 

948 if resolveRefs: 

949 assert run is not None, "run cannot be None when resolveRefs is True" 

950 idMaker = _DatasetIdMaker(registry, run) 

951 

952 resolvedRefQueryResults: Iterable[DatasetRef] 

953 

954 # Updating constrainedByAllDatasets here is not ideal, but we have a 

955 # few different code paths that each transfer different pieces of 

956 # information about what dataset query constraints were applied here, 

957 # and none of them has the complete picture until we get here. We're 

958 # long overdue for a QG generation rewrite that will make this go away 

959 # entirely anyway. 

960 constrainedByAllDatasets = ( 

961 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

962 ) 

963 

964 # Look up [init] intermediate and output datasets in the output 

965 # collection, if there is an output collection. 

966 if run_exists or skip_collections_wildcard is not None: 

967 for datasetType, refs in itertools.chain( 

968 self.initIntermediates.items(), 

969 self.initOutputs.items(), 

970 self.intermediates.items(), 

971 self.outputs.items(), 

972 ): 

973 _LOG.debug( 

974 "Resolving %d datasets for intermediate and/or output dataset %s.", 

975 len(refs), 

976 datasetType.name, 

977 ) 

978 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

979 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

980 # TODO: this assert incorrectly bans component inputs; 

981 # investigate on DM-33027. 

982 # assert not datasetType.isComponent(), \ 

983 # "Output datasets cannot be components." 

984 # 

985 # Instead we have to handle them manually to avoid a 

986 # deprecation warning, but it is at least confusing and 

987 # possibly a bug for components to appear here at all. 

988 if datasetType.isComponent(): 

989 parent_dataset_type = datasetType.makeCompositeDatasetType() 

990 component = datasetType.component() 

991 else: 

992 parent_dataset_type = datasetType 

993 component = None 

994 

995 # look at RUN collection first 

996 if run_exists: 

997 try: 

998 resolvedRefQueryResults = subset.findDatasets( 

999 parent_dataset_type, collections=run, findFirst=True 

1000 ) 

1001 except MissingDatasetTypeError: 

1002 resolvedRefQueryResults = [] 

1003 for resolvedRef in resolvedRefQueryResults: 

1004 # TODO: we could easily support per-DatasetType 

1005 # skipExisting and I could imagine that being useful - 

1006 # it's probably required in order to support writing 

1007 # initOutputs before QuantumGraph generation. 

1008 assert resolvedRef.dataId in refs 

1009 if not (skipExistingInRun or isInit or clobberOutputs): 

1010 raise OutputExistsError( 

1011 f"Output dataset {datasetType.name} already exists in " 

1012 f"output RUN collection '{run}' with data ID" 

1013 f" {resolvedRef.dataId}." 

1014 ) 

1015 # If we are going to resolve all outputs then we have 

1016 # to remember existing ones to avoid generating new 

1017 # dataset IDs for them. 

1018 if resolveRefs: 

1019 refs[resolvedRef.dataId] = ( 

1020 resolvedRef.makeComponentRef(component) 

1021 if component is not None 

1022 else resolvedRef 

1023 ) 

1024 

1025 # Also check skipExistingIn; the case where the RUN collection 

1026 # is in it is handled above. 

1027 if skip_collections_wildcard is not None: 

1028 try: 

1029 resolvedRefQueryResults = subset.findDatasets( 

1030 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True 

1031 ) 

1032 except MissingDatasetTypeError: 

1033 resolvedRefQueryResults = [] 

1034 for resolvedRef in resolvedRefQueryResults: 

1035 assert resolvedRef.dataId in refs 

1036 refs[resolvedRef.dataId] = ( 

1037 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1038 ) 

1039 

1040 # Look up input and initInput datasets in the input collection(s). 

1041 # Container to accumulate unfound refs, if the common data IDs were not 

1042 # constrained on dataset type existence. 

1043 self.unfoundRefs = set() 

1044 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1045 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

1046 if datasetType.isComponent(): 

1047 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1048 component = datasetType.component() 

1049 else: 

1050 parent_dataset_type = datasetType 

1051 component = None 

1052 try: 

1053 resolvedRefQueryResults = commonDataIds.subset( 

1054 datasetType.dimensions, unique=True 

1055 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1056 except MissingDatasetTypeError: 

1057 resolvedRefQueryResults = [] 

1058 dataIdsNotFoundYet = set(refs.keys()) 

1059 for resolvedRef in resolvedRefQueryResults: 

1060 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1061 refs[resolvedRef.dataId] = ( 

1062 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1063 ) 

1064 if dataIdsNotFoundYet: 

1065 if constrainedByAllDatasets: 

1066 raise RuntimeError( 

1067 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1068 f"'{datasetType.name}' was/were present in a previous " 

1069 "query, but could not be found now. " 

1070 "This is either a logic bug in QuantumGraph generation " 

1071 "or the input collections have been modified since " 

1072 "QuantumGraph generation began." 

1073 ) 

1074 elif not datasetType.dimensions: 

1075 raise RuntimeError( 

1076 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1077 f"collections {collections}." 

1078 ) 

1079 else: 

1080 # If the common data IDs were not constrained using all the 

1081 # input dataset types, it is possible that some data IDs 

1082 # found don't correspond to existing datasets and they 

1083 # will be left unresolved. Mark these for later pruning from 

1084 # the quantum graph. 

1085 for k in dataIdsNotFoundYet: 

1086 self.unfoundRefs.add(refs[k]) 

1087 

1088 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1089 # replacing the unresolved refs there, and then look up prerequisites. 

1090 for task in self.tasks: 

1091 _LOG.debug( 

1092 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1093 len(task.quanta), 

1094 task.taskDef.label, 

1095 ) 

1096 # The way iterConnections is designed makes it impossible to 

1097 # annotate precisely enough to satisfy MyPy here. 

1098 lookupFunctions = { 

1099 c.name: c.lookupFunction # type: ignore 

1100 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1101 if c.lookupFunction is not None # type: ignore 

1102 } 

1103 dataIdsFailed = [] 

1104 dataIdsSucceeded = [] 

1105 for quantum in task.quanta.values(): 

1106 # Process output datasets only if skipExistingIn is not None 

1107 # or there is a run to look for outputs in and clobberOutputs 

1108 # is True. Note that if skipExistingIn is None, any output 

1109 # datasets that already exist would have already caused an 

1110 # exception to be raised. We never update the DatasetRefs in 

1111 # the quantum because those should never be resolved. 

1112 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1113 resolvedRefs = [] 

1114 unresolvedRefs = [] 

1115 haveMetadata = False 

1116 for datasetType, originalRefs in quantum.outputs.items(): 

1117 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1118 if ref.id is not None: 

1119 resolvedRefs.append(ref) 

1120 if datasetType.name == task.taskDef.metadataDatasetName: 

1121 haveMetadata = True 

1122 else: 

1123 unresolvedRefs.append(ref) 

1124 if resolvedRefs: 

1125 if haveMetadata or not unresolvedRefs: 

1126 dataIdsSucceeded.append(quantum.dataId) 

1127 if skip_collections_wildcard is not None: 

1128 continue 

1129 else: 

1130 dataIdsFailed.append(quantum.dataId) 

1131 if not clobberOutputs: 

1132 raise OutputExistsError( 

1133 f"Quantum {quantum.dataId} of task with label " 

1134 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1135 f"({resolvedRefs}) " 

1136 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

1137 "and clobbering outputs was not enabled." 

1138 ) 

1139 # Update the input DatasetRefs to the resolved ones we already 

1140 # searched for. 

1141 for datasetType, input_refs in quantum.inputs.items(): 

1142 for ref in task.inputs.extract(datasetType, input_refs.keys()): 

1143 input_refs[ref.dataId] = ref 

1144 # Look up prerequisite datasets in the input collection(s). 

1145 # These may have dimensions that extend beyond those we queried 

1146 # for originally, because we want to permit those data ID 

1147 # values to differ across quanta and dataset types. 

1148 for datasetType in task.prerequisites: 

1149 if datasetType.isComponent(): 

1150 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1151 component = datasetType.component() 

1152 else: 

1153 parent_dataset_type = datasetType 

1154 component = None 

1155 lookupFunction = lookupFunctions.get(datasetType.name) 

1156 if lookupFunction is not None: 

1157 # PipelineTask has provided its own function to do the 

1158 # lookup. This always takes precedence. 

1159 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1160 elif ( 

1161 datasetType.isCalibration() 

1162 and datasetType.dimensions <= quantum.dataId.graph 

1163 and quantum.dataId.graph.temporal 

1164 ): 

1165 # This is a master calibration lookup, which we have to 

1166 # handle specially because the query system can't do a 

1167 # temporal join on a non-dimension-based timespan yet. 

1168 timespan = quantum.dataId.timespan 

1169 try: 

1170 prereq_ref = registry.findDataset( 

1171 parent_dataset_type, 

1172 quantum.dataId, 

1173 collections=collections, 

1174 timespan=timespan, 

1175 ) 

1176 if prereq_ref is not None: 

1177 if component is not None: 

1178 prereq_ref = prereq_ref.makeComponentRef(component) 

1179 prereq_refs = [prereq_ref] 

1180 else: 

1181 prereq_refs = [] 

1182 except (KeyError, MissingDatasetTypeError): 

1183 # This dataset type is not present in the registry, 

1184 # which just means there are no datasets here. 

1185 prereq_refs = [] 

1186 else: 

1187 # Most general case. 

1188 prereq_refs = [ 

1189 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1190 for prereq_ref in registry.queryDatasets( 

1191 parent_dataset_type, 

1192 collections=collections, 

1193 dataId=quantum.dataId, 

1194 findFirst=True, 

1195 ).expanded() 

1196 ] 

1197 prereq_refs_map = {ref.dataId: ref for ref in prereq_refs if ref is not None} 

1198 quantum.prerequisites[datasetType].update(prereq_refs_map) 

1199 task.prerequisites[datasetType].update(prereq_refs_map) 

1200 

1201 # Resolve all quantum inputs and outputs. 

1202 if idMaker: 

1203 for datasetDict in (quantum.inputs, quantum.outputs): 

1204 for refDict in datasetDict.values(): 

1205 refDict.update(idMaker.resolveDict(refDict)) 

1206 

1207 # Resolve task initInputs and initOutputs. 

1208 if idMaker: 

1209 for datasetDict in (task.initInputs, task.initOutputs): 

1210 for refDict in datasetDict.values(): 

1211 refDict.update(idMaker.resolveDict(refDict)) 

1212 

1213 # Actually remove any quanta that we decided to skip above. 

1214 if dataIdsSucceeded: 

1215 if skip_collections_wildcard is not None: 

1216 _LOG.debug( 

1217 "Pruning successful %d quanta for task with label '%s' because all of their " 

1218 "outputs exist or metadata was written successfully.", 

1219 len(dataIdsSucceeded), 

1220 task.taskDef.label, 

1221 ) 

1222 for dataId in dataIdsSucceeded: 

1223 del task.quanta[dataId] 

1224 elif clobberOutputs: 

1225 _LOG.info( 

1226 "Found %d successful quanta for task with label '%s' " 

1227 "that will need to be clobbered during execution.", 

1228 len(dataIdsSucceeded), 

1229 task.taskDef.label, 

1230 ) 

1231 else: 

1232 raise AssertionError("OutputExistsError should have already been raised.") 

1233 if dataIdsFailed: 

1234 if clobberOutputs: 

1235 _LOG.info( 

1236 "Found %d failed/incomplete quanta for task with label '%s' " 

1237 "that will need to be clobbered during execution.", 

1238 len(dataIdsFailed), 

1239 task.taskDef.label, 

1240 ) 

1241 else: 

1242 raise AssertionError("OutputExistsError should have already been raised.") 

1243 

1244 # Collect initOutputs that do not belong to any task. 

1245 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1246 for task in self.tasks: 

1247 global_dataset_types -= set(task.initOutputs) 

1248 if global_dataset_types: 

1249 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1250 if idMaker is not None: 

1251 for refDict in self.globalInitOutputs.values(): 

1252 refDict.update(idMaker.resolveDict(refDict)) 

1253 

1254 def makeQuantumGraph( 

1255 self, 

1256 registry: Registry, 

1257 metadata: Optional[Mapping[str, Any]] = None, 

1258 datastore: Optional[Datastore] = None, 

1259 ) -> QuantumGraph: 

1260 """Create a `QuantumGraph` from the quanta already present in 

1261 the scaffolding data structure. 

1262 

1263 Parameters 

1264 ---------- 

1265 registry : `lsst.daf.butler.Registry` 

1266 Registry for the data repository; used for all data ID queries. 

1267 metadata : Optional Mapping of `str` to primitives 

1268 This is an optional parameter of extra data to carry with the 

1269 graph. Entries in this mapping should be able to be serialized in 

1270 JSON. 

1271 datastore : `Datastore`, optional 

1272 If not `None` then fill datastore records in each generated 

1273 Quantum. 

1274 

1275 Returns 

1276 ------- 

1277 graph : `QuantumGraph` 

1278 The full `QuantumGraph`. 

1279 """ 

1280 

1281 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1282 """Extract all DatasetRefs from the dictionaries""" 

1283 for ref_dict in dataset_dict.values(): 

1284 yield from ref_dict.values() 

1285 

1286 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1287 if datastore is not None: 

1288 datastore_records = datastore.export_records( 

1289 itertools.chain( 

1290 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites) 

1291 ) 

1292 ) 

1293 

1294 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

1295 for task in self.tasks: 

1296 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records) 

1297 graphInput[task.taskDef] = qset 

1298 

1299 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks} 

1300 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks} 

1301 

1302 globalInitOutputs: list[DatasetRef] = [] 

1303 if self.globalInitOutputs is not None: 

1304 for refs_dict in self.globalInitOutputs.values(): 

1305 globalInitOutputs.extend(refs_dict.values()) 

1306 

1307 graph = QuantumGraph( 

1308 graphInput, 

1309 metadata=metadata, 

1310 pruneRefs=self.unfoundRefs, 

1311 universe=self.dimensions.universe, 

1312 initInputs=taskInitInputs, 

1313 initOutputs=taskInitOutputs, 

1314 globalInitOutputs=globalInitOutputs, 

1315 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1316 ) 

1317 return graph 

1318 

1319 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1320 """Make a list of all dataset types used by a graph as defined in 

1321 registry. 

1322 """ 

1323 chain = [ 

1324 self.initInputs, 

1325 self.initIntermediates, 

1326 self.initOutputs, 

1327 self.inputs, 

1328 self.intermediates, 

1329 self.outputs, 

1330 self.prerequisites, 

1331 ] 

1332 if self.globalInitOutputs is not None: 

1333 chain.append(self.globalInitOutputs) 

1334 

1335 # Collect names of all dataset types. 

1336 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain)) 

1337 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1338 

1339 # Check for types that do not exist in registry yet: 

1340 # - inputs must exist 

1341 # - intermediates and outputs may not exist, but there must not be 

1342 # more than one definition (e.g. differing in storage class) 

1343 # - prerequisites may not exist, treat it the same as outputs here 

1344 for dstype in itertools.chain(self.initInputs, self.inputs): 

1345 if dstype.name not in dataset_types: 

1346 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1347 

1348 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1349 chain = [ 

1350 self.initIntermediates, 

1351 self.initOutputs, 

1352 self.intermediates, 

1353 self.outputs, 

1354 self.prerequisites, 

1355 ] 

1356 if self.globalInitOutputs is not None: 

1357 chain.append(self.globalInitOutputs) 

1358 for dstype in itertools.chain(*chain): 

1359 if dstype.name not in dataset_types: 

1360 new_outputs[dstype.name].add(dstype) 

1361 for name, dstypes in new_outputs.items(): 

1362 if len(dstypes) > 1: 

1363 raise ValueError( 

1364 "Pipeline contains multiple definitions for a dataset type " 

1365 f"which is not defined in registry yet: {dstypes}" 

1366 ) 

1367 elif len(dstypes) == 1: 

1368 dataset_types[name] = dstypes.pop() 

1369 

1370 return dataset_types.values() 

1371 

1372 

1373# ------------------------ 

1374# Exported definitions -- 

1375# ------------------------ 

1376 

1377 

1378class GraphBuilderError(Exception): 

1379 """Base class for exceptions generated by graph builder.""" 

1380 

1381 pass 

1382 

1383 

1384class OutputExistsError(GraphBuilderError): 

1385 """Exception generated when output datasets already exist.""" 

1386 

1387 pass 

1388 

1389 

1390class PrerequisiteMissingError(GraphBuilderError): 

1391 """Exception generated when a prerequisite dataset does not exist.""" 

1392 

1393 pass 

1394 

1395 

1396class GraphBuilder: 

1397 """GraphBuilder class is responsible for building task execution graph from 

1398 a Pipeline. 

1399 

1400 Parameters 

1401 ---------- 

1402 registry : `~lsst.daf.butler.Registry` 

1403 Data butler registry instance. 

1404 skipExistingIn 

1405 Expressions representing the collections to search for existing 

1406 output datasets that should be skipped. See 

1407 :ref:`daf_butler_ordered_collection_searches`. 

1408 clobberOutputs : `bool`, optional 

1409 If `True` (default), allow quanta to be created even if partial outputs 

1410 exist; this requires the same behavior to be enabled when 

1411 executing. 

1412 datastore : `Datastore`, optional 

1413 If not `None` then fill datastore records in each generated Quantum. 

1414 """ 

1415 

1416 def __init__( 

1417 self, 

1418 registry: Registry, 

1419 skipExistingIn: Any = None, 

1420 clobberOutputs: bool = True, 

1421 datastore: Optional[Datastore] = None, 

1422 ): 

1423 self.registry = registry 

1424 self.dimensions = registry.dimensions 

1425 self.skipExistingIn = skipExistingIn 

1426 self.clobberOutputs = clobberOutputs 

1427 self.datastore = datastore 

1428 

1429 def makeGraph( 

1430 self, 

1431 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1432 collections: Any, 

1433 run: Optional[str], 

1434 userQuery: Optional[str], 

1435 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1436 metadata: Optional[Mapping[str, Any]] = None, 

1437 resolveRefs: bool = False, 

1438 bind: Optional[Mapping[str, Any]] = None, 

1439 ) -> QuantumGraph: 

1440 """Create execution graph for a pipeline. 

1441 

1442 Parameters 

1443 ---------- 

1444 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1445 Pipeline definition, task names/classes and their configs. 

1446 collections 

1447 Expressions representing the collections to search for input 

1448 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1449 run : `str`, optional 

1450 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1451 output datasets. The collection does not have to exist; it will be 

1452 created when the graph is executed. 

1453 userQuery : `str` 

1454 String which defines user-defined selection for registry; should be 

1455 empty or `None` if there are no restrictions on data selection. 

1456 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1457 The query constraint variant that should be used to constrain the 

1458 query based on dataset existence, defaults to 

1459 `DatasetQueryConstraintVariant.ALL`. 

1460 metadata : Optional Mapping of `str` to primitives 

1461 This is an optional parameter of extra data to carry with the 

1462 graph. Entries in this mapping should be able to be serialized in 

1463 JSON. 

1464 resolveRefs : `bool`, optional 

1465 If `True` then resolve all input references and generate random 

1466 dataset IDs for all output and intermediate datasets. A true value 

1467 requires the ``run`` collection to be specified. 

1468 bind : `Mapping`, optional 

1469 Mapping containing literal values that should be injected into the 

1470 ``userQuery`` expression, keyed by the identifiers they replace. 

1471 

1472 Returns 

1473 ------- 

1474 graph : `QuantumGraph` 

1475 

1476 Raises 

1477 ------ 

1478 UserExpressionError 

1479 Raised when user expression cannot be parsed. 

1480 OutputExistsError 

1481 Raised when output datasets already exist. 

1482 Exception 

1483 Other exceptions types may be raised by underlying registry 

1484 classes. 

1485 """ 

1486 if resolveRefs and run is None: 

1487 raise ValueError("`resolveRefs` requires `run` parameter.") 

1488 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1489 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1490 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1491 instrument_class: Optional[Any] = None 

1492 if isinstance(pipeline, Pipeline): 

1493 instrument_class_name = pipeline.getInstrument() 

1494 if instrument_class_name is not None: 

1495 instrument_class = doImportType(instrument_class_name) 

1496 pipeline = list(pipeline.toExpandedPipeline()) 

1497 if instrument_class is not None: 

1498 dataId = DataCoordinate.standardize( 

1499 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1500 ) 

1501 else: 

1502 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1503 with scaffolding.connectDataIds( 

1504 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1505 ) as commonDataIds: 

1506 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1507 scaffolding.resolveDatasetRefs( 

1508 self.registry, 

1509 collections, 

1510 run, 

1511 commonDataIds, 

1512 skipExistingIn=self.skipExistingIn, 

1513 clobberOutputs=self.clobberOutputs, 

1514 constrainedByAllDatasets=condition, 

1515 resolveRefs=resolveRefs, 

1516 ) 

1517 return scaffolding.makeQuantumGraph( 

1518 registry=self.registry, metadata=metadata, datastore=self.datastore 

1519 )
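
# Illustrative end-to-end sketch (not part of the original source): building a
# QuantumGraph with the public GraphBuilder API defined above. The repository
# path, pipeline file, collection names, run name, and query below are
# hypothetical.
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import Pipeline
#
#     butler = Butler("/repo/example")
#     pipeline = Pipeline.fromFile("example-pipeline.yaml")
#     builder = GraphBuilder(butler.registry, datastore=butler.datastore)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/someone/example-run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )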