Coverage for python/lsst/pipe/base/graphBuilder.py: 14%

532 statements  

coverage.py v7.2.3, created at 2023-04-22 02:19 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33import warnings 

34from collections import ChainMap, defaultdict 

35from contextlib import contextmanager 

36from dataclasses import dataclass 

37from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union 

38 

39from lsst.daf.butler import ( 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetIdGenEnum, 

43 DatasetRef, 

44 DatasetType, 

45 Datastore, 

46 DatastoreRecordData, 

47 DimensionGraph, 

48 DimensionUniverse, 

49 NamedKeyDict, 

50 NamedValueSet, 

51 Quantum, 

52 Registry, 

53 UnresolvedRefWarning, 

54) 

55from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

56from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

57from lsst.daf.butler.registry.wildcards import CollectionWildcard 

58from lsst.utils import doImportType 

59 

60from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

61from ._status import NoWorkFound 

62 

63# ----------------------------- 

64# Imports for other modules -- 

65# ----------------------------- 

66from .connections import AdjustQuantumHelper, iterConnections 

67from .graph import QuantumGraph 

68from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

69 

70# ---------------------------------- 

71# Local non-exported definitions -- 

72# ---------------------------------- 

73 

74_LOG = logging.getLogger(__name__) 

75 

76 

77class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

78 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

79 the known `DatasetRef` instances of that type. 

80 

81 Parameters 

82 ---------- 

83 args 

84 Positional arguments are forwarded to the `dict` constructor. 

85 universe : `DimensionUniverse` 

86 Universe of all possible dimensions. 

87 """ 

88 

89 def __init__(self, *args: Any, universe: DimensionUniverse): 

90 super().__init__(*args) 

91 self.universe = universe 

92 

93 @classmethod 

94 def fromDatasetTypes( 

95 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

96 ) -> _DatasetDict: 

97 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

98 

99 Parameters 

100 ---------- 

101 datasetTypes : `iterable` of `DatasetType` 

102 DatasetTypes to use as keys for the dict. Values will be empty 

103 dictionaries. 

104 universe : `DimensionUniverse` 

105 Universe of all possible dimensions. 

106 

107 Returns 

108 ------- 

109 dictionary : `_DatasetDict` 

110 A new `_DatasetDict` instance. 

111 """ 

112 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

113 

114 @classmethod 

115 def fromSubset( 

116 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

117 ) -> _DatasetDict: 

118 """Return a new dictionary by extracting items corresponding to the 

119 given keys from one or more existing dictionaries. 

120 

121 Parameters 

122 ---------- 

123 datasetTypes : `iterable` of `DatasetType` 

124 DatasetTypes to use as keys for the dict. Values will be obtained 

125 by lookups against ``first`` and ``rest``. 

126 first : `_DatasetDict` 

127 Another dictionary from which to extract values. 

128 rest 

129 Additional dictionaries from which to extract values. 

130 

131 Returns 

132 ------- 

133 dictionary : `_DatasetDict` 

134 A new dictionary instance. 

135 """ 

136 combined = ChainMap(first, *rest) 

137 

138 # Dataset types known to match immediately can be processed 

139 # without checks. 

140 matches = combined.keys() & set(datasetTypes) 

141 _dict = {k: combined[k] for k in matches} 

142 

143 if len(_dict) < len(datasetTypes): 

144 # Work out which ones are missing. 

145 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

146 

147 # Get the known names for comparison. 

148 combined_by_name = {k.name: k for k in combined} 

149 

150 missing = set() 

151 incompatible = {} 

152 for datasetType in missing_datasetTypes: 

153 # The dataset type is not found. It may not be listed 

154 # or it may be that it is there with the same name 

155 # but different definition. 

156 if datasetType.name in combined_by_name: 

157 # This implies some inconsistency in definitions 

158 # for connections. If there is support for storage 

159 # class conversion we can let it slide. 

160 # At this point we do not know 

161 # where the inconsistency is but trust that down 

162 # stream code will be more explicit about input 

163 # vs output incompatibilities. 

164 existing = combined_by_name[datasetType.name] 

165 convertible_to_existing = existing.is_compatible_with(datasetType) 

166 convertible_from_existing = datasetType.is_compatible_with(existing) 

167 if convertible_to_existing and convertible_from_existing: 

168 _LOG.debug( 

169 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

170 datasetType.name, 

171 datasetType.storageClass_name, 

172 existing.storageClass_name, 

173 ) 

174 _dict[datasetType] = combined[existing] 

175 elif convertible_to_existing or convertible_from_existing: 

176 # We'd need to refactor a fair amount to recognize 

177 # whether this is an error or not, so I'm not going to 

178 # bother until we need to do that for other reasons 

179 # (it won't be too long). 

180 _LOG.info( 

181 "Dataset type %s is present with multiple only partially-compatible storage " 

182 "classes %s and %s.", 

183 datasetType.name, 

184 datasetType.storageClass_name, 

185 existing.storageClass_name, 

186 ) 

187 _dict[datasetType] = combined[existing] 

188 else: 

189 incompatible[datasetType] = existing 

190 else: 

191 missing.add(datasetType) 

192 

193 if missing or incompatible: 

194 reasons = [] 

195 if missing: 

196 reasons.append( 

197 "DatasetTypes {'.'.join(missing)} not present in list of known types: " 

198 + ", ".join(d.name for d in combined) 

199 ) 

200 if incompatible: 

201 for x, y in incompatible.items(): 

202 reasons.append(f"{x} incompatible with {y}") 

203 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

204 

205 return cls(_dict, universe=first.universe) 

206 

207 @property 

208 def dimensions(self) -> DimensionGraph: 

209 """The union of all dimensions used by all dataset types in this 

210 dictionary, including implied dependencies (`DimensionGraph`). 

211 """ 

212 base = self.universe.empty 

213 if len(self) == 0: 

214 return base 

215 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

216 

217 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

218 """Unpack nested single-element `DatasetRef` dicts into a new 

219 mapping with `DatasetType` keys and `DatasetRef` values. 

220 

221 This method assumes that each nested dictionary contains exactly one 

222 item, as is the case for all "init" datasets. 

223 

224 Returns 

225 ------- 

226 dictionary : `NamedKeyDict` 

227 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

228 `DatasetType` instances and string names usable as keys. 

229 """ 

230 

231 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

232 (ref,) = refs.values() 

233 return ref 

234 

235 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

236 

237 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

238 """Unpack nested multi-element `DatasetRef` dicts into a new 

239 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

240 

241 Returns 

242 ------- 

243 dictionary : `NamedKeyDict` 

244 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

245 both `DatasetType` instances and string names usable as keys. 

246 """ 

247 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

248 

249 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

250 """Iterate over the contained `DatasetRef` instances that match the 

251 given `DatasetType` and data IDs. 

252 

253 Parameters 

254 ---------- 

255 datasetType : `DatasetType` 

256 Dataset type to match. 

257 dataIds : `Iterable` [ `DataCoordinate` ] 

258 Data IDs to match. 

259 

260 Returns 

261 ------- 

262 refs : `Iterator` [ `DatasetRef` ] 

263 DatasetRef instances for which ``ref.datasetType == datasetType`` 

264 and ``ref.dataId`` is in ``dataIds``. 

265 """ 

266 refs = self[datasetType] 

267 return (refs[dataId] for dataId in dataIds) 

268 
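# Illustrative sketch (comments only, not executed): the nested layout that
# `_DatasetDict` manages and how the unpack helpers flatten it. The names
# `calexp_type`, `data_id`, and `universe` are hypothetical; the real code
# also suppresses `UnresolvedRefWarning` when creating unresolved refs.
#
#     d = _DatasetDict.fromDatasetTypes([calexp_type], universe=universe)
#     d[calexp_type][data_id] = DatasetRef(calexp_type, data_id)
#     d.unpackMultiRefs()   # NamedKeyDict: {calexp_type: [DatasetRef, ...]}
#     d.unpackSingleRefs()  # only valid when each nested dict holds one ref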

269 

270class _QuantumScaffolding: 

271 """Helper class aggregating information about a `Quantum`, used when 

272 constructing a `QuantumGraph`. 

273 

274 See `_PipelineScaffolding` for a top-down description of the full 

275 scaffolding data structure. 

276 

277 Parameters 

278 ---------- 

279 task : _TaskScaffolding 

280 Back-reference to the helper object for the `PipelineTask` this quantum 

281 represents an execution of. 

282 dataId : `DataCoordinate` 

283 Data ID for this quantum. 

284 """ 

285 

286 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

287 self.task = task 

288 self.dataId = dataId 

289 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

290 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

291 self.prerequisites = _DatasetDict.fromDatasetTypes( 

292 task.prerequisites.keys(), universe=dataId.universe 

293 ) 

294 

295 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

296 

297 def __repr__(self) -> str: 

298 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

299 

300 task: _TaskScaffolding 

301 """Back-reference to the helper object for the `PipelineTask` this quantum 

302 represents an execution of. 

303 """ 

304 

305 dataId: DataCoordinate 

306 """Data ID for this quantum. 

307 """ 

308 

309 inputs: _DatasetDict 

310 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

311 

312 This is initialized to map each `DatasetType` to an empty dictionary at 

313 construction. Those nested dictionaries are populated (with data IDs as 

314 keys) with unresolved `DatasetRef` instances in 

315 `_PipelineScaffolding.connectDataIds`. 

316 """ 

317 

318 outputs: _DatasetDict 

319 """Nested dictionary containing `DatasetRef` outputs this quantum. 

320 """ 

321 

322 prerequisites: _DatasetDict 

323 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

324 quantum. 

325 """ 

326 

327 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

328 """Transform the scaffolding object into a true `Quantum` instance. 

329 

330 Parameters 

331 ---------- 

332 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

333 If not `None` then fill datastore records in each generated Quantum 

334 using the records from this structure. 

335 

336 Returns 

337 ------- 

338 quantum : `Quantum` 

339 An actual `Quantum` instance. 

340 """ 

341 allInputs = self.inputs.unpackMultiRefs() 

342 allInputs.update(self.prerequisites.unpackMultiRefs()) 

343 # Give the task's Connections class an opportunity to remove some 

344 # inputs, or complain if they are unacceptable. 

345 # This will raise if one of the check conditions is not met, which is 

346 # the intended behavior. 

347 # If it raises NoWorkFound, there is a bug in the QG algorithm 

348 # or the adjustQuantum is incorrectly trying to make a prerequisite 

349 # input behave like a regular input; adjustQuantum should only raise 

350 # NoWorkFound if a regular input is missing, and it shouldn't be 

351 # possible for us to have generated ``self`` if that's true. 

352 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

353 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

354 initInputs = self.task.initInputs.unpackSingleRefs() 

355 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

356 if datastore_records is not None: 

357 quantum_records = {} 

358 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

359 input_refs += list(initInputs.values()) 

360 input_ids = set(ref.id for ref in input_refs if ref.id is not None) 

361 for datastore_name, records in datastore_records.items(): 

362 matching_records = records.subset(input_ids) 

363 if matching_records is not None: 

364 quantum_records[datastore_name] = matching_records 

365 return Quantum( 

366 taskName=self.task.taskDef.taskName, 

367 taskClass=self.task.taskDef.taskClass, 

368 dataId=self.dataId, 

369 initInputs=initInputs, 

370 inputs=helper.inputs, 

371 outputs=helper.outputs, 

372 datastore_records=quantum_records, 

373 ) 

374 
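# A minimal usage sketch (hypothetical names `datastore`, `input_refs`, and
# `scaffolding_quantum`), mirroring how `makeQuantumGraph` below feeds the
# output of `Datastore.export_records` into `makeQuantum`:
#
#     records = datastore.export_records(input_refs)
#     quantum = scaffolding_quantum.makeQuantum(datastore_records=records)
#
# `makeQuantum` subsets those records to the resolved input IDs of the
# quantum, so passing the full per-datastore mapping is safe.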

375 

376@dataclass 

377class _TaskScaffolding: 

378 """Helper class aggregating information about a `PipelineTask`, used when 

379 constructing a `QuantumGraph`. 

380 

381 See `_PipelineScaffolding` for a top-down description of the full 

382 scaffolding data structure. 

383 

384 Parameters 

385 ---------- 

386 taskDef : `TaskDef` 

387 Data structure that identifies the task class and its config. 

388 parent : `_PipelineScaffolding` 

389 The parent data structure that will hold the instance being 

390 constructed. 

391 datasetTypes : `TaskDatasetTypes` 

392 Data structure that categorizes the dataset types used by this task. 

393 """ 

394 

395 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

396 universe = parent.dimensions.universe 

397 self.taskDef = taskDef 

398 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

399 assert self.dimensions.issubset(parent.dimensions) 

400 # Initialize _DatasetDicts as subsets of the one or two 

401 # corresponding dicts in the parent _PipelineScaffolding. 

402 self.initInputs = _DatasetDict.fromSubset( 

403 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

404 ) 

405 self.initOutputs = _DatasetDict.fromSubset( 

406 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

407 ) 

408 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

409 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

410 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

411 self.dataIds: Set[DataCoordinate] = set() 

412 self.quanta = {} 

413 

414 def __repr__(self) -> str: 

415 # Default dataclass-injected __repr__ gets caught in an infinite loop 

416 # because of back-references. 

417 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

418 

419 taskDef: TaskDef 

420 """Data structure that identifies the task class and its config 

421 (`TaskDef`). 

422 """ 

423 

424 dimensions: DimensionGraph 

425 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

426 """ 

427 

428 initInputs: _DatasetDict 

429 """Dictionary containing information about datasets used to construct this 

430 task (`_DatasetDict`). 

431 """ 

432 

433 initOutputs: _DatasetDict 

434 """Dictionary containing information about datasets produced as a 

435 side-effect of constructing this task (`_DatasetDict`). 

436 """ 

437 

438 inputs: _DatasetDict 

439 """Dictionary containing information about datasets used as regular, 

440 graph-constraining inputs to this task (`_DatasetDict`). 

441 """ 

442 

443 outputs: _DatasetDict 

444 """Dictionary containing information about datasets produced by this task 

445 (`_DatasetDict`). 

446 """ 

447 

448 prerequisites: _DatasetDict 

449 """Dictionary containing information about input datasets that must be 

450 present in the repository before any Pipeline containing this task is run 

451 (`_DatasetDict`). 

452 """ 

453 

454 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

455 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

456 this task with that data ID. 

457 """ 

458 

459 def makeQuantumSet( 

460 self, 

461 unresolvedRefs: Optional[Set[DatasetRef]] = None, 

462 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

463 ) -> Set[Quantum]: 

464 """Create a `set` of `Quantum` from the information in ``self``. 

465 

466 Parameters 

467 ---------- 

468 unresolvedRefs : `set` [ `DatasetRef` ], optional 

469 Input dataset refs that have not been found. 

470 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

471 If not `None` then fill datastore records in each generated 

472 `Quantum` using the records from this structure. 

473 Returns 

474 ------- 

475 nodes : `set` of `Quantum` 

476 The `Quantum` elements corresponding to this task. 

477 """ 

478 if unresolvedRefs is None: 

479 unresolvedRefs = set() 

480 outputs = set() 

481 for q in self.quanta.values(): 

482 try: 

483 tmpQuanta = q.makeQuantum(datastore_records) 

484 outputs.add(tmpQuanta) 

485 except (NoWorkFound, FileNotFoundError) as exc: 

486 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

487 if unresolvedRefs.intersection(refs): 

488 # This means it is a node that is known to be pruned 

489 # later and should be left in even though some follow-up 

490 # queries fail. This allows the pruning to start from this 

491 # quantum with known issues, and prune other nodes it 

492 # touches 

493 inputs = q.inputs.unpackMultiRefs() 

494 inputs.update(q.prerequisites.unpackMultiRefs()) 

495 tmpQuantum = Quantum( 

496 taskName=q.task.taskDef.taskName, 

497 taskClass=q.task.taskDef.taskClass, 

498 dataId=q.dataId, 

499 initInputs=q.task.initInputs.unpackSingleRefs(), 

500 inputs=inputs, 

501 outputs=q.outputs.unpackMultiRefs(), 

502 ) 

503 outputs.add(tmpQuantum) 

504 else: 

505 raise exc 

506 return outputs 

507 

508 

509class _DatasetIdMaker: 

510 """Helper class which generates random dataset UUIDs for unresolved 

511 datasets. 

512 """ 

513 

514 def __init__(self, registry: Registry, run: str): 

515 self.datasetIdFactory = registry.datasetIdFactory 

516 self.run = run 

517 # Dataset IDs generated so far 

518 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

519 

520 def resolveRef(self, ref: DatasetRef) -> DatasetRef: 

521 if ref.id is not None: 

522 return ref 

523 

524 # For components we need their parent dataset ID. 

525 if ref.isComponent(): 

526 with warnings.catch_warnings(): 

527 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

528 parent_ref = ref.makeCompositeRef() 

529 # Some basic check - parent should be resolved if this is an 

530 # existing input, or it should be in the cache already if it is 

531 # an intermediate. 

532 if parent_ref.id is None: 

533 key = parent_ref.datasetType, parent_ref.dataId 

534 if key not in self.resolved: 

535 raise ValueError(f"Composite dataset is missing from cache: {parent_ref}") 

536 parent_ref = self.resolved[key] 

537 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved" 

538 with warnings.catch_warnings(): 

539 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

540 return ref.resolved(parent_ref.id, parent_ref.run) 

541 

542 key = ref.datasetType, ref.dataId 

543 if (resolved := self.resolved.get(key)) is None: 

544 with warnings.catch_warnings(): 

545 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

546 resolved = self.datasetIdFactory.resolveRef(ref, self.run, DatasetIdGenEnum.UNIQUE) 

547 self.resolved[key] = resolved 

548 return resolved 

549 

550 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]: 

551 """Resolve all unresolved references in the provided dictionary.""" 

552 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()} 

553 

554 
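# A minimal sketch (hypothetical `registry`, `run`, and `refs_by_data_id`) of
# how `resolveDatasetRefs` below uses this helper: unresolved refs are given
# freshly generated UUIDs, with the cache guaranteeing that the same
# (dataset type, data ID) pair maps to the same ID within one builder run,
# while already-resolved refs pass through unchanged.
#
#     id_maker = _DatasetIdMaker(registry, run)
#     refs_by_data_id.update(id_maker.resolveDict(refs_by_data_id))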

555@dataclass 

556class _PipelineScaffolding: 

557 """A helper data structure that organizes the information involved in 

558 constructing a `QuantumGraph` for a `Pipeline`. 

559 

560 Parameters 

561 ---------- 

562 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

563 Sequence of tasks from which a graph is to be constructed. Must 

564 have nested task classes already imported. 

565 universe : `DimensionUniverse` 

566 Universe of all possible dimensions. 

567 

568 Notes 

569 ----- 

570 The scaffolding data structure contains nested data structures for both 

571 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

572 data structures are shared between the pipeline-level structure (which 

573 aggregates all datasets and categorizes them from the perspective of the 

574 complete pipeline) and the individual tasks that use them as inputs and 

575 outputs. 

576 

577 `QuantumGraph` construction proceeds in four steps, with each corresponding 

578 to a different `_PipelineScaffolding` method: 

579 

580 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

581 the DatasetTypes used by the pipeline (delegating to 

582 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

583 nested `_TaskScaffolding` and `_DatasetDict` objects. 

584 

585 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

586 returns related tuples of all dimensions used to identify any regular 

587 input, output, and intermediate datasets (not prerequisites). We then 

588 iterate over these tuples of related dimensions, identifying the subsets 

589 that correspond to distinct data IDs for each task and dataset type, 

590 and then create `_QuantumScaffolding` objects. 

591 

592 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

593 dataset data IDs previously identified, transforming unresolved 

594 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

595 up prerequisite datasets for all quanta. 

596 

597 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

598 per-task `_QuantumScaffolding` objects. 

599 """ 

600 

601 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry): 

602 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

603 self.tasks = [] 

604 # Aggregate and categorize the DatasetTypes in the Pipeline. 

605 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

606 # Construct dictionaries that map those DatasetTypes to structures 

607 # that will (later) hold additional information about them. 

608 for attr in ( 

609 "initInputs", 

610 "initIntermediates", 

611 "initOutputs", 

612 "inputs", 

613 "intermediates", 

614 "outputs", 

615 "prerequisites", 

616 ): 

617 setattr( 

618 self, 

619 attr, 

620 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

621 ) 

622 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

623 # Aggregate all dimensions for all non-init, non-prerequisite 

624 # DatasetTypes. These are the ones we'll include in the big join 

625 # query. 

626 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

627 # Construct scaffolding nodes for each Task, and add backreferences 

628 # to the Task from each DatasetScaffolding node. 

629 # Note that there's only one scaffolding node for each DatasetType, 

630 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

631 # reference it. 

632 if isinstance(pipeline, Pipeline): 

633 pipeline = pipeline.toExpandedPipeline() 

634 self.tasks = [ 

635 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

636 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

637 ] 

638 

639 def __repr__(self) -> str: 

640 # Default dataclass-injected __repr__ gets caught in an infinite loop 

641 # because of back-references. 

642 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

643 

644 tasks: List[_TaskScaffolding] 

645 """Scaffolding data structures for each task in the pipeline 

646 (`list` of `_TaskScaffolding`). 

647 """ 

648 

649 initInputs: _DatasetDict 

650 """Datasets consumed but not produced when constructing the tasks in this 

651 pipeline (`_DatasetDict`). 

652 """ 

653 

654 initIntermediates: _DatasetDict 

655 """Datasets that are both consumed and produced when constructing the tasks 

656 in this pipeline (`_DatasetDict`). 

657 """ 

658 

659 initOutputs: _DatasetDict 

660 """Datasets produced but not consumed when constructing the tasks in this 

661 pipeline (`_DatasetDict`). 

662 """ 

663 

664 inputs: _DatasetDict 

665 """Datasets that are consumed but not produced when running this pipeline 

666 (`_DatasetDict`). 

667 """ 

668 

669 intermediates: _DatasetDict 

670 """Datasets that are both produced and consumed when running this pipeline 

671 (`_DatasetDict`). 

672 """ 

673 

674 outputs: _DatasetDict 

675 """Datasets produced but not consumed when when running this pipeline 

676 (`_DatasetDict`). 

677 """ 

678 

679 prerequisites: _DatasetDict 

680 """Datasets that are consumed when running this pipeline and looked up 

681 per-Quantum when generating the graph (`_DatasetDict`). 

682 """ 

683 

684 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

685 """Datasets that should be used as constraints in the initial query, 

686 according to tasks (`NamedValueSet`). 

687 """ 

688 

689 dimensions: DimensionGraph 

690 """All dimensions used by any regular input, intermediate, or output 

691 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

692 Query" (`DimensionGraph`). 

693 

694 This is required to be a superset of all task quantum dimensions. 

695 """ 

696 

697 globalInitOutputs: _DatasetDict | None = None 

698 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

699 """ 

700 

701 @contextmanager 

702 def connectDataIds( 

703 self, 

704 registry: Registry, 

705 collections: Any, 

706 userQuery: Optional[str], 

707 externalDataId: DataCoordinate, 

708 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

709 bind: Optional[Mapping[str, Any]] = None, 

710 ) -> Iterator[DataCoordinateQueryResults]: 

711 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

712 

713 This method populates `_TaskScaffolding.quanta` and the nested 

714 `_DatasetDict` data ID mappings (except for those in `prerequisites`). 

715 

716 Parameters 

717 ---------- 

718 registry : `lsst.daf.butler.Registry` 

719 Registry for the data repository; used for all data ID queries. 

720 collections 

721 Expressions representing the collections to search for input 

722 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

723 userQuery : `str` or `None` 

724 User-provided expression to limit the data IDs processed. 

725 externalDataId : `DataCoordinate` 

726 Externally-provided data ID that should be used to restrict the 

727 results, just as if these constraints had been included via ``AND`` 

728 in ``userQuery``. This includes (at least) any instrument named 

729 in the pipeline definition. 

730 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

731 The query constraint variant that should be used to constrain the 

732 query based on dataset existence, defaults to 

733 `DatasetQueryConstraintVariant.ALL`. 

734 bind : `Mapping`, optional 

735 Mapping containing literal values that should be injected into the 

736 ``userQuery`` expression, keyed by the identifiers they replace. 

737 

738 Returns 

739 ------- 

740 commonDataIds : \ 

741 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

742 An interface to a database temporary table containing all data IDs 

743 that will appear in this `QuantumGraph`. Returned inside a 

744 context manager, which will drop the temporary table at the end of 

745 the `with` block in which this method is called. 

746 """ 

747 _LOG.debug("Building query for data IDs.") 

748 # Initialization datasets always have empty data IDs. 

749 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

750 for datasetType, refs in itertools.chain( 

751 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

752 ): 

753 with warnings.catch_warnings(): 

754 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

755 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

756 # Run one big query for the data IDs for task dimensions and regular 

757 # inputs and outputs. We limit the query to only dimensions that are 

758 # associated with the input dataset types, but don't (yet) try to 

759 # obtain the dataset_ids for those inputs. 

760 _LOG.debug( 

761 "Submitting data ID query over dimensions %s and materializing results.", 

762 list(self.dimensions.names), 

763 ) 

764 queryArgs: Dict[str, Any] = { 

765 "dimensions": self.dimensions, 

766 "where": userQuery, 

767 "dataId": externalDataId, 

768 "bind": bind, 

769 } 

770 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

771 _LOG.debug( 

772 "Constraining graph query using default of %s.", 

773 list(self.defaultDatasetQueryConstraints.names), 

774 ) 

775 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

776 queryArgs["collections"] = collections 

777 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

778 _LOG.debug("Not using dataset existence to constrain query.") 

779 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

780 constraint = set(datasetQueryConstraint) 

781 inputs = {k.name: k for k in self.inputs.keys()} 

782 if remainder := constraint.difference(inputs.keys()): 

783 raise ValueError( 

784 f"{remainder} dataset type(s) specified as a graph constraint, but" 

785 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

786 ) 

787 _LOG.debug(f"Constraining graph query using {constraint}") 

788 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

789 queryArgs["collections"] = collections 

790 else: 

791 raise ValueError( 

792 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

793 ) 

794 

795 if "datasets" in queryArgs: 

796 for i, dataset_type in enumerate(queryArgs["datasets"]): 

797 if dataset_type.isComponent(): 

798 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

799 

800 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

801 _LOG.debug("Expanding data IDs.") 

802 commonDataIds = commonDataIds.expanded() 

803 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

804 # Iterate over query results, populating data IDs for datasets and 

805 # quanta and then connecting them to each other. 

806 n = -1 

807 for n, commonDataId in enumerate(commonDataIds): 

808 # Create DatasetRefs for all DatasetTypes from this result row, 

809 # noting that we might have created some already. 

810 # We remember both those that already existed and those that we 

811 # create now. 

812 refsForRow = {} 

813 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

814 for datasetType, refs in itertools.chain( 

815 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

816 ): 

817 datasetDataId: Optional[DataCoordinate] 

818 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

819 datasetDataId = commonDataId.subset(datasetType.dimensions) 

820 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

821 ref = refs.get(datasetDataId) 

822 if ref is None: 

823 with warnings.catch_warnings(): 

824 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

825 ref = DatasetRef(datasetType, datasetDataId) 

826 refs[datasetDataId] = ref 

827 refsForRow[datasetType.name] = ref 

828 # Create _QuantumScaffolding objects for all tasks from this 

829 # result row, noting that we might have created some already. 

830 for task in self.tasks: 

831 quantumDataId = commonDataId.subset(task.dimensions) 

832 quantum = task.quanta.get(quantumDataId) 

833 if quantum is None: 

834 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

835 task.quanta[quantumDataId] = quantum 

836 # Whether this is a new quantum or an existing one, we can 

837 # now associate the DatasetRefs for this row with it. The 

838 # fact that a Quantum data ID and a dataset data ID both 

839 # came from the same result row is what tells us they 

840 # should be associated. 

841 # Many of these associations will be duplicates (because 

842 # another query row that differed from this one only in 

843 # irrelevant dimensions already added them); the data-ID-keyed 

844 # dicts simply overwrite the repeats. 

845 for datasetType in task.inputs: 

846 ref = refsForRow[datasetType.name] 

847 quantum.inputs[datasetType.name][ref.dataId] = ref 

848 for datasetType in task.outputs: 

849 ref = refsForRow[datasetType.name] 

850 quantum.outputs[datasetType.name][ref.dataId] = ref 

851 if n < 0: 

852 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

853 emptiness_explained = False 

854 for message in commonDataIds.explain_no_results(): 

855 _LOG.critical(message) 

856 emptiness_explained = True 

857 if not emptiness_explained: 

858 _LOG.critical( 

859 "To reproduce this query for debugging purposes, run " 

860 "Registry.queryDataIds with these arguments:" 

861 ) 

862 # We could just repr() the queryArgs dict to get something 

863 # the user could make sense of, but it's friendlier to 

864 # put these args in an easier-to-construct equivalent form 

865 # so they can read it more easily and copy and paste into 

866 # a Python terminal. 

867 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

868 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

869 if queryArgs["where"]: 

870 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

871 if "datasets" in queryArgs: 

872 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

873 if "collections" in queryArgs: 

874 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

875 _LOG.debug("Finished processing %d rows from data ID query.", n) 

876 yield commonDataIds 

877 
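# If the query above returns no rows, it can be reproduced directly against
# the registry for debugging, as the critical log messages suggest. A hedged
# sketch (the argument values are placeholders for the logged ones):
#
#     data_ids = registry.queryDataIds(
#         dimensions=list_of_dimension_names,
#         dataId=external_data_id_by_name,
#         where=user_query,
#         datasets=constraint_dataset_types,  # only if dataset-constrained
#         collections=collections,            # only if dataset-constrained
#     )
#     print(list(data_ids.explain_no_results()))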

878 def resolveDatasetRefs( 

879 self, 

880 registry: Registry, 

881 collections: Any, 

882 run: Optional[str], 

883 commonDataIds: DataCoordinateQueryResults, 

884 *, 

885 skipExistingIn: Any = None, 

886 clobberOutputs: bool = True, 

887 constrainedByAllDatasets: bool = True, 

888 resolveRefs: bool = False, 

889 ) -> None: 

890 """Perform follow up queries for each dataset data ID produced in 

891 `fillDataIds`. 

892 

893 This method populates the resolved `DatasetRef` values in the nested 

894 `_DatasetDict` mappings and looks up prerequisite datasets for each quantum. 

895 

896 Parameters 

897 ---------- 

898 registry : `lsst.daf.butler.Registry` 

899 Registry for the data repository; used for all data ID queries. 

900 collections 

901 Expressions representing the collections to search for input 

902 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

903 run : `str`, optional 

904 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

905 output datasets, if it already exists. 

906 commonDataIds : \ 

907 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

908 Result of a previous call to `connectDataIds`. 

909 skipExistingIn 

910 Expressions representing the collections to search for existing 

911 output datasets that should be skipped. See 

912 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

913 `None` or empty string/sequence disables skipping. 

914 clobberOutputs : `bool`, optional 

915 If `True` (default), allow quanta to be created even if outputs exist; 

916 this requires the same behavior to be enabled when 

917 executing. If ``skipExistingIn`` is not `None`, completed quanta 

918 (those with metadata, or all outputs if there is no metadata 

919 dataset configured) will be skipped rather than clobbered. 

920 constrainedByAllDatasets : `bool`, optional 

921 Indicates if the commonDataIds were generated with a constraint on 

922 all dataset types. 

923 resolveRefs : `bool`, optional 

924 If `True` then resolve all input references and generate random 

925 dataset IDs for all output and intermediate datasets. A `True` value 

926 requires the ``run`` collection to be specified. 

927 

928 Raises 

929 ------ 

930 OutputExistsError 

931 Raised if an output dataset already exists in the output run 

932 and ``skipExistingIn`` does not include the output run, or if only 

933 some outputs are present and ``clobberOutputs`` is `False`. 

934 """ 

935 # Run may be provided but it does not have to exist; in that case we 

936 # use it for resolving references but don't check it for existing refs. 

937 run_exists = False 

938 if run: 

939 try: 

940 run_exists = bool(registry.queryCollections(run)) 

941 except MissingCollectionError: 

942 # An undocumented exception is raised if it does not exist. 

943 pass 

944 

945 skip_collections_wildcard: CollectionWildcard | None = None 

946 skipExistingInRun = False 

947 if skipExistingIn: 

948 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

949 if run_exists: 

950 # As an optimization, check the explicit list of names first. 

951 skipExistingInRun = run in skip_collections_wildcard.strings 

952 if not skipExistingInRun: 

953 # need to flatten it and check again 

954 skipExistingInRun = run in registry.queryCollections( 

955 skipExistingIn, 

956 collectionTypes=CollectionType.RUN, 

957 ) 

958 

959 idMaker: Optional[_DatasetIdMaker] = None 

960 if resolveRefs: 

961 assert run is not None, "run cannot be None when resolveRefs is True" 

962 idMaker = _DatasetIdMaker(registry, run) 

963 

964 resolvedRefQueryResults: Iterable[DatasetRef] 

965 

966 # Updating constrainedByAllDatasets here is not ideal, but we have a 

967 # few different code paths that each transfer different pieces of 

968 # information about what dataset query constraints were applied here, 

969 # and none of them has the complete picture until we get here. We're 

970 # long overdue for a QG generation rewrite that will make this go away 

971 # entirely anyway. 

972 constrainedByAllDatasets = ( 

973 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

974 ) 

975 

976 # Look up [init] intermediate and output datasets in the output 

977 # collection, if there is an output collection. 

978 if run_exists or skip_collections_wildcard is not None: 

979 for datasetType, refs in itertools.chain( 

980 self.initIntermediates.items(), 

981 self.initOutputs.items(), 

982 self.intermediates.items(), 

983 self.outputs.items(), 

984 ): 

985 _LOG.debug( 

986 "Resolving %d datasets for intermediate and/or output dataset %s.", 

987 len(refs), 

988 datasetType.name, 

989 ) 

990 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

991 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

992 # TODO: this assert incorrectly bans component inputs; 

993 # investigate on DM-33027. 

994 # assert not datasetType.isComponent(), \ 

995 # "Output datasets cannot be components." 

996 # 

997 # Instead we have to handle them manually to avoid a 

998 # deprecation warning, but it is at least confusing and 

999 # possibly a bug for components to appear here at all. 

1000 if datasetType.isComponent(): 

1001 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1002 component = datasetType.component() 

1003 else: 

1004 parent_dataset_type = datasetType 

1005 component = None 

1006 

1007 # look at RUN collection first 

1008 if run_exists: 

1009 try: 

1010 resolvedRefQueryResults = subset.findDatasets( 

1011 parent_dataset_type, collections=run, findFirst=True 

1012 ) 

1013 except MissingDatasetTypeError: 

1014 resolvedRefQueryResults = [] 

1015 for resolvedRef in resolvedRefQueryResults: 

1016 # TODO: we could easily support per-DatasetType 

1017 # skipExisting and I could imagine that being useful - 

1018 # it's probably required in order to support writing 

1019 # initOutputs before QuantumGraph generation. 

1020 assert resolvedRef.dataId in refs 

1021 if not (skipExistingInRun or isInit or clobberOutputs): 

1022 raise OutputExistsError( 

1023 f"Output dataset {datasetType.name} already exists in " 

1024 f"output RUN collection '{run}' with data ID" 

1025 f" {resolvedRef.dataId}." 

1026 ) 

1027 # If we are going to resolve all outputs then we have 

1028 # to remember existing ones to avoid generating new 

1029 # dataset IDs for them. 

1030 if resolveRefs: 

1031 refs[resolvedRef.dataId] = ( 

1032 resolvedRef.makeComponentRef(component) 

1033 if component is not None 

1034 else resolvedRef 

1035 ) 

1036 

1037 # And check skipExistingIn too; if the RUN collection is in 

1038 # it, that case was handled above. 

1039 if skip_collections_wildcard is not None: 

1040 try: 

1041 resolvedRefQueryResults = subset.findDatasets( 

1042 parent_dataset_type, collections=skip_collections_wildcard, findFirst=True 

1043 ) 

1044 except MissingDatasetTypeError: 

1045 resolvedRefQueryResults = [] 

1046 for resolvedRef in resolvedRefQueryResults: 

1047 assert resolvedRef.dataId in refs 

1048 refs[resolvedRef.dataId] = ( 

1049 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1050 ) 

1051 

1052 # Look up input and initInput datasets in the input collection(s). 

1053 # Container to accumulate unfound refs, used when the common data IDs 

1054 # were not constrained on dataset type existence. 

1055 self.unfoundRefs = set() 

1056 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1057 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

1058 if datasetType.isComponent(): 

1059 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1060 component = datasetType.component() 

1061 else: 

1062 parent_dataset_type = datasetType 

1063 component = None 

1064 try: 

1065 resolvedRefQueryResults = commonDataIds.subset( 

1066 datasetType.dimensions, unique=True 

1067 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1068 except MissingDatasetTypeError: 

1069 resolvedRefQueryResults = [] 

1070 dataIdsNotFoundYet = set(refs.keys()) 

1071 for resolvedRef in resolvedRefQueryResults: 

1072 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1073 refs[resolvedRef.dataId] = ( 

1074 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1075 ) 

1076 if dataIdsNotFoundYet: 

1077 if constrainedByAllDatasets: 

1078 raise RuntimeError( 

1079 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1080 f"'{datasetType.name}' was/were present in a previous " 

1081 "query, but could not be found now. " 

1082 "This is either a logic bug in QuantumGraph generation " 

1083 "or the input collections have been modified since " 

1084 "QuantumGraph generation began." 

1085 ) 

1086 elif not datasetType.dimensions: 

1087 raise RuntimeError( 

1088 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1089 f"collections {collections}." 

1090 ) 

1091 else: 

1092 # if the common dataIds were not constrained using all the 

1093 # input dataset types, it is possible that some data IDs 

1094 # found don't correspond to existing datasets and they 

1095 # will be left unresolved. Mark these for later pruning from 

1096 # the quantum graph. 

1097 for k in dataIdsNotFoundYet: 

1098 self.unfoundRefs.add(refs[k]) 

1099 

1100 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1101 # replacing the unresolved refs there, and then look up prerequisites. 

1102 for task in self.tasks: 

1103 _LOG.debug( 

1104 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1105 len(task.quanta), 

1106 task.taskDef.label, 

1107 ) 

1108 # The way iterConnections is designed makes it impossible to 

1109 # annotate precisely enough to satisfy MyPy here. 

1110 lookupFunctions = { 

1111 c.name: c.lookupFunction # type: ignore 

1112 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1113 if c.lookupFunction is not None # type: ignore 

1114 } 

1115 dataIdsFailed = [] 

1116 dataIdsSucceeded = [] 

1117 for quantum in task.quanta.values(): 

1118 # Process output datasets only if skipExistingIn is not None 

1119 # or there is a run to look for outputs in and clobberOutputs 

1120 # is True. Note that if skipExistingIn is None, any output 

1121 # datasets that already exist would have already caused an 

1122 # exception to be raised. We never update the DatasetRefs in 

1123 # the quantum because those should never be resolved. 

1124 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1125 resolvedRefs = [] 

1126 unresolvedRefs = [] 

1127 haveMetadata = False 

1128 for datasetType, originalRefs in quantum.outputs.items(): 

1129 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1130 if ref.id is not None: 

1131 resolvedRefs.append(ref) 

1132 if datasetType.name == task.taskDef.metadataDatasetName: 

1133 haveMetadata = True 

1134 else: 

1135 unresolvedRefs.append(ref) 

1136 if resolvedRefs: 

1137 if haveMetadata or not unresolvedRefs: 

1138 dataIdsSucceeded.append(quantum.dataId) 

1139 if skip_collections_wildcard is not None: 

1140 continue 

1141 else: 

1142 dataIdsFailed.append(quantum.dataId) 

1143 if not clobberOutputs: 

1144 raise OutputExistsError( 

1145 f"Quantum {quantum.dataId} of task with label " 

1146 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1147 f"({resolvedRefs}) " 

1148 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

1149 "and clobbering outputs was not enabled." 

1150 ) 

1151 # Update the input DatasetRefs to the resolved ones we already 

1152 # searched for. 

1153 for datasetType, input_refs in quantum.inputs.items(): 

1154 for ref in task.inputs.extract(datasetType, input_refs.keys()): 

1155 input_refs[ref.dataId] = ref 

1156 # Look up prerequisite datasets in the input collection(s). 

1157 # These may have dimensions that extend beyond those we queried 

1158 # for originally, because we want to permit those data ID 

1159 # values to differ across quanta and dataset types. 

1160 for datasetType in task.prerequisites: 

1161 if datasetType.isComponent(): 

1162 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1163 component = datasetType.component() 

1164 else: 

1165 parent_dataset_type = datasetType 

1166 component = None 

1167 lookupFunction = lookupFunctions.get(datasetType.name) 

1168 if lookupFunction is not None: 

1169 # PipelineTask has provided its own function to do the 

1170 # lookup. This always takes precedence. 

1171 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1172 elif ( 

1173 datasetType.isCalibration() 

1174 and datasetType.dimensions <= quantum.dataId.graph 

1175 and quantum.dataId.graph.temporal 

1176 ): 

1177 # This is a master calibration lookup, which we have to 

1178 # handle specially because the query system can't do a 

1179 # temporal join on a non-dimension-based timespan yet. 

1180 timespan = quantum.dataId.timespan 

1181 try: 

1182 prereq_ref = registry.findDataset( 

1183 parent_dataset_type, 

1184 quantum.dataId, 

1185 collections=collections, 

1186 timespan=timespan, 

1187 ) 

1188 if prereq_ref is not None: 

1189 if component is not None: 

1190 prereq_ref = prereq_ref.makeComponentRef(component) 

1191 prereq_refs = [prereq_ref] 

1192 else: 

1193 prereq_refs = [] 

1194 except (KeyError, MissingDatasetTypeError): 

1195 # This dataset type is not present in the registry, 

1196 # which just means there are no datasets here. 

1197 prereq_refs = [] 

1198 else: 

1199 # Most general case. 

1200 prereq_refs = [ 

1201 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1202 for prereq_ref in registry.queryDatasets( 

1203 parent_dataset_type, 

1204 collections=collections, 

1205 dataId=quantum.dataId, 

1206 findFirst=True, 

1207 ).expanded() 

1208 ] 

1209 prereq_refs_map = {ref.dataId: ref for ref in prereq_refs if ref is not None} 

1210 quantum.prerequisites[datasetType].update(prereq_refs_map) 

1211 task.prerequisites[datasetType].update(prereq_refs_map) 

1212 

1213 # Resolve all quantum inputs and outputs. 

1214 if idMaker: 

1215 for datasetDict in (quantum.inputs, quantum.outputs): 

1216 for refDict in datasetDict.values(): 

1217 refDict.update(idMaker.resolveDict(refDict)) 

1218 

1219 # Resolve task initInputs and initOutputs. 

1220 if idMaker: 

1221 for datasetDict in (task.initInputs, task.initOutputs): 

1222 for refDict in datasetDict.values(): 

1223 refDict.update(idMaker.resolveDict(refDict)) 

1224 

1225 # Actually remove any quanta that we decided to skip above. 

1226 if dataIdsSucceeded: 

1227 if skip_collections_wildcard is not None: 

1228 _LOG.debug( 

1229 "Pruning successful %d quanta for task with label '%s' because all of their " 

1230 "outputs exist or metadata was written successfully.", 

1231 len(dataIdsSucceeded), 

1232 task.taskDef.label, 

1233 ) 

1234 for dataId in dataIdsSucceeded: 

1235 del task.quanta[dataId] 

1236 elif clobberOutputs: 

1237 _LOG.info( 

1238 "Found %d successful quanta for task with label '%s' " 

1239 "that will need to be clobbered during execution.", 

1240 len(dataIdsSucceeded), 

1241 task.taskDef.label, 

1242 ) 

1243 else: 

1244 raise AssertionError("OutputExistsError should have already been raised.") 

1245 if dataIdsFailed: 

1246 if clobberOutputs: 

1247 _LOG.info( 

1248 "Found %d failed/incomplete quanta for task with label '%s' " 

1249 "that will need to be clobbered during execution.", 

1250 len(dataIdsFailed), 

1251 task.taskDef.label, 

1252 ) 

1253 else: 

1254 raise AssertionError("OutputExistsError should have already been raised.") 

1255 

1256 # Collect initOutputs that do not belong to any task. 

1257 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1258 for task in self.tasks: 

1259 global_dataset_types -= set(task.initOutputs) 

1260 if global_dataset_types: 

1261 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1262 if idMaker is not None: 

1263 for refDict in self.globalInitOutputs.values(): 

1264 refDict.update(idMaker.resolveDict(refDict)) 

1265 
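# Summary sketch (comments only, paraphrasing the policy implemented above)
# for a quantum whose outputs already exist in the output RUN collection or
# in ``skipExistingIn``:
#
#     if all outputs exist, or the metadata output exists:
#         if skipExistingIn matched:  skip (drop) the quantum
#         elif clobberOutputs:        keep it; execution will clobber outputs
#     elif some outputs exist (partial, no metadata):
#         if clobberOutputs:          keep it; execution will clobber outputs
#         else:                       raise OutputExistsError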

1266 def makeQuantumGraph( 

1267 self, 

1268 registry: Registry, 

1269 metadata: Optional[Mapping[str, Any]] = None, 

1270 datastore: Optional[Datastore] = None, 

1271 ) -> QuantumGraph: 

1272 """Create a `QuantumGraph` from the quanta already present in 

1273 the scaffolding data structure. 

1274 

1275 Parameters 

1276 ---------- 

1277 registry : `lsst.daf.butler.Registry` 

1278 Registry for the data repository; used for all data ID queries. 

1279 metadata : Optional Mapping of `str` to primitives 

1280 This is an optional parameter of extra data to carry with the 

1281 graph. Entries in this mapping should be able to be serialized in 

1282 JSON. 

1283 datastore : `Datastore`, optional 

1284 If not `None` then fill datastore records in each generated 

1285 Quantum. 

1286 

1287 Returns 

1288 ------- 

1289 graph : `QuantumGraph` 

1290 The full `QuantumGraph`. 

1291 """ 

1292 

1293 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1294 """Extract all DatasetRefs from the dictionaries""" 

1295 for ref_dict in dataset_dict.values(): 

1296 yield from ref_dict.values() 

1297 

1298 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1299 if datastore is not None: 

1300 datastore_records = datastore.export_records( 

1301 itertools.chain( 

1302 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites) 

1303 ) 

1304 ) 

1305 

1306 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

1307 for task in self.tasks: 

1308 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records) 

1309 graphInput[task.taskDef] = qset 

1310 

1311 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks} 

1312 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks} 

1313 

1314 globalInitOutputs: list[DatasetRef] = [] 

1315 if self.globalInitOutputs is not None: 

1316 for refs_dict in self.globalInitOutputs.values(): 

1317 globalInitOutputs.extend(refs_dict.values()) 

1318 

1319 graph = QuantumGraph( 

1320 graphInput, 

1321 metadata=metadata, 

1322 pruneRefs=self.unfoundRefs, 

1323 universe=self.dimensions.universe, 

1324 initInputs=taskInitInputs, 

1325 initOutputs=taskInitOutputs, 

1326 globalInitOutputs=globalInitOutputs, 

1327 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1328 ) 

1329 return graph 

1330 

1331 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1332 """Make a list of all dataset types used by a graph as defined in 

1333 registry. 

1334 """ 

1335 chain = [ 

1336 self.initInputs, 

1337 self.initIntermediates, 

1338 self.initOutputs, 

1339 self.inputs, 

1340 self.intermediates, 

1341 self.outputs, 

1342 self.prerequisites, 

1343 ] 

1344 if self.globalInitOutputs is not None: 

1345 chain.append(self.globalInitOutputs) 

1346 

1347 # Collect names of all dataset types. 

1348 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain)) 

1349 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1350 

1351 # Check for types that do not exist in registry yet: 

1352 # - inputs must exist 

1353 # - intermediates and outputs may not exist, but there must not be 

1354 # more than one definition (e.g. differing in storage class) 

1355 # - prerequisites may not exist, treat it the same as outputs here 

1356 for dstype in itertools.chain(self.initInputs, self.inputs): 

1357 if dstype.name not in dataset_types: 

1358 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1359 

1360 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1361 chain = [ 

1362 self.initIntermediates, 

1363 self.initOutputs, 

1364 self.intermediates, 

1365 self.outputs, 

1366 self.prerequisites, 

1367 ] 

1368 if self.globalInitOutputs is not None: 

1369 chain.append(self.globalInitOutputs) 

1370 for dstype in itertools.chain(*chain): 

1371 if dstype.name not in dataset_types: 

1372 new_outputs[dstype.name].add(dstype) 

1373 for name, dstypes in new_outputs.items(): 

1374 if len(dstypes) > 1: 

1375 raise ValueError( 

1376 "Pipeline contains multiple definitions for a dataset type " 

1377 f"which is not defined in registry yet: {dstypes}" 

1378 ) 

1379 elif len(dstypes) == 1: 

1380 dataset_types[name] = dstypes.pop() 

1381 

1382 return dataset_types.values() 

1383 

1384 

1385# ------------------------ 

1386# Exported definitions -- 

1387# ------------------------ 

1388 

1389 

1390class GraphBuilderError(Exception): 

1391 """Base class for exceptions generated by graph builder.""" 

1392 

1393 pass 

1394 

1395 

1396class OutputExistsError(GraphBuilderError): 

1397 """Exception generated when output datasets already exist.""" 

1398 

1399 pass 

1400 

1401 

1402class PrerequisiteMissingError(GraphBuilderError): 

1403 """Exception generated when a prerequisite dataset does not exist.""" 

1404 

1405 pass 

1406 

1407 

1408class GraphBuilder: 

1409 """GraphBuilder class is responsible for building task execution graph from 

1410 a Pipeline. 

1411 

1412 Parameters 

1413 ---------- 

1414 registry : `~lsst.daf.butler.Registry` 

1415 Registry for the data repository. 

1416 skipExistingIn 

1417 Expressions representing the collections to search for existing 

1418 output datasets that should be skipped. See 

1419 :ref:`daf_butler_ordered_collection_searches`. 

1420 clobberOutputs : `bool`, optional 

1421 If `True` (default), allow quanta to be created even if partial outputs 

1422 exist; this requires the same behavior to be enabled when 

1423 executing. 

1424 datastore : `Datastore`, optional 

1425 If not `None` then fill datastore records in each generated Quantum. 

1426 """ 

1427 

1428 def __init__( 

1429 self, 

1430 registry: Registry, 

1431 skipExistingIn: Any = None, 

1432 clobberOutputs: bool = True, 

1433 datastore: Optional[Datastore] = None, 

1434 ): 

1435 self.registry = registry 

1436 self.dimensions = registry.dimensions 

1437 self.skipExistingIn = skipExistingIn 

1438 self.clobberOutputs = clobberOutputs 

1439 self.datastore = datastore 

1440 

1441 def makeGraph( 

1442 self, 

1443 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1444 collections: Any, 

1445 run: Optional[str], 

1446 userQuery: Optional[str], 

1447 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1448 metadata: Optional[Mapping[str, Any]] = None, 

1449 resolveRefs: bool = False, 

1450 bind: Optional[Mapping[str, Any]] = None, 

1451 ) -> QuantumGraph: 

1452 """Create execution graph for a pipeline. 

1453 

1454 Parameters 

1455 ---------- 

1456 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1457 Pipeline definition, task names/classes and their configs. 

1458 collections 

1459 Expressions representing the collections to search for input 

1460 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1461 run : `str`, optional 

1462 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1463 output datasets. Collection does not have to exist and it will be 

1464 created when graph is executed. 

1465 userQuery : `str` 

1466 String which defines user-defined selection for registry, should be 

1467 empty or `None` if there are no restrictions on data selection. 

1468 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1469 The query constraint variant that should be used to constrain the 

1470 query based on dataset existence, defaults to 

1471 `DatasetQueryConstraintVariant.ALL`. 

1472 metadata : Optional Mapping of `str` to primitives 

1473 This is an optional parameter of extra data to carry with the 

1474 graph. Entries in this mapping should be able to be serialized in 

1475 JSON. 

1476 resolveRefs : `bool`, optional 

1477 If `True` then resolve all input references and generate random 

1478 dataset IDs for all output and intermediate datasets. A `True` value 

1479 requires the ``run`` collection to be specified. 

1480 bind : `Mapping`, optional 

1481 Mapping containing literal values that should be injected into the 

1482 ``userQuery`` expression, keyed by the identifiers they replace. 

1483 

1484 Returns 

1485 ------- 

1486 graph : `QuantumGraph` 

1487 

1488 Raises 

1489 ------ 

1490 UserExpressionError 

1491 Raised when user expression cannot be parsed. 

1492 OutputExistsError 

1493 Raised when output datasets already exist. 

1494 Exception 

1495 Other exceptions types may be raised by underlying registry 

1496 classes. 

1497 """ 

1498 if resolveRefs and run is None: 

1499 raise ValueError("`resolveRefs` requires `run` parameter.") 

1500 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1501 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1502 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1503 instrument_class: Optional[Any] = None 

1504 if isinstance(pipeline, Pipeline): 

1505 instrument_class_name = pipeline.getInstrument() 

1506 if instrument_class_name is not None: 

1507 instrument_class = doImportType(instrument_class_name) 

1508 pipeline = list(pipeline.toExpandedPipeline()) 

1509 if instrument_class is not None: 

1510 dataId = DataCoordinate.standardize( 

1511 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1512 ) 

1513 else: 

1514 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1515 with scaffolding.connectDataIds( 

1516 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1517 ) as commonDataIds: 

1518 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1519 scaffolding.resolveDatasetRefs( 

1520 self.registry, 

1521 collections, 

1522 run, 

1523 commonDataIds, 

1524 skipExistingIn=self.skipExistingIn, 

1525 clobberOutputs=self.clobberOutputs, 

1526 constrainedByAllDatasets=condition, 

1527 resolveRefs=resolveRefs, 

1528 ) 

1529 return scaffolding.makeQuantumGraph( 

1530 registry=self.registry, metadata=metadata, datastore=self.datastore 

1531 )
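
# A minimal end-to-end usage sketch (hypothetical `butler`, collection names,
# and query string; this mirrors how higher-level tooling drives this class,
# not an exact recipe):
#
#     builder = GraphBuilder(butler.registry, datastore=butler.datastore)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/someone/my-run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )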