Coverage for python/lsst/pipe/base/graphBuilder.py: 16%

456 statements  


1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap 

34from contextlib import contextmanager 

35from dataclasses import dataclass 

36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union 

37 

38from lsst.daf.butler import ( 

39 CollectionType, 

40 DataCoordinate, 

41 DatasetIdGenEnum, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 Quantum, 

50 Registry, 

51) 

52from lsst.daf.butler.registry import MissingDatasetTypeError 

53from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

54from lsst.daf.butler.registry.wildcards import CollectionWildcard 

55from lsst.utils import doImportType 

56 

57from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

58from ._status import NoWorkFound 

59 

60# ----------------------------- 

61# Imports for other modules -- 

62# ----------------------------- 

63from .connections import AdjustQuantumHelper, iterConnections 

64from .graph import QuantumGraph 

65from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

66 

67# ---------------------------------- 

68# Local non-exported definitions -- 

69# ---------------------------------- 

70 

71_LOG = logging.getLogger(__name__) 

72 

73 

74class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

75 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

76 the known `DatasetRef` instances of that type. 

77 

78 Parameters 

79 ---------- 

80 args 

81 Positional arguments are forwarded to the `dict` constructor. 

82 universe : `DimensionUniverse` 

83 Universe of all possible dimensions. 

84 """ 

85 

86 def __init__(self, *args: Any, universe: DimensionUniverse): 

87 super().__init__(*args) 

88 self.universe = universe 

89 

90 @classmethod 

91 def fromDatasetTypes( 

92 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

93 ) -> _DatasetDict: 

94 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

95 

96 Parameters 

97 ---------- 

98 datasetTypes : `iterable` of `DatasetType` 

99 DatasetTypes to use as keys for the dict. Values will be empty 

100 dictionaries. 

101 universe : `DimensionUniverse` 

102 Universe of all possible dimensions. 

103 

104 Returns 

105 ------- 

106 dictionary : `_DatasetDict` 

107 A new `_DatasetDict` instance. 

108 """ 

109 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

110 

111 @classmethod 

112 def fromSubset( 

113 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

114 ) -> _DatasetDict: 

115 """Return a new dictionary by extracting items corresponding to the 

116 given keys from one or more existing dictionaries. 

117 

118 Parameters 

119 ---------- 

120 datasetTypes : `iterable` of `DatasetType` 

121 DatasetTypes to use as keys for the dict. Values will be obtained 

122 by lookups against ``first`` and ``rest``. 

123 first : `_DatasetDict` 

124 Another dictionary from which to extract values. 

125 rest 

126 Additional dictionaries from which to extract values. 

127 

128 Returns 

129 ------- 

130 dictionary : `_DatasetDict` 

131 A new dictionary instance. 

132 """ 

133 combined = ChainMap(first, *rest) 

134 

135 # Dataset types known to match immediately can be processed 

136 # without checks. 

137 matches = combined.keys() & set(datasetTypes) 

138 _dict = {k: combined[k] for k in matches} 

139 

140 if len(_dict) < len(datasetTypes): 

141 # Work out which ones are missing. 

142 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

143 

144 # Get the known names for comparison. 

145 combined_by_name = {k.name: k for k in combined} 

146 

147 missing = set() 

148 incompatible = {} 

149 for datasetType in missing_datasetTypes: 

150 # The dataset type is not found. It may not be listed 

151 # or it may be that it is there with the same name 

152 # but different definition. 

153 if datasetType.name in combined_by_name: 

154 # This implies some inconsistency in definitions 

155 # for connections. If there is support for storage 

156 # class conversion we can let it slide. 

157 # At this point we do not know 

158 # where the inconsistency is but trust that down 

159 # stream code will be more explicit about input 

160 # vs output incompatibilities. 

161 existing = combined_by_name[datasetType.name] 

162 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing): 

163 _LOG.warning( 

164 "Dataset type mismatch (%s != %s) but continuing since they are compatible", 

165 datasetType, 

166 existing, 

167 ) 

168 _dict[datasetType] = combined[existing] 

169 else: 

170 incompatible[datasetType] = existing 

171 else: 

172 missing.add(datasetType) 

173 

174 if missing or incompatible: 

175 reasons = [] 

176 if missing: 

177 reasons.append( 

178 "DatasetTypes {'.'.join(missing)} not present in list of known types: " 

179 + ", ".join(d.name for d in combined) 

180 ) 

181 if incompatible: 

182 for x, y in incompatible.items(): 

183 reasons.append(f"{x} incompatible with {y}") 

184 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

185 

186 return cls(_dict, universe=first.universe) 

187 

188 @property 

189 def dimensions(self) -> DimensionGraph: 

190 """The union of all dimensions used by all dataset types in this 

191 dictionary, including implied dependencies (`DimensionGraph`). 

192 """ 

193 base = self.universe.empty 

194 if len(self) == 0: 

195 return base 

196 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

197 

198 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

199 """Unpack nested single-element `DatasetRef` dicts into a new 

200 mapping with `DatasetType` keys and `DatasetRef` values. 

201 

202 This method assumes that each nested dictionary contains exactly one item, as is the 

203 case for all "init" datasets. 

204 

205 Returns 

206 ------- 

207 dictionary : `NamedKeyDict` 

208 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

209 `DatasetType` instances and string names usable as keys. 

210 """ 

211 

212 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

213 (ref,) = refs.values() 

214 return ref 

215 

216 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

217 

218 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

219 """Unpack nested multi-element `DatasetRef` dicts into a new 

220 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

221 

222 Returns 

223 ------- 

224 dictionary : `NamedKeyDict` 

225 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

226 both `DatasetType` instances and string names usable as keys. 

227 """ 

228 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

229 

230 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

231 """Iterate over the contained `DatasetRef` instances that match the 

232 given `DatasetType` and data IDs. 

233 

234 Parameters 

235 ---------- 

236 datasetType : `DatasetType` 

237 Dataset type to match. 

238 dataIds : `Iterable` [ `DataCoordinate` ] 

239 Data IDs to match. 

240 

241 Returns 

242 ------- 

243 refs : `Iterator` [ `DatasetRef` ] 

244 DatasetRef instances for which ``ref.datasetType == datasetType`` 

245 and ``ref.dataId`` is in ``dataIds``. 

246 """ 

247 refs = self[datasetType] 

248 return (refs[dataId] for dataId in dataIds) 

249 

250 

251class _QuantumScaffolding: 

252 """Helper class aggregating information about a `Quantum`, used when 

253 constructing a `QuantumGraph`. 

254 

255 See `_PipelineScaffolding` for a top-down description of the full 

256 scaffolding data structure. 

257 

258 Parameters 

259 ---------- 

260 task : _TaskScaffolding 

261 Back-reference to the helper object for the `PipelineTask` this quantum 

262 represents an execution of. 

263 dataId : `DataCoordinate` 

264 Data ID for this quantum. 

265 """ 

266 

267 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

268 self.task = task 

269 self.dataId = dataId 

270 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

271 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

272 self.prerequisites = _DatasetDict.fromDatasetTypes( 

273 task.prerequisites.keys(), universe=dataId.universe 

274 ) 

275 

276 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

277 

278 def __repr__(self) -> str: 

279 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

280 

281 task: _TaskScaffolding 

282 """Back-reference to the helper object for the `PipelineTask` this quantum 

283 represents an execution of. 

284 """ 

285 

286 dataId: DataCoordinate 

287 """Data ID for this quantum. 

288 """ 

289 

290 inputs: _DatasetDict 

291 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

292 

293 This is initialized to map each `DatasetType` to an empty dictionary at 

294 construction. Those nested dictionaries are populated (with data IDs as 

295 keys) with unresolved `DatasetRef` instances in 

296 `_PipelineScaffolding.connectDataIds`. 

297 """ 

298 

299 outputs: _DatasetDict 

300 """Nested dictionary containing `DatasetRef` outputs this quantum. 

301 """ 

302 

303 prerequisites: _DatasetDict 

304 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

305 quantum. 

306 """ 

307 

308 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

309 """Transform the scaffolding object into a true `Quantum` instance. 

310 

311 Parameters 

312 ---------- 

313 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

314 If not `None` then fill datastore records in each generated Quantum 

315 using the records from this structure. 

316 

317 Returns 

318 ------- 

319 quantum : `Quantum` 

320 An actual `Quantum` instance. 

321 """ 

322 allInputs = self.inputs.unpackMultiRefs() 

323 allInputs.update(self.prerequisites.unpackMultiRefs()) 

324 # Give the task's Connections class an opportunity to remove some 

325 # inputs, or complain if they are unacceptable. 

326 # This will raise if one of the check conditions is not met, which is 

327 # the intended behavior. 

328 # If it raises NoWorkFound, there is a bug in the QG algorithm 

329 # or the adjustQuantum is incorrectly trying to make a prerequisite 

330 # input behave like a regular input; adjustQuantum should only raise 

331 # NoWorkFound if a regular input is missing, and it shouldn't be 

332 # possible for us to have generated ``self`` if that's true. 

333 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

334 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

335 initInputs = self.task.initInputs.unpackSingleRefs() 

336 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

337 if datastore_records is not None: 

338 quantum_records = {} 

339 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

340 input_refs += list(initInputs.values()) 

341 input_ids = set(ref.id for ref in input_refs if ref.id is not None) 

342 for datastore_name, records in datastore_records.items(): 

343 matching_records = records.subset(input_ids) 

344 if matching_records is not None: 

345 quantum_records[datastore_name] = matching_records 

346 return Quantum( 

347 taskName=self.task.taskDef.taskName, 

348 taskClass=self.task.taskDef.taskClass, 

349 dataId=self.dataId, 

350 initInputs=initInputs, 

351 inputs=helper.inputs, 

352 outputs=helper.outputs, 

353 datastore_records=quantum_records, 

354 ) 

355 

356 

357@dataclass 

358class _TaskScaffolding: 

359 """Helper class aggregating information about a `PipelineTask`, used when 

360 constructing a `QuantumGraph`. 

361 

362 See `_PipelineScaffolding` for a top-down description of the full 

363 scaffolding data structure. 

364 

365 Parameters 

366 ---------- 

367 taskDef : `TaskDef` 

368 Data structure that identifies the task class and its config. 

369 parent : `_PipelineScaffolding` 

370 The parent data structure that will hold the instance being 

371 constructed. 

372 datasetTypes : `TaskDatasetTypes` 

373 Data structure that categorizes the dataset types used by this task. 

374 """ 

375 

376 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

377 universe = parent.dimensions.universe 

378 self.taskDef = taskDef 

379 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

380 assert self.dimensions.issubset(parent.dimensions) 

381 # Initialize _DatasetDicts as subsets of the one or two 

382 # corresponding dicts in the parent _PipelineScaffolding. 

383 self.initInputs = _DatasetDict.fromSubset( 

384 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

385 ) 

386 self.initOutputs = _DatasetDict.fromSubset( 

387 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

388 ) 

389 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

390 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

391 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

392 self.dataIds: Set[DataCoordinate] = set() 

393 self.quanta = {} 

394 

395 def __repr__(self) -> str: 

396 # Default dataclass-injected __repr__ gets caught in an infinite loop 

397 # because of back-references. 

398 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

399 

400 taskDef: TaskDef 

401 """Data structure that identifies the task class and its config 

402 (`TaskDef`). 

403 """ 

404 

405 dimensions: DimensionGraph 

406 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

407 """ 

408 

409 initInputs: _DatasetDict 

410 """Dictionary containing information about datasets used to construct this 

411 task (`_DatasetDict`). 

412 """ 

413 

414 initOutputs: _DatasetDict 

415 """Dictionary containing information about datasets produced as a 

416 side-effect of constructing this task (`_DatasetDict`). 

417 """ 

418 

419 inputs: _DatasetDict 

420 """Dictionary containing information about datasets used as regular, 

421 graph-constraining inputs to this task (`_DatasetDict`). 

422 """ 

423 

424 outputs: _DatasetDict 

425 """Dictionary containing information about datasets produced by this task 

426 (`_DatasetDict`). 

427 """ 

428 

429 prerequisites: _DatasetDict 

430 """Dictionary containing information about input datasets that must be 

431 present in the repository before any Pipeline containing this task is run 

432 (`_DatasetDict`). 

433 """ 

434 

435 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

436 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

437 this task with that data ID. 

438 """ 

439 

440 def makeQuantumSet( 

441 self, 

442 unresolvedRefs: Optional[Set[DatasetRef]] = None, 

443 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

444 ) -> Set[Quantum]: 

445 """Create a `set` of `Quantum` from the information in ``self``. 

446 

447 Parameters 

448 ---------- 

449 unresolvedRefs : `set` [ `DatasetRef` ], optional 

450 Input dataset refs that have not been found. 

451 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

452 If not `None`, mapping from datastore name to its exported record 

453 data; matching records are attached to each generated `Quantum`. 

454 Returns 

455 ------- 

456 nodes : `set` of `Quantum` 

457 The `Quantum` elements corresponding to this task. 

458 """ 

459 if unresolvedRefs is None: 

460 unresolvedRefs = set() 

461 outputs = set() 

462 for q in self.quanta.values(): 

463 try: 

464 tmpQuantum = q.makeQuantum(datastore_records) 

465 outputs.add(tmpQuantum) 

466 except (NoWorkFound, FileNotFoundError) as exc: 

467 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

468 if unresolvedRefs.intersection(refs): 

469 # This means it is a node that is known to be pruned 

470 # later and should be left in even though some follow-up 

471 # queries fail. This allows the pruning to start from this 

472 # quantum with known issues, and prune other nodes it 

473 # touches. 

474 inputs = q.inputs.unpackMultiRefs() 

475 inputs.update(q.prerequisites.unpackMultiRefs()) 

476 tmpQuantum = Quantum( 

477 taskName=q.task.taskDef.taskName, 

478 taskClass=q.task.taskDef.taskClass, 

479 dataId=q.dataId, 

480 initInputs=q.task.initInputs.unpackSingleRefs(), 

481 inputs=inputs, 

482 outputs=q.outputs.unpackMultiRefs(), 

483 ) 

484 outputs.add(tmpQuantum) 

485 else: 

486 raise exc 

487 return outputs 

488 

489 

490class _DatasetIdMaker: 

491 """Helper class which generates random dataset UUIDs for unresolved 

492 datasets. 

493 """ 

494 

495 def __init__(self, registry: Registry, run: str): 

496 self.datasetIdFactory = registry.datasetIdFactory 

497 self.run = run 

498 # Dataset IDs generated so far 

499 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

500 

501 def resolveRef(self, ref: DatasetRef) -> DatasetRef: 

502 if ref.id is not None: 

503 return ref 

504 key = ref.datasetType, ref.dataId 

505 if (resolved := self.resolved.get(key)) is None: 

506 datasetId = self.datasetIdFactory.makeDatasetId( 

507 self.run, ref.datasetType, ref.dataId, DatasetIdGenEnum.UNIQUE 

508 ) 

509 resolved = ref.resolved(datasetId, self.run) 

510 self.resolved[key] = resolved 

511 return resolved 

512 

513 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]: 

514 """Resolve all unresolved references in the provided dictionary.""" 

515 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()} 
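    # A minimal usage sketch (placeholder run name, for exposition only);
    # resolveDatasetRefs below uses this class in exactly this way to resolve
    # quantum input/output ref dictionaries in place:
    #
    #     idMaker = _DatasetIdMaker(registry, run="u/someone/run")
    #     refDict.update(idMaker.resolveDict(refDict))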

516 

517 

518@dataclass 

519class _PipelineScaffolding: 

520 """A helper data structure that organizes the information involved in 

521 constructing a `QuantumGraph` for a `Pipeline`. 

522 

523 Parameters 

524 ---------- 

525 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

526 Sequence of tasks from which a graph is to be constructed. Must 

527 have nested task classes already imported. 

528 universe : `DimensionUniverse` 

529 Universe of all possible dimensions. 

530 

531 Notes 

532 ----- 

533 The scaffolding data structure contains nested data structures for both 

534 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

535 data structures are shared between the pipeline-level structure (which 

536 aggregates all datasets and categorizes them from the perspective of the 

537 complete pipeline) and the individual tasks that use them as inputs and 

538 outputs. 

539 

540 `QuantumGraph` construction proceeds in four steps, with each corresponding 

541 to a different `_PipelineScaffolding` method: 

542 

543 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

544 the DatasetTypes used by the pipeline (delegating to 

545 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

546 nested `_TaskScaffolding` and `_DatasetDict` objects. 

547 

548 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

549 returns related tuples of all dimensions used to identify any regular 

550 input, output, and intermediate datasets (not prerequisites). We then 

551 iterate over these tuples of related dimensions, identifying the subsets 

552 that correspond to distinct data IDs for each task and dataset type, 

553 and then create `_QuantumScaffolding` objects. 

554 

555 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

556 dataset data IDs previously identified, transforming unresolved 

557 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

558 up prerequisite datasets for all quanta. 

559 

560 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

561 per-task `_QuantumScaffolding` objects. 

562 """ 

563 

564 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry): 

565 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

566 self.tasks = [] 

567 # Aggregate and categorize the DatasetTypes in the Pipeline. 

568 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

569 # Construct dictionaries that map those DatasetTypes to structures 

570 # that will (later) hold additional information about them. 

571 for attr in ( 

572 "initInputs", 

573 "initIntermediates", 

574 "initOutputs", 

575 "inputs", 

576 "intermediates", 

577 "outputs", 

578 "prerequisites", 

579 ): 

580 setattr( 

581 self, 

582 attr, 

583 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

584 ) 

585 # Aggregate all dimensions for all non-init, non-prerequisite 

586 # DatasetTypes. These are the ones we'll include in the big join 

587 # query. 

588 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

589 # Construct scaffolding nodes for each Task, and add backreferences 

590 # to the Task from each DatasetScaffolding node. 

591 # Note that there's only one scaffolding node for each DatasetType, 

592 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

593 # reference it. 

594 if isinstance(pipeline, Pipeline): 

595 pipeline = pipeline.toExpandedPipeline() 

596 self.tasks = [ 

597 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

598 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

599 ] 

600 

601 def __repr__(self) -> str: 

602 # Default dataclass-injected __repr__ gets caught in an infinite loop 

603 # because of back-references. 

604 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

605 

606 tasks: List[_TaskScaffolding] 

607 """Scaffolding data structures for each task in the pipeline 

608 (`list` of `_TaskScaffolding`). 

609 """ 

610 

611 initInputs: _DatasetDict 

612 """Datasets consumed but not produced when constructing the tasks in this 

613 pipeline (`_DatasetDict`). 

614 """ 

615 

616 initIntermediates: _DatasetDict 

617 """Datasets that are both consumed and produced when constructing the tasks 

618 in this pipeline (`_DatasetDict`). 

619 """ 

620 

621 initOutputs: _DatasetDict 

622 """Datasets produced but not consumed when constructing the tasks in this 

623 pipeline (`_DatasetDict`). 

624 """ 

625 

626 inputs: _DatasetDict 

627 """Datasets that are consumed but not produced when running this pipeline 

628 (`_DatasetDict`). 

629 """ 

630 

631 intermediates: _DatasetDict 

632 """Datasets that are both produced and consumed when running this pipeline 

633 (`_DatasetDict`). 

634 """ 

635 

636 outputs: _DatasetDict 

637 """Datasets produced but not consumed when when running this pipeline 

638 (`_DatasetDict`). 

639 """ 

640 

641 prerequisites: _DatasetDict 

642 """Datasets that are consumed when running this pipeline and looked up 

643 per-Quantum when generating the graph (`_DatasetDict`). 

644 """ 

645 

646 dimensions: DimensionGraph 

647 """All dimensions used by any regular input, intermediate, or output 

648 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

649 Query" (`DimensionGraph`). 

650 

651 This is required to be a superset of all task quantum dimensions. 

652 """ 

653 

654 @contextmanager 

655 def connectDataIds( 

656 self, 

657 registry: Registry, 

658 collections: Any, 

659 userQuery: Optional[str], 

660 externalDataId: DataCoordinate, 

661 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

662 ) -> Iterator[DataCoordinateQueryResults]: 

663 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

664 

665 This method populates `_TaskScaffolding.dataIds` and 

666 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

667 

668 Parameters 

669 ---------- 

670 registry : `lsst.daf.butler.Registry` 

671 Registry for the data repository; used for all data ID queries. 

672 collections 

673 Expressions representing the collections to search for input 

674 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

675 userQuery : `str` or `None` 

676 User-provided expression to limit the data IDs processed. 

677 externalDataId : `DataCoordinate` 

678 Externally-provided data ID that should be used to restrict the 

679 results, just as if these constraints had been included via ``AND`` 

680 in ``userQuery``. This includes (at least) any instrument named 

681 in the pipeline definition. 

682 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

683 The query constraint variant that should be used to constrain the 

684 query based on dataset existence, defaults to 

685 `DatasetQueryConstraintVariant.ALL`. 

686 

687 Returns 

688 ------- 

689 commonDataIds : \ 

690 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

691 An interface to a database temporary table containing all data IDs 

692 that will appear in this `QuantumGraph`. Returned inside a 

693 context manager, which will drop the temporary table at the end of 

694 the `with` block in which this method is called. 

695 """ 

696 _LOG.debug("Building query for data IDs.") 

697 # Initialization datasets always have empty data IDs. 

698 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

699 for datasetType, refs in itertools.chain( 

700 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

701 ): 

702 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

703 # Run one big query for the data IDs for task dimensions and regular 

704 # inputs and outputs. We limit the query to only dimensions that are 

705 # associated with the input dataset types, but don't (yet) try to 

706 # obtain the dataset_ids for those inputs. 

707 _LOG.debug("Submitting data ID query and materializing results.") 

708 queryArgs: Dict[str, Any] = { 

709 "dimensions": self.dimensions, 

710 "where": userQuery, 

711 "dataId": externalDataId, 

712 } 

713 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

714 _LOG.debug("Constraining graph query using all datasets in pipeline.") 

715 queryArgs["datasets"] = list(self.inputs) 

716 queryArgs["collections"] = collections 

717 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

718 _LOG.debug("Not using dataset existence to constrain query.") 

719 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

720 constraint = set(datasetQueryConstraint) 

721 inputs = {k.name: k for k in self.inputs.keys()} 

722 if remainder := constraint.difference(inputs.keys()): 

723 raise ValueError( 

724 f"{remainder} dataset type(s) specified as a graph constraint, but" 

725 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

726 ) 

727 _LOG.debug(f"Constraining graph query using {constraint}") 

728 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

729 queryArgs["collections"] = collections 

730 else: 

731 raise ValueError( 

732 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

733 ) 

734 

735 if "datasets" in queryArgs: 

736 for i, dataset_type in enumerate(queryArgs["datasets"]): 

737 if dataset_type.isComponent(): 

738 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

739 

740 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

741 _LOG.debug("Expanding data IDs.") 

742 commonDataIds = commonDataIds.expanded() 

743 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

744 # Iterate over query results, populating data IDs for datasets and 

745 # quanta and then connecting them to each other. 

746 n = -1 

747 for n, commonDataId in enumerate(commonDataIds): 

748 _LOG.debug("Next DataID = %s", commonDataId) 

749 # Create DatasetRefs for all DatasetTypes from this result row, 

750 # noting that we might have created some already. 

751 # We remember both those that already existed and those that we 

752 # create now. 

753 refsForRow = {} 

754 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

755 for datasetType, refs in itertools.chain( 

756 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

757 ): 

758 datasetDataId: Optional[DataCoordinate] 

759 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

760 datasetDataId = commonDataId.subset(datasetType.dimensions) 

761 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

762 ref = refs.get(datasetDataId) 

763 if ref is None: 

764 ref = DatasetRef(datasetType, datasetDataId) 

765 _LOG.debug("Made new ref = %s", ref) 

766 refs[datasetDataId] = ref 

767 refsForRow[datasetType.name] = ref 

768 # Create _QuantumScaffolding objects for all tasks from this 

769 # result row, noting that we might have created some already. 

770 for task in self.tasks: 

771 quantumDataId = commonDataId.subset(task.dimensions) 

772 quantum = task.quanta.get(quantumDataId) 

773 if quantum is None: 

774 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

775 task.quanta[quantumDataId] = quantum 

776 # Whether this is a new quantum or an existing one, we can 

777 # now associate the DatasetRefs for this row with it. The 

778 # fact that a Quantum data ID and a dataset data ID both 

779 # came from the same result row is what tells us they 

780 # should be associated. 

781 # Many of these associations will be duplicates (because 

782 # another query row that differed from this one only in 

783 # irrelevant dimensions already added them), and we use 

784 # sets to skip. 

785 for datasetType in task.inputs: 

786 ref = refsForRow[datasetType.name] 

787 quantum.inputs[datasetType.name][ref.dataId] = ref 

788 for datasetType in task.outputs: 

789 ref = refsForRow[datasetType.name] 

790 quantum.outputs[datasetType.name][ref.dataId] = ref 

791 if n < 0: 

792 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

793 emptiness_explained = False 

794 for message in commonDataIds.explain_no_results(): 

795 _LOG.critical(message) 

796 emptiness_explained = True 

797 if not emptiness_explained: 

798 _LOG.critical( 

799 "To reproduce this query for debugging purposes, run " 

800 "Registry.queryDataIds with these arguments:" 

801 ) 

802 # We could just repr() the queryArgs dict to get something 

803 # the user could make sense of, but it's friendlier to 

804 # put these args in an easier-to-construct equivalent form 

805 # so they can read it more easily and copy and paste into 

806 # a Python terminal. 

807 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

808 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

809 if queryArgs["where"]: 

810 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

811 if "datasets" in queryArgs: 

812 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

813 if "collections" in queryArgs: 

814 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

815 _LOG.debug("Finished processing %d rows from data ID query.", n) 

816 yield commonDataIds 

817 

818 def resolveDatasetRefs( 

819 self, 

820 registry: Registry, 

821 collections: Any, 

822 run: Optional[str], 

823 commonDataIds: DataCoordinateQueryResults, 

824 *, 

825 skipExistingIn: Any = None, 

826 clobberOutputs: bool = True, 

827 constrainedByAllDatasets: bool = True, 

828 resolveRefs: bool = False, 

829 ) -> None: 

830 """Perform follow up queries for each dataset data ID produced in 

831 `fillDataIds`. 

832 

833 This method populates `_DatasetScaffolding.refs` (except for those in 

834 `prerequisites`). 

835 

836 Parameters 

837 ---------- 

838 registry : `lsst.daf.butler.Registry` 

839 Registry for the data repository; used for all data ID queries. 

840 collections 

841 Expressions representing the collections to search for input 

842 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

843 run : `str`, optional 

844 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

845 output datasets, if it already exists. 

846 commonDataIds : \ 

847 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

848 Result of a previous call to `connectDataIds`. 

849 skipExistingIn 

850 Expressions representing the collections to search for existing 

851 output datasets that should be skipped. See 

852 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

853 `None` or empty string/sequence disables skipping. 

854 clobberOutputs : `bool`, optional 

855 If `True` (default), allow quanta to be created even if outputs exist; 

856 this requires the same behavior to be enabled when 

857 executing. If ``skipExistingIn`` is not `None`, completed quanta 

858 (those with metadata, or all outputs if there is no metadata 

859 dataset configured) will be skipped rather than clobbered. 

860 constrainedByAllDatasets : `bool`, optional 

861 Indicates if the commonDataIds were generated with a constraint on 

862 all dataset types. 

863 resolveRefs : `bool`, optional 

864 If `True` then resolve all input references and generate random 

865 dataset IDs for all output and intermediate datasets. A true value 

866 requires the ``run`` collection to be specified. 

867 

868 Raises 

869 ------ 

870 OutputExistsError 

871 Raised if an output dataset already exists in the output run 

872 and ``skipExistingIn`` does not include output run, or if only 

873 some outputs are present and ``clobberOutputs`` is `False`. 

874 """ 

875 skip_collections_wildcard: CollectionWildcard | None = None 

876 skipExistingInRun = False 

877 if skipExistingIn: 

878 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

879 if run: 

880 # As an optimization, check the explicit list of names first 

881 skipExistingInRun = run in skip_collections_wildcard.strings 

882 if not skipExistingInRun: 

883 # need to flatten it and check again 

884 skipExistingInRun = run in registry.queryCollections( 

885 skipExistingIn, 

886 collectionTypes=CollectionType.RUN, 

887 ) 

888 

889 idMaker: Optional[_DatasetIdMaker] = None 

890 if resolveRefs: 

891 assert run is not None, "run cannot be None when resolveRefs is True" 

892 idMaker = _DatasetIdMaker(registry, run) 

893 

894 resolvedRefQueryResults: Iterable[DatasetRef] 

895 

896 # Look up [init] intermediate and output datasets in the output 

897 # collection, if there is an output collection. 

898 if run is not None or skip_collections_wildcard is not None: 

899 for datasetType, refs in itertools.chain( 

900 self.initIntermediates.items(), 

901 self.initOutputs.items(), 

902 self.intermediates.items(), 

903 self.outputs.items(), 

904 ): 

905 _LOG.debug( 

906 "Resolving %d datasets for intermediate and/or output dataset %s.", 

907 len(refs), 

908 datasetType.name, 

909 ) 

910 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

911 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

912 assert not datasetType.isComponent(), "Output datasets cannot be components." 

913 

914 # look at RUN collection first 

915 if run is not None: 

916 try: 

917 resolvedRefQueryResults = subset.findDatasets( 

918 datasetType, collections=run, findFirst=True 

919 ) 

920 except MissingDatasetTypeError: 

921 resolvedRefQueryResults = [] 

922 for resolvedRef in resolvedRefQueryResults: 

923 # TODO: we could easily support per-DatasetType 

924 # skipExisting and I could imagine that being useful - 

925 # it's probably required in order to support writing 

926 # initOutputs before QuantumGraph generation. 

927 assert resolvedRef.dataId in refs 

928 if not (skipExistingInRun or isInit or clobberOutputs): 

929 raise OutputExistsError( 

930 f"Output dataset {datasetType.name} already exists in " 

931 f"output RUN collection '{run}' with data ID" 

932 f" {resolvedRef.dataId}." 

933 ) 

934 # If we are going to resolve all outputs then we have 

935 # to remember existing ones to avoid generating new 

936 # dataset IDs for them. 

937 if resolveRefs: 

938 refs[resolvedRef.dataId] = resolvedRef 

939 

940 # And check skipExistingIn too; if the RUN collection is 

941 # in it, that case was handled above. 

942 if skip_collections_wildcard is not None: 

943 try: 

944 resolvedRefQueryResults = subset.findDatasets( 

945 datasetType, collections=skip_collections_wildcard, findFirst=True 

946 ) 

947 except MissingDatasetTypeError: 

948 resolvedRefQueryResults = [] 

949 for resolvedRef in resolvedRefQueryResults: 

950 assert resolvedRef.dataId in refs 

951 refs[resolvedRef.dataId] = resolvedRef 

952 

953 # Look up input and initInput datasets in the input collection(s). 

954 # Container to accumulate unfound refs when the common data IDs were not 

955 # constrained on dataset type existence. 

956 self.unfoundRefs = set() 

957 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

958 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

959 if datasetType.isComponent(): 

960 parent_dataset_type = datasetType.makeCompositeDatasetType() 

961 component = datasetType.component() 

962 else: 

963 parent_dataset_type = datasetType 

964 component = None 

965 try: 

966 resolvedRefQueryResults = commonDataIds.subset( 

967 datasetType.dimensions, unique=True 

968 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

969 except MissingDatasetTypeError: 

970 resolvedRefQueryResults = [] 

971 dataIdsNotFoundYet = set(refs.keys()) 

972 for resolvedRef in resolvedRefQueryResults: 

973 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

974 refs[resolvedRef.dataId] = ( 

975 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

976 ) 

977 if dataIdsNotFoundYet: 

978 if constrainedByAllDatasets: 

979 raise RuntimeError( 

980 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

981 f"'{datasetType.name}' was/were present in a previous " 

982 f"query, but could not be found now." 

983 f"This is either a logic bug in QuantumGraph generation " 

984 f"or the input collections have been modified since " 

985 f"QuantumGraph generation began." 

986 ) 

987 else: 

988 # if the common dataIds were not constrained using all the 

989 # input dataset types, it is possible that some data IDs 

990 # found don't correspond to existing dataset types and they 

991 # will be unresolved. Mark these for later pruning from 

992 # the quantum graph. 

993 for k in dataIdsNotFoundYet: 

994 self.unfoundRefs.add(refs[k]) 

995 

996 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

997 # replacing the unresolved refs there, and then look up prerequisites. 

998 for task in self.tasks: 

999 _LOG.debug( 

1000 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1001 len(task.quanta), 

1002 task.taskDef.label, 

1003 ) 

1004 # The way iterConnections is designed makes it impossible to 

1005 # annotate precisely enough to satisfy MyPy here. 

1006 lookupFunctions = { 

1007 c.name: c.lookupFunction # type: ignore 

1008 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1009 if c.lookupFunction is not None # type: ignore 

1010 } 

1011 dataIdsFailed = [] 

1012 dataIdsSucceeded = [] 

1013 for quantum in task.quanta.values(): 

1014 # Process output datasets only if skipExistingIn is not None 

1015 # or there is a run to look for outputs in and clobberOutputs 

1016 # is True. Note that if skipExistingIn is None, any output 

1017 # datasets that already exist would have already caused an 

1018 # exception to be raised. We never update the DatasetRefs in 

1019 # the quantum because those should never be resolved. 

1020 if skip_collections_wildcard is not None or (run is not None and clobberOutputs): 

1021 resolvedRefs = [] 

1022 unresolvedRefs = [] 

1023 haveMetadata = False 

1024 for datasetType, originalRefs in quantum.outputs.items(): 

1025 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1026 if ref.id is not None: 

1027 resolvedRefs.append(ref) 

1028 if datasetType.name == task.taskDef.metadataDatasetName: 

1029 haveMetadata = True 

1030 else: 

1031 unresolvedRefs.append(ref) 

1032 if resolvedRefs: 

1033 if haveMetadata or not unresolvedRefs: 

1034 dataIdsSucceeded.append(quantum.dataId) 

1035 if skip_collections_wildcard is not None: 

1036 continue 

1037 else: 

1038 dataIdsFailed.append(quantum.dataId) 

1039 if not clobberOutputs: 

1040 raise OutputExistsError( 

1041 f"Quantum {quantum.dataId} of task with label " 

1042 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1043 f"({resolvedRefs}) " 

1044 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

1045 "and clobbering outputs was not enabled." 

1046 ) 

1047 # Update the input DatasetRefs to the resolved ones we already 

1048 # searched for. 

1049 for datasetType, input_refs in quantum.inputs.items(): 

1050 for ref in task.inputs.extract(datasetType, input_refs.keys()): 

1051 input_refs[ref.dataId] = ref 

1052 # Look up prerequisite datasets in the input collection(s). 

1053 # These may have dimensions that extend beyond those we queried 

1054 # for originally, because we want to permit those data ID 

1055 # values to differ across quanta and dataset types. 

1056 for datasetType in task.prerequisites: 

1057 if datasetType.isComponent(): 

1058 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1059 component = datasetType.component() 

1060 else: 

1061 parent_dataset_type = datasetType 

1062 component = None 

1063 lookupFunction = lookupFunctions.get(datasetType.name) 

1064 if lookupFunction is not None: 

1065 # PipelineTask has provided its own function to do the 

1066 # lookup. This always takes precedence. 

1067 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1068 elif ( 

1069 datasetType.isCalibration() 

1070 and datasetType.dimensions <= quantum.dataId.graph 

1071 and quantum.dataId.graph.temporal 

1072 ): 

1073 # This is a master calibration lookup, which we have to 

1074 # handle specially because the query system can't do a 

1075 # temporal join on a non-dimension-based timespan yet. 

1076 timespan = quantum.dataId.timespan 

1077 try: 

1078 prereq_ref = registry.findDataset( 

1079 parent_dataset_type, 

1080 quantum.dataId, 

1081 collections=collections, 

1082 timespan=timespan, 

1083 ) 

1084 if prereq_ref is not None: 

1085 if component is not None: 

1086 prereq_ref = prereq_ref.makeComponentRef(component) 

1087 prereq_refs = [prereq_ref] 

1088 else: 

1089 prereq_refs = [] 

1090 except (KeyError, MissingDatasetTypeError): 

1091 # This dataset type is not present in the registry, 

1092 # which just means there are no datasets here. 

1093 prereq_refs = [] 

1094 else: 

1095 # Most general case. 

1096 prereq_refs = [ 

1097 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1098 for prereq_ref in registry.queryDatasets( 

1099 parent_dataset_type, 

1100 collections=collections, 

1101 dataId=quantum.dataId, 

1102 findFirst=True, 

1103 ).expanded() 

1104 ] 

1105 quantum.prerequisites[datasetType].update( 

1106 {ref.dataId: ref for ref in prereq_refs if ref is not None} 

1107 ) 

1108 

1109 # Resolve all quantum inputs and outputs. 

1110 if idMaker: 

1111 for datasetDict in (quantum.inputs, quantum.outputs): 

1112 for refDict in datasetDict.values(): 

1113 refDict.update(idMaker.resolveDict(refDict)) 

1114 

1115 # Resolve task initInputs and initOutputs. 

1116 if idMaker: 

1117 for datasetDict in (task.initInputs, task.initOutputs): 

1118 for refDict in datasetDict.values(): 

1119 refDict.update(idMaker.resolveDict(refDict)) 

1120 

1121 # Actually remove any quanta that we decided to skip above. 

1122 if dataIdsSucceeded: 

1123 if skip_collections_wildcard is not None: 

1124 _LOG.debug( 

1125 "Pruning successful %d quanta for task with label '%s' because all of their " 

1126 "outputs exist or metadata was written successfully.", 

1127 len(dataIdsSucceeded), 

1128 task.taskDef.label, 

1129 ) 

1130 for dataId in dataIdsSucceeded: 

1131 del task.quanta[dataId] 

1132 elif clobberOutputs: 

1133 _LOG.info( 

1134 "Found %d successful quanta for task with label '%s' " 

1135 "that will need to be clobbered during execution.", 

1136 len(dataIdsSucceeded), 

1137 task.taskDef.label, 

1138 ) 

1139 else: 

1140 raise AssertionError("OutputExistsError should have already been raised.") 

1141 if dataIdsFailed: 

1142 if clobberOutputs: 

1143 _LOG.info( 

1144 "Found %d failed/incomplete quanta for task with label '%s' " 

1145 "that will need to be clobbered during execution.", 

1146 len(dataIdsFailed), 

1147 task.taskDef.label, 

1148 ) 

1149 else: 

1150 raise AssertionError("OutputExistsError should have already been raised.") 

1151 

1152 def makeQuantumGraph( 

1153 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None 

1154 ) -> QuantumGraph: 

1155 """Create a `QuantumGraph` from the quanta already present in 

1156 the scaffolding data structure. 

1157 

1158 Parameters 

1159 ---------- 

1160 metadata : `Mapping` [ `str`, `Any` ], optional 

1161 This is an optional parameter of extra data to carry with the 

1162 graph. Entries in this mapping should be able to be serialized in 

1163 JSON. 

1164 datastore : `Datastore`, optional 

1165 If not `None` then fill datastore records in each generated 

1166 Quantum. 

1167 

1168 Returns 

1169 ------- 

1170 graph : `QuantumGraph` 

1171 The full `QuantumGraph`. 

1172 """ 

1173 

1174 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1175 """Extract all DatasetRefs from the dictionaries""" 

1176 for ref_dict in dataset_dict.values(): 

1177 yield from ref_dict.values() 

1178 

1179 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1180 if datastore is not None: 

1181 datastore_records = datastore.export_records( 

1182 itertools.chain( 

1183 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites) 

1184 ) 

1185 ) 

1186 

1187 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

1188 for task in self.tasks: 

1189 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records) 

1190 graphInput[task.taskDef] = qset 

1191 

1192 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks} 

1193 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks} 

1194 

1195 graph = QuantumGraph( 

1196 graphInput, 

1197 metadata=metadata, 

1198 pruneRefs=self.unfoundRefs, 

1199 universe=self.dimensions.universe, 

1200 initInputs=taskInitInputs, 

1201 initOutputs=taskInitOutputs, 

1202 ) 

1203 return graph 

1204 

1205 

1206# ------------------------ 

1207# Exported definitions -- 

1208# ------------------------ 

1209 

1210 

1211class GraphBuilderError(Exception): 

1212 """Base class for exceptions generated by graph builder.""" 

1213 

1214 pass 

1215 

1216 

1217class OutputExistsError(GraphBuilderError): 

1218 """Exception generated when output datasets already exist.""" 

1219 

1220 pass 

1221 

1222 

1223class PrerequisiteMissingError(GraphBuilderError): 

1224 """Exception generated when a prerequisite dataset does not exist.""" 

1225 

1226 pass 

1227 

1228 

1229class GraphBuilder: 

1230 """GraphBuilder class is responsible for building task execution graph from 

1231 a Pipeline. 

1232 

1233 Parameters 

1234 ---------- 

1235 registry : `~lsst.daf.butler.Registry` 

1236 Registry for the data repository. 

1237 skipExistingIn 

1238 Expressions representing the collections to search for existing 

1239 output datasets that should be skipped. See 

1240 :ref:`daf_butler_ordered_collection_searches`. 

1241 clobberOutputs : `bool`, optional 

1242 If `True` (default), allow quanta to be created even if partial outputs 

1243 exist; this requires the same behavior to be enabled when 

1244 executing. 

1245 datastore : `Datastore`, optional 

1246 If not `None` then fill datastore records in each generated Quantum. 
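
    Examples
    --------
    A minimal sketch, assuming ``registry`` is an existing
    `~lsst.daf.butler.Registry` and ``pipeline`` is a `Pipeline`; the
    collection and run names below are placeholders::

        builder = GraphBuilder(registry)
        qgraph = builder.makeGraph(
            pipeline,
            collections=["some/input/collection"],
            run="u/someone/run",
            userQuery="",
        )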

1247 """ 

1248 

1249 def __init__( 

1250 self, 

1251 registry: Registry, 

1252 skipExistingIn: Any = None, 

1253 clobberOutputs: bool = True, 

1254 datastore: Optional[Datastore] = None, 

1255 ): 

1256 self.registry = registry 

1257 self.dimensions = registry.dimensions 

1258 self.skipExistingIn = skipExistingIn 

1259 self.clobberOutputs = clobberOutputs 

1260 self.datastore = datastore 

1261 

1262 def makeGraph( 

1263 self, 

1264 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1265 collections: Any, 

1266 run: Optional[str], 

1267 userQuery: Optional[str], 

1268 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1269 metadata: Optional[Mapping[str, Any]] = None, 

1270 resolveRefs: bool = False, 

1271 ) -> QuantumGraph: 

1272 """Create execution graph for a pipeline. 

1273 

1274 Parameters 

1275 ---------- 

1276 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1277 Pipeline definition, task names/classes and their configs. 

1278 collections 

1279 Expressions representing the collections to search for input 

1280 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1281 run : `str`, optional 

1282 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1283 output datasets, if it already exists. 

1284 userQuery : `str` or `None` 

1285 User-provided expression that restricts the data selection; should be 

1286 empty or `None` if there are no restrictions. 

1287 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1288 The query constraint variant that should be used to constrain the 

1289 query based on dataset existence, defaults to 

1290 `DatasetQueryConstraintVariant.ALL`. 

1291 metadata : `Mapping` [ `str`, `Any` ], optional 

1292 This is an optional parameter of extra data to carry with the 

1293 graph. Entries in this mapping should be able to be serialized in 

1294 JSON. 

1295 resolveRefs : `bool`, optional 

1296 If `True` then resolve all input references and generate random 

1297 dataset IDs for all output and intermediate datasets. A true value 

1298 requires the ``run`` collection to be specified. 

1299 

1300 Returns 

1301 ------- 

1302 graph : `QuantumGraph` 

1303 

1304 Raises 

1305 ------ 

1306 UserExpressionError 

1307 Raised when the user expression cannot be parsed. 

1308 OutputExistsError 

1309 Raised when output datasets already exist. 

1310 Exception 

1311 Other exceptions types may be raised by underlying registry 

1312 classes. 

1313 """ 

1314 if resolveRefs and run is None: 

1315 raise ValueError("`resolveRefs` requires `run` parameter.") 

1316 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1317 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1318 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1319 instrument_class: Optional[Any] = None 

1320 if isinstance(pipeline, Pipeline): 

1321 instrument_class_name = pipeline.getInstrument() 

1322 if instrument_class_name is not None: 

1323 instrument_class = doImportType(instrument_class_name) 

1324 pipeline = list(pipeline.toExpandedPipeline()) 

1325 if instrument_class is not None: 

1326 dataId = DataCoordinate.standardize( 

1327 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1328 ) 

1329 else: 

1330 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1331 with scaffolding.connectDataIds( 

1332 self.registry, collections, userQuery, dataId, datasetQueryConstraint 

1333 ) as commonDataIds: 

1334 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1335 scaffolding.resolveDatasetRefs( 

1336 self.registry, 

1337 collections, 

1338 run, 

1339 commonDataIds, 

1340 skipExistingIn=self.skipExistingIn, 

1341 clobberOutputs=self.clobberOutputs, 

1342 constrainedByAllDatasets=condition, 

1343 resolveRefs=resolveRefs, 

1344 ) 

1345 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)