Coverage for python/lsst/pipe/base/graphBuilder.py: 18%

424 statements  

coverage.py v6.4.4, created at 2022-09-11 01:21 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap 

34from contextlib import contextmanager 

35from dataclasses import dataclass 

36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union 

37 

38from lsst.daf.butler import ( 

39 CollectionSearch, 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetIdGenEnum, 

43 DatasetRef, 

44 DatasetType, 

45 Datastore, 

46 DatastoreRecordData, 

47 DimensionGraph, 

48 DimensionUniverse, 

49 NamedKeyDict, 

50 Quantum, 

51 Registry, 

52) 

53from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

54from lsst.utils import doImportType 

55 

56from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

57from ._status import NoWorkFound 

58 

59# ----------------------------- 

60# Imports for other modules -- 

61# ----------------------------- 

62from .connections import AdjustQuantumHelper, iterConnections 

63from .graph import QuantumGraph 

64from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

65 

66# ---------------------------------- 

67# Local non-exported definitions -- 

68# ---------------------------------- 

69 

70_LOG = logging.getLogger(__name__) 

71 

72 

73class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

74 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

75 the known `DatasetRef` instances of that type. 

76 

77 Parameters 

78 ---------- 

79 args 

80 Positional arguments are forwarded to the `dict` constructor. 

81 universe : `DimensionUniverse` 

82 Universe of all possible dimensions. 

83 """ 

84 

85 def __init__(self, *args: Any, universe: DimensionUniverse): 

86 super().__init__(*args) 

87 self.universe = universe 

88 

89 @classmethod 

90 def fromDatasetTypes( 

91 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

92 ) -> _DatasetDict: 

93 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

94 

95 Parameters 

96 ---------- 

97 datasetTypes : `iterable` of `DatasetType` 

98 DatasetTypes to use as keys for the dict. Values will be empty 

99 dictionaries. 

100 universe : `DimensionUniverse` 

101 Universe of all possible dimensions. 

102 

103 Returns 

104 ------- 

105 dictionary : `_DatasetDict` 

106 A new `_DatasetDict` instance. 

107 """ 

108 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

109 

110 @classmethod 

111 def fromSubset( 

112 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

113 ) -> _DatasetDict: 

114 """Return a new dictionary by extracting items corresponding to the 

115 given keys from one or more existing dictionaries. 

116 

117 Parameters 

118 ---------- 

119 datasetTypes : `iterable` of `DatasetType` 

120 DatasetTypes to use as keys for the dict. Values will be obtained 

121 by lookups against ``first`` and ``rest``. 

122 first : `_DatasetDict` 

123 The first dictionary from which to extract values.

124 rest 

125 Additional dictionaries from which to extract values. 

126 

127 Returns 

128 ------- 

129 dictionary : `_DatasetDict` 

130 A new dictionary instance. 

131 """ 

132 combined = ChainMap(first, *rest) 

133 

134 # Dataset types known to match immediately can be processed 

135 # without checks. 

136 matches = combined.keys() & set(datasetTypes) 

137 _dict = {k: combined[k] for k in matches} 

138 

139 if len(_dict) < len(datasetTypes): 

140 # Work out which ones are missing. 

141 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

142 

143 # Get the known names for comparison. 

144 combined_by_name = {k.name: k for k in combined} 

145 

146 missing = set() 

147 incompatible = {} 

148 for datasetType in missing_datasetTypes: 

149 # The dataset type is not found. It may not be listed 

150 # or it may be that it is there with the same name 

151 # but different definition. 

152 if datasetType.name in combined_by_name: 

153 # This implies some inconsistency in definitions 

154 # for connections. If there is support for storage 

155 # class conversion we can let it slide. 

156 # At this point we do not know 

157 # where the inconsistency is but trust that

158 # downstream code will be more explicit about input

159 # vs output incompatibilities. 

160 existing = combined_by_name[datasetType.name] 

161 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing): 

162 _LOG.warning( 

163 "Dataset type mismatch (%s != %s) but continuing since they are compatible", 

164 datasetType, 

165 existing, 

166 ) 

167 _dict[datasetType] = combined[existing] 

168 else: 

169 incompatible[datasetType] = existing 

170 else: 

171 missing.add(datasetType) 

172 

173 if missing or incompatible: 

174 reasons = [] 

175 if missing: 

176 reasons.append( 

177 "DatasetTypes {'.'.join(missing)} not present in list of known types: " 

178 + ", ".join(d.name for d in combined) 

179 ) 

180 if incompatible: 

181 for x, y in incompatible.items(): 

182 reasons.append(f"{x} incompatible with {y}") 

183 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

184 

185 return cls(_dict, universe=first.universe) 

186 

187 @property 

188 def dimensions(self) -> DimensionGraph: 

189 """The union of all dimensions used by all dataset types in this 

190 dictionary, including implied dependencies (`DimensionGraph`). 

191 """ 

192 base = self.universe.empty 

193 if len(self) == 0: 

194 return base 

195 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

196 

197 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

198 """Unpack nested single-element `DatasetRef` dicts into a new 

199 mapping with `DatasetType` keys and `DatasetRef` values. 

200 

201 This method assumes that each nested dictionary contains exactly one item, as is the

202 case for all "init" datasets. 

203 

204 Returns 

205 ------- 

206 dictionary : `NamedKeyDict` 

207 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

208 `DatasetType` instances and string names usable as keys. 

209 """ 

210 

211 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

212 (ref,) = refs.values() 

213 return ref 

214 

215 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

216 

217 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

218 """Unpack nested multi-element `DatasetRef` dicts into a new 

219 mapping with `DatasetType` keys and `list` of `DatasetRef` values.

220 

221 Returns 

222 ------- 

223 dictionary : `NamedKeyDict` 

224 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

225 both `DatasetType` instances and string names usable as keys. 

226 """ 

227 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

228 

229 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

230 """Iterate over the contained `DatasetRef` instances that match the 

231 given `DatasetType` and data IDs. 

232 

233 Parameters 

234 ---------- 

235 datasetType : `DatasetType` 

236 Dataset type to match. 

237 dataIds : `Iterable` [ `DataCoordinate` ] 

238 Data IDs to match. 

239 

240 Returns 

241 ------- 

242 refs : `Iterator` [ `DatasetRef` ] 

243 DatasetRef instances for which ``ref.datasetType == datasetType`` 

244 and ``ref.dataId`` is in ``dataIds``. 

245 """ 

246 refs = self[datasetType] 

247 return (refs[dataId] for dataId in dataIds) 

248 

249 
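A minimal usage sketch of the helpers above (illustrative only: ``universe`` is assumed to be a `DimensionUniverse`, ``dt_a``/``dt_b`` existing `DatasetType` instances, and ``data_id`` a matching `DataCoordinate`):

    # Build an empty mapping keyed by dataset type, then populate one nested dict.
    held = _DatasetDict.fromDatasetTypes([dt_a, dt_b], universe=universe)
    held[dt_a][data_id] = DatasetRef(dt_a, data_id)

    # Extract a per-task view that shares the same nested dicts, then
    # flatten it to DatasetType -> list[DatasetRef].
    subset = _DatasetDict.fromSubset([dt_a], held)
    refs_by_type = subset.unpackMultiRefs()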

250class _QuantumScaffolding: 

251 """Helper class aggregating information about a `Quantum`, used when 

252 constructing a `QuantumGraph`. 

253 

254 See `_PipelineScaffolding` for a top-down description of the full 

255 scaffolding data structure. 

256 

257 Parameters 

258 ---------- 

259 task : _TaskScaffolding 

260 Back-reference to the helper object for the `PipelineTask` this quantum 

261 represents an execution of. 

262 dataId : `DataCoordinate` 

263 Data ID for this quantum. 

264 """ 

265 

266 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

267 self.task = task 

268 self.dataId = dataId 

269 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

270 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

271 self.prerequisites = _DatasetDict.fromDatasetTypes( 

272 task.prerequisites.keys(), universe=dataId.universe 

273 ) 

274 

275 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

276 

277 def __repr__(self) -> str: 

278 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

279 

280 task: _TaskScaffolding 

281 """Back-reference to the helper object for the `PipelineTask` this quantum 

282 represents an execution of. 

283 """ 

284 

285 dataId: DataCoordinate 

286 """Data ID for this quantum. 

287 """ 

288 

289 inputs: _DatasetDict 

290 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

291 

292 This is initialized to map each `DatasetType` to an empty dictionary at 

293 construction. Those nested dictionaries are populated (with data IDs as 

294 keys) with unresolved `DatasetRef` instances in 

295 `_PipelineScaffolding.connectDataIds`. 

296 """ 

297 

298 outputs: _DatasetDict 

299 """Nested dictionary containing `DatasetRef` outputs this quantum. 

300 """ 

301 

302 prerequisites: _DatasetDict 

303 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

304 quantum. 

305 """ 

306 

307 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

308 """Transform the scaffolding object into a true `Quantum` instance. 

309 

310 Parameters 

311 ---------- 

312 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

313 If not `None` then fill datastore records in each generated Quantum 

314 using the records from this structure. 

315 

316 Returns 

317 ------- 

318 quantum : `Quantum` 

319 An actual `Quantum` instance. 

320 """ 

321 allInputs = self.inputs.unpackMultiRefs() 

322 allInputs.update(self.prerequisites.unpackMultiRefs()) 

323 # Give the task's Connections class an opportunity to remove some 

324 # inputs, or complain if they are unacceptable. 

325 # This will raise if one of the check conditions is not met, which is 

326 # the intended behavior. 

327 # If it raises NoWorkFound, there is a bug in the QG algorithm

328 # or the adjustQuantum is incorrectly trying to make a prerequisite 

329 # input behave like a regular input; adjustQuantum should only raise 

330 # NoWorkFound if a regular input is missing, and it shouldn't be 

331 # possible for us to have generated ``self`` if that's true. 

332 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

333 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

334 initInputs = self.task.initInputs.unpackSingleRefs() 

335 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

336 if datastore_records is not None: 

337 quantum_records = {} 

338 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

339 input_refs += list(initInputs.values()) 

340 input_ids = set(ref.id for ref in input_refs if ref.id is not None) 

341 for datastore_name, records in datastore_records.items(): 

342 matching_records = records.subset(input_ids) 

343 if matching_records is not None: 

344 quantum_records[datastore_name] = matching_records 

345 return Quantum( 

346 taskName=self.task.taskDef.taskName, 

347 taskClass=self.task.taskDef.taskClass, 

348 dataId=self.dataId, 

349 initInputs=initInputs, 

350 inputs=helper.inputs, 

351 outputs=helper.outputs, 

352 datastore_records=quantum_records, 

353 ) 

354 

355 
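A rough sketch of how a populated scaffolding becomes a `Quantum` (the nested dicts are normally filled by `_PipelineScaffolding.connectDataIds` and `resolveDatasetRefs`; the names below are illustrative):

    # Look up the per-quantum scaffolding for one data ID of one task.
    q_scaffolding = task_scaffolding.quanta[quantum_data_id]
    # makeQuantum applies the task's adjustQuantum hook, so the resulting
    # Quantum may carry a trimmed set of inputs.
    quantum = q_scaffolding.makeQuantum(datastore_records=None)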

356@dataclass 

357class _TaskScaffolding: 

358 """Helper class aggregating information about a `PipelineTask`, used when 

359 constructing a `QuantumGraph`. 

360 

361 See `_PipelineScaffolding` for a top-down description of the full 

362 scaffolding data structure. 

363 

364 Parameters 

365 ---------- 

366 taskDef : `TaskDef` 

367 Data structure that identifies the task class and its config. 

368 parent : `_PipelineScaffolding` 

369 The parent data structure that will hold the instance being 

370 constructed. 

371 datasetTypes : `TaskDatasetTypes` 

372 Data structure that categorizes the dataset types used by this task. 

373 """ 

374 

375 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

376 universe = parent.dimensions.universe 

377 self.taskDef = taskDef 

378 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

379 assert self.dimensions.issubset(parent.dimensions) 

380 # Initialize _DatasetDicts as subsets of the one or two 

381 # corresponding dicts in the parent _PipelineScaffolding. 

382 self.initInputs = _DatasetDict.fromSubset( 

383 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

384 ) 

385 self.initOutputs = _DatasetDict.fromSubset( 

386 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

387 ) 

388 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

389 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

390 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

391 self.dataIds: Set[DataCoordinate] = set() 

392 self.quanta = {} 

393 

394 def __repr__(self) -> str: 

395 # Default dataclass-injected __repr__ gets caught in an infinite loop 

396 # because of back-references. 

397 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

398 

399 taskDef: TaskDef 

400 """Data structure that identifies the task class and its config 

401 (`TaskDef`). 

402 """ 

403 

404 dimensions: DimensionGraph 

405 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

406 """ 

407 

408 initInputs: _DatasetDict 

409 """Dictionary containing information about datasets used to construct this 

410 task (`_DatasetDict`). 

411 """ 

412 

413 initOutputs: _DatasetDict 

414 """Dictionary containing information about datasets produced as a 

415 side-effect of constructing this task (`_DatasetDict`). 

416 """ 

417 

418 inputs: _DatasetDict 

419 """Dictionary containing information about datasets used as regular, 

420 graph-constraining inputs to this task (`_DatasetDict`). 

421 """ 

422 

423 outputs: _DatasetDict 

424 """Dictionary containing information about datasets produced by this task 

425 (`_DatasetDict`). 

426 """ 

427 

428 prerequisites: _DatasetDict 

429 """Dictionary containing information about input datasets that must be 

430 present in the repository before any Pipeline containing this task is run 

431 (`_DatasetDict`). 

432 """ 

433 

434 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

435 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

436 this task with that data ID. 

437 """ 

438 

439 def makeQuantumSet( 

440 self, 

441 unresolvedRefs: Optional[Set[DatasetRef]] = None, 

442 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

443 ) -> Set[Quantum]: 

444 """Create a `set` of `Quantum` from the information in ``self``. 

445 

446 Parameters 

447 ---------- 

448 unresolvedRefs : `set` [ `DatasetRef` ], optional 

449 Input dataset refs that have not been found. 

450 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional

451 If not `None` then fill datastore records in each generated

452 Quantum using the records from this structure.

453 Returns 

454 ------- 

455 nodes : `set` of `Quantum` 

456 The `Quantum` elements corresponding to this task. 

457 """ 

458 if unresolvedRefs is None: 

459 unresolvedRefs = set() 

460 outputs = set() 

461 for q in self.quanta.values(): 

462 try: 

463 tmpQuanta = q.makeQuantum(datastore_records) 

464 outputs.add(tmpQuanta) 

465 except (NoWorkFound, FileNotFoundError) as exc: 

466 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

467 if unresolvedRefs.intersection(refs): 

468 # This means it is a node that is known to be pruned

469 # later and should be left in even though some follow-up

470 # queries fail. This allows the pruning to start from this

471 # quantum with known issues, and prune other nodes it

472 # touches.

473 inputs = q.inputs.unpackMultiRefs() 

474 inputs.update(q.prerequisites.unpackMultiRefs()) 

475 tmpQuantum = Quantum( 

476 taskName=q.task.taskDef.taskName, 

477 taskClass=q.task.taskDef.taskClass, 

478 dataId=q.dataId, 

479 initInputs=q.task.initInputs.unpackSingleRefs(), 

480 inputs=inputs, 

481 outputs=q.outputs.unpackMultiRefs(), 

482 ) 

483 outputs.add(tmpQuantum) 

484 else: 

485 raise exc 

486 return outputs 

487 

488 

489class _DatasetIdMaker: 

490 """Helper class which generates random dataset UUIDs for unresolved 

491 datasets. 

492 """ 

493 

494 def __init__(self, registry: Registry, run: str): 

495 self.datasetIdFactory = registry.datasetIdFactory 

496 self.run = run 

497 # Dataset IDs generated so far 

498 self.resolved: Dict[Tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

499 

500 def resolveRef(self, ref: DatasetRef) -> DatasetRef: 

501 if ref.id is not None: 

502 return ref 

503 key = ref.datasetType, ref.dataId 

504 if (resolved := self.resolved.get(key)) is None: 

505 datasetId = self.datasetIdFactory.makeDatasetId( 

506 self.run, ref.datasetType, ref.dataId, DatasetIdGenEnum.UNIQUE 

507 ) 

508 resolved = ref.resolved(datasetId, self.run) 

509 self.resolved[key] = resolved 

510 return resolved 

511 

512 def resolveDict(self, refs: Dict[DataCoordinate, DatasetRef]) -> Dict[DataCoordinate, DatasetRef]: 

513 """Resolve all unresolved references in the provided dictionary.""" 

514 return {dataId: self.resolveRef(ref) for dataId, ref in refs.items()} 

515 
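A short sketch of how the maker is used during ref resolution (mirroring `_PipelineScaffolding.resolveDatasetRefs` below; ``registry`` and ``refs_for_type`` are assumed to exist, and the run name is a placeholder):

    # One maker per output RUN collection; repeated (type, dataId) pairs
    # reuse the same generated ID.
    id_maker = _DatasetIdMaker(registry, run="u/example/run")
    # refs_for_type maps DataCoordinate -> DatasetRef for one dataset type.
    refs_for_type.update(id_maker.resolveDict(refs_for_type))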

516 

517@dataclass 

518class _PipelineScaffolding: 

519 """A helper data structure that organizes the information involved in 

520 constructing a `QuantumGraph` for a `Pipeline`. 

521 

522 Parameters 

523 ---------- 

524 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

525 Sequence of tasks from which a graph is to be constructed. Must 

526 have nested task classes already imported. 

527 universe : `DimensionUniverse` 

528 Universe of all possible dimensions. 

529 

530 Notes 

531 ----- 

532 The scaffolding data structure contains nested data structures for both 

533 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

534 data structures are shared between the pipeline-level structure (which 

535 aggregates all datasets and categorizes them from the perspective of the 

536 complete pipeline) and the individual tasks that use them as inputs and 

537 outputs. 

538 

539 `QuantumGraph` construction proceeds in four steps, with each corresponding 

540 to a different `_PipelineScaffolding` method: 

541 

542 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

543 the DatasetTypes used by the pipeline (delegating to 

544 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

545 nested `_TaskScaffolding` and `_DatasetDict` objects. 

546 

547 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

548 returns related tuples of all dimensions used to identify any regular 

549 input, output, and intermediate datasets (not prerequisites). We then 

550 iterate over these tuples of related dimensions, identifying the subsets 

551 that correspond to distinct data IDs for each task and dataset type, 

552 and then create `_QuantumScaffolding` objects. 

553 

554 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

555 dataset data IDs previously identified, transforming unresolved 

556 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

557 up prerequisite datasets for all quanta. 

558 

559 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

560 per-task `_QuantumScaffolding` objects. 

561 """ 

562 
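Condensed into code, the four steps above look roughly like the driver below (this mirrors `GraphBuilder.makeGraph` near the end of this module; ``registry``, ``collections``, ``run``, ``user_query`` and ``external_data_id`` are assumed to exist):

    scaffolding = _PipelineScaffolding(pipeline, registry=registry)   # step 1
    with scaffolding.connectDataIds(                                  # step 2
        registry, collections, user_query, external_data_id
    ) as common_data_ids:
        scaffolding.resolveDatasetRefs(                               # step 3
            registry, collections, run, common_data_ids
        )
        qgraph = scaffolding.makeQuantumGraph()                       # step 4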

563 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry): 

564 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

565 self.tasks = [] 

566 # Aggregate and categorize the DatasetTypes in the Pipeline. 

567 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

568 # Construct dictionaries that map those DatasetTypes to structures 

569 # that will (later) hold additional information about them.

570 for attr in ( 

571 "initInputs", 

572 "initIntermediates", 

573 "initOutputs", 

574 "inputs", 

575 "intermediates", 

576 "outputs", 

577 "prerequisites", 

578 ): 

579 setattr( 

580 self, 

581 attr, 

582 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

583 ) 

584 # Aggregate all dimensions for all non-init, non-prerequisite 

585 # DatasetTypes. These are the ones we'll include in the big join 

586 # query. 

587 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

588 # Construct scaffolding nodes for each Task, and add backreferences 

589 # to the Task from each DatasetScaffolding node. 

590 # Note that there's only one scaffolding node for each DatasetType, 

591 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

592 # reference it. 

593 if isinstance(pipeline, Pipeline): 

594 pipeline = pipeline.toExpandedPipeline() 

595 self.tasks = [ 

596 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

597 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

598 ] 

599 

600 def __repr__(self) -> str: 

601 # Default dataclass-injected __repr__ gets caught in an infinite loop 

602 # because of back-references. 

603 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

604 

605 tasks: List[_TaskScaffolding] 

606 """Scaffolding data structures for each task in the pipeline 

607 (`list` of `_TaskScaffolding`). 

608 """ 

609 

610 initInputs: _DatasetDict 

611 """Datasets consumed but not produced when constructing the tasks in this 

612 pipeline (`_DatasetDict`). 

613 """ 

614 

615 initIntermediates: _DatasetDict 

616 """Datasets that are both consumed and produced when constructing the tasks 

617 in this pipeline (`_DatasetDict`). 

618 """ 

619 

620 initOutputs: _DatasetDict 

621 """Datasets produced but not consumed when constructing the tasks in this 

622 pipeline (`_DatasetDict`). 

623 """ 

624 

625 inputs: _DatasetDict 

626 """Datasets that are consumed but not produced when running this pipeline 

627 (`_DatasetDict`). 

628 """ 

629 

630 intermediates: _DatasetDict 

631 """Datasets that are both produced and consumed when running this pipeline 

632 (`_DatasetDict`). 

633 """ 

634 

635 outputs: _DatasetDict 

636 """Datasets produced but not consumed when when running this pipeline 

637 (`_DatasetDict`). 

638 """ 

639 

640 prerequisites: _DatasetDict 

641 """Datasets that are consumed when running this pipeline and looked up 

642 per-Quantum when generating the graph (`_DatasetDict`). 

643 """ 

644 

645 dimensions: DimensionGraph 

646 """All dimensions used by any regular input, intermediate, or output 

647 (not prerequisite) dataset; the set of dimensions used in the "Big Join

648 Query" (`DimensionGraph`). 

649 

650 This is required to be a superset of all task quantum dimensions. 

651 """ 

652 

653 @contextmanager 

654 def connectDataIds( 

655 self, 

656 registry: Registry, 

657 collections: Any, 

658 userQuery: Optional[str], 

659 externalDataId: DataCoordinate, 

660 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

661 ) -> Iterator[DataCoordinateQueryResults]: 

662 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

663 

664 This method populates `_TaskScaffolding.dataIds` and 

665 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

666 

667 Parameters 

668 ---------- 

669 registry : `lsst.daf.butler.Registry` 

670 Registry for the data repository; used for all data ID queries. 

671 collections 

672 Expressions representing the collections to search for input 

673 datasets. May be any of the types accepted by 

674 `lsst.daf.butler.CollectionSearch.fromExpression`. 

675 userQuery : `str` or `None` 

676 User-provided expression to limit the data IDs processed. 

677 externalDataId : `DataCoordinate` 

678 Externally-provided data ID that should be used to restrict the 

679 results, just as if these constraints had been included via ``AND`` 

680 in ``userQuery``. This includes (at least) any instrument named 

681 in the pipeline definition. 

682 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

683 The query constraint variant that should be used to constrain the

684 query based on dataset existence, defaults to

685 `DatasetQueryConstraintVariant.ALL`. 

686 

687 Returns 

688 ------- 

689 commonDataIds : \ 

690 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

691 An interface to a database temporary table containing all data IDs 

692 that will appear in this `QuantumGraph`. Returned inside a 

693 context manager, which will drop the temporary table at the end of 

694 the `with` block in which this method is called. 

695 """ 

696 _LOG.debug("Building query for data IDs.") 

697 # Initialization datasets always have empty data IDs. 

698 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

699 for datasetType, refs in itertools.chain( 

700 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

701 ): 

702 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

703 # Run one big query for the data IDs for task dimensions and regular 

704 # inputs and outputs. We limit the query to only dimensions that are 

705 # associated with the input dataset types, but don't (yet) try to 

706 # obtain the dataset_ids for those inputs. 

707 _LOG.debug("Submitting data ID query and materializing results.") 

708 queryArgs: Dict[str, Any] = { 

709 "dimensions": self.dimensions, 

710 "where": userQuery, 

711 "dataId": externalDataId, 

712 } 

713 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

714 _LOG.debug("Constraining graph query using all datasets in pipeline.") 

715 queryArgs["datasets"] = list(self.inputs) 

716 queryArgs["collections"] = collections 

717 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

718 _LOG.debug("Not using dataset existence to constrain query.") 

719 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

720 constraint = set(datasetQueryConstraint) 

721 inputs = {k.name: k for k in self.inputs.keys()} 

722 if remainder := constraint.difference(inputs.keys()): 

723 raise ValueError( 

724 f"{remainder} dataset type(s) specified as a graph constraint, but" 

725 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

726 ) 

727 _LOG.debug(f"Constraining graph query using {constraint}") 

728 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

729 queryArgs["collections"] = collections 

730 else: 

731 raise ValueError( 

732 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

733 ) 

734 

735 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

736 _LOG.debug("Expanding data IDs.") 

737 commonDataIds = commonDataIds.expanded() 

738 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

739 # Iterate over query results, populating data IDs for datasets and 

740 # quanta and then connecting them to each other. 

741 n = -1 

742 for n, commonDataId in enumerate(commonDataIds): 

743 _LOG.debug("Next DataID = %s", commonDataId) 

744 # Create DatasetRefs for all DatasetTypes from this result row, 

745 # noting that we might have created some already. 

746 # We remember both those that already existed and those that we 

747 # create now. 

748 refsForRow = {} 

749 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

750 for datasetType, refs in itertools.chain( 

751 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

752 ): 

753 datasetDataId: Optional[DataCoordinate] 

754 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

755 datasetDataId = commonDataId.subset(datasetType.dimensions) 

756 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

757 ref = refs.get(datasetDataId) 

758 if ref is None: 

759 ref = DatasetRef(datasetType, datasetDataId) 

760 _LOG.debug("Made new ref = %s", ref) 

761 refs[datasetDataId] = ref 

762 refsForRow[datasetType.name] = ref 

763 # Create _QuantumScaffolding objects for all tasks from this 

764 # result row, noting that we might have created some already. 

765 for task in self.tasks: 

766 quantumDataId = commonDataId.subset(task.dimensions) 

767 quantum = task.quanta.get(quantumDataId) 

768 if quantum is None: 

769 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

770 task.quanta[quantumDataId] = quantum 

771 # Whether this is a new quantum or an existing one, we can 

772 # now associate the DatasetRefs for this row with it. The 

773 # fact that a Quantum data ID and a dataset data ID both 

774 # came from the same result row is what tells us they 

775 # should be associated. 

776 # Many of these associations will be duplicates (because

777 # another query row that differed from this one only in 

778 # irrelevant dimensions already added them), and we use 

779 # sets to skip. 

780 for datasetType in task.inputs: 

781 ref = refsForRow[datasetType.name] 

782 quantum.inputs[datasetType.name][ref.dataId] = ref 

783 for datasetType in task.outputs: 

784 ref = refsForRow[datasetType.name] 

785 quantum.outputs[datasetType.name][ref.dataId] = ref 

786 if n < 0: 

787 emptiness_explained = False 

788 for message in commonDataIds.explain_no_results(): 

789 _LOG.warning(message) 

790 emptiness_explained = True 

791 if not emptiness_explained: 

792 _LOG.warning( 

793 "To reproduce this query for debugging purposes, run " 

794 "Registry.queryDataIds with these arguments:" 

795 ) 

796 # We could just repr() the queryArgs dict to get something 

797 # the user could make sense of, but it's friendlier to 

798 # put these args in an easier-to-construct equivalent form 

799 # so they can read it more easily and copy and paste into 

800 # a Python terminal. 

801 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

802 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName()) 

803 if queryArgs["where"]: 

804 _LOG.warning(" where=%s,", repr(queryArgs["where"])) 

805 if "datasets" in queryArgs: 

806 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

807 if "collections" in queryArgs: 

808 _LOG.warning(" collections=%s,", list(queryArgs["collections"])) 

809 _LOG.debug("Finished processing %d rows from data ID query.", n + 1)

810 yield commonDataIds 

811 
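For reference, the reproduction hint logged above corresponds to a `Registry.queryDataIds` call of roughly the following shape (all values are placeholders to be filled in from the logged warnings):

    registry.queryDataIds(
        dimensions=["instrument", "visit", "detector"],  # from the logged dimensions=
        dataId={"instrument": "EXAMPLE"},                # from the logged dataId=
        where="...",                                     # from the logged where=, if any
        datasets=["..."],                                # from the logged datasets=, if any
        collections=["..."],                             # from the logged collections=, if any
    )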

812 def resolveDatasetRefs( 

813 self, 

814 registry: Registry, 

815 collections: Any, 

816 run: Optional[str], 

817 commonDataIds: DataCoordinateQueryResults, 

818 *, 

819 skipExistingIn: Any = None, 

820 clobberOutputs: bool = True, 

821 constrainedByAllDatasets: bool = True, 

822 resolveRefs: bool = False, 

823 ) -> None: 

824 """Perform follow up queries for each dataset data ID produced in 

825 `fillDataIds`. 

826 

827 This method populates `_DatasetScaffolding.refs` (except for those in 

828 `prerequisites`). 

829 

830 Parameters 

831 ---------- 

832 registry : `lsst.daf.butler.Registry` 

833 Registry for the data repository; used for all data ID queries. 

834 collections 

835 Expressions representing the collections to search for input 

836 datasets. May be any of the types accepted by 

837 `lsst.daf.butler.CollectionSearch.fromExpression`. 

838 run : `str`, optional 

839 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

840 output datasets, if it already exists. 

841 commonDataIds : \ 

842 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

843 Result of a previous call to `connectDataIds`. 

844 skipExistingIn 

845 Expressions representing the collections to search for existing 

846 output datasets that should be skipped. May be any of the types 

847 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

848 `None` or empty string/sequence disables skipping. 

849 clobberOutputs : `bool`, optional 

850 If `True` (default), allow quanta to be created even if outputs exist;

851 this requires the same behavior to be enabled when

852 executing. If ``skipExistingIn`` is not `None`, completed quanta 

853 (those with metadata, or all outputs if there is no metadata 

854 dataset configured) will be skipped rather than clobbered. 

855 constrainedByAllDatasets : `bool`, optional 

856 Indicates if the commonDataIds were generated with a constraint on 

857 all dataset types. 

858 resolveRefs : `bool`, optional 

859 If `True` then resolve all input references and generate random 

860 dataset IDs for all output and intermediate datasets. A true value

861 requires the ``run`` collection to be specified.

862 

863 Raises 

864 ------ 

865 OutputExistsError 

866 Raised if an output dataset already exists in the output run 

867 and ``skipExistingIn`` does not include output run, or if only 

868 some outputs are present and ``clobberOutputs`` is `False`. 

869 """ 

870 skipCollections: Optional[CollectionSearch] = None 

871 skipExistingInRun = False 

872 if skipExistingIn: 

873 skipCollections = CollectionSearch.fromExpression(skipExistingIn) 

874 if run: 

875 # As an optimization, check the explicit list of names first.

876 skipExistingInRun = run in skipCollections.explicitNames() 

877 if not skipExistingInRun: 

878 # need to flatten it and check again 

879 skipExistingInRun = run in registry.queryCollections( 

880 skipExistingIn, 

881 collectionTypes=CollectionType.RUN, 

882 ) 

883 

884 idMaker: Optional[_DatasetIdMaker] = None 

885 if resolveRefs: 

886 assert run is not None, "run cannot be None when resolveRefs is True" 

887 idMaker = _DatasetIdMaker(registry, run) 

888 

889 # Look up [init] intermediate and output datasets in the output 

890 # collection, if there is an output collection. 

891 if run is not None or skipCollections is not None: 

892 for datasetType, refs in itertools.chain( 

893 self.initIntermediates.items(), 

894 self.initOutputs.items(), 

895 self.intermediates.items(), 

896 self.outputs.items(), 

897 ): 

898 _LOG.debug( 

899 "Resolving %d datasets for intermediate and/or output dataset %s.", 

900 len(refs), 

901 datasetType.name, 

902 ) 

903 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

904 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

905 

906 # look at RUN collection first 

907 if run is not None: 

908 resolvedRefQueryResults = subset.findDatasets( 

909 datasetType, collections=run, findFirst=True 

910 ) 

911 for resolvedRef in resolvedRefQueryResults: 

912 # TODO: we could easily support per-DatasetType 

913 # skipExisting and I could imagine that being useful - 

914 # it's probably required in order to support writing 

915 # initOutputs before QuantumGraph generation. 

916 assert resolvedRef.dataId in refs 

917 if not (skipExistingInRun or isInit or clobberOutputs): 

918 raise OutputExistsError( 

919 f"Output dataset {datasetType.name} already exists in " 

920 f"output RUN collection '{run}' with data ID" 

921 f" {resolvedRef.dataId}." 

922 ) 

923 # If we are going to resolve all outputs then we have 

924 # to remember existing ones to avoid generating new 

925 # dataset IDs for them. 

926 if resolveRefs: 

927 refs[resolvedRef.dataId] = resolvedRef 

928 

929 # And check skipExistingIn too; if the RUN collection is

930 # in it, that case is handled above.

931 if skipCollections is not None: 

932 resolvedRefQueryResults = subset.findDatasets( 

933 datasetType, collections=skipCollections, findFirst=True 

934 ) 

935 for resolvedRef in resolvedRefQueryResults: 

936 assert resolvedRef.dataId in refs 

937 refs[resolvedRef.dataId] = resolvedRef 

938 

939 # Look up input and initInput datasets in the input collection(s). 

940 # Container to accumulate unfound refs, if the common data IDs were not

941 # constrained on dataset type existence. 

942 self.unfoundRefs = set() 

943 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

944 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

945 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets( 

946 datasetType, collections=collections, findFirst=True 

947 ) 

948 dataIdsNotFoundYet = set(refs.keys()) 

949 for resolvedRef in resolvedRefQueryResults: 

950 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

951 refs[resolvedRef.dataId] = resolvedRef 

952 if dataIdsNotFoundYet: 

953 if constrainedByAllDatasets: 

954 raise RuntimeError( 

955 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

956 f"'{datasetType.name}' was/were present in a previous " 

957 f"query, but could not be found now." 

958 f"This is either a logic bug in QuantumGraph generation " 

959 f"or the input collections have been modified since " 

960 f"QuantumGraph generation began." 

961 ) 

962 else: 

963 # If the common data IDs were not constrained using all the

964 # input dataset types, it is possible that some data IDs

965 # found don't correspond to existing datasets and they

966 # will be unresolved. Mark these for later pruning from

967 # the quantum graph.

968 for k in dataIdsNotFoundYet: 

969 self.unfoundRefs.add(refs[k]) 

970 

971 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

972 # replacing the unresolved refs there, and then look up prerequisites. 

973 for task in self.tasks: 

974 _LOG.debug( 

975 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

976 len(task.quanta), 

977 task.taskDef.label, 

978 ) 

979 # The way iterConnections is designed makes it impossible to 

980 # annotate precisely enough to satisfy MyPy here. 

981 lookupFunctions = { 

982 c.name: c.lookupFunction # type: ignore 

983 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

984 if c.lookupFunction is not None # type: ignore 

985 } 

986 dataIdsFailed = [] 

987 dataIdsSucceeded = [] 

988 for quantum in task.quanta.values(): 

989 # Process output datasets only if skipExistingIn is not None

990 # or there is a run to look for outputs in and clobberOutputs 

991 # is True. Note that if skipExistingIn is None, any output 

992 # datasets that already exist would have already caused an 

993 # exception to be raised. We never update the DatasetRefs in 

994 # the quantum because those should never be resolved. 

995 if skipCollections is not None or (run is not None and clobberOutputs): 

996 resolvedRefs = [] 

997 unresolvedRefs = [] 

998 haveMetadata = False 

999 for datasetType, originalRefs in quantum.outputs.items(): 

1000 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1001 if ref.id is not None: 

1002 resolvedRefs.append(ref) 

1003 if datasetType.name == task.taskDef.metadataDatasetName: 

1004 haveMetadata = True 

1005 else: 

1006 unresolvedRefs.append(ref) 

1007 if resolvedRefs: 

1008 if haveMetadata or not unresolvedRefs: 

1009 dataIdsSucceeded.append(quantum.dataId) 

1010 if skipCollections is not None: 

1011 continue 

1012 else: 

1013 dataIdsFailed.append(quantum.dataId) 

1014 if not clobberOutputs: 

1015 raise OutputExistsError( 

1016 f"Quantum {quantum.dataId} of task with label " 

1017 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1018 f"({resolvedRefs}) " 

1019 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

1020 "and clobbering outputs was not enabled." 

1021 ) 

1022 # Update the input DatasetRefs to the resolved ones we already 

1023 # searched for. 

1024 for datasetType, input_refs in quantum.inputs.items(): 

1025 for ref in task.inputs.extract(datasetType, input_refs.keys()): 

1026 input_refs[ref.dataId] = ref 

1027 # Look up prerequisite datasets in the input collection(s). 

1028 # These may have dimensions that extend beyond those we queried 

1029 # for originally, because we want to permit those data ID 

1030 # values to differ across quanta and dataset types. 

1031 for datasetType in task.prerequisites: 

1032 lookupFunction = lookupFunctions.get(datasetType.name) 

1033 if lookupFunction is not None: 

1034 # PipelineTask has provided its own function to do the 

1035 # lookup. This always takes precedence. 

1036 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1037 elif ( 

1038 datasetType.isCalibration() 

1039 and datasetType.dimensions <= quantum.dataId.graph 

1040 and quantum.dataId.graph.temporal 

1041 ): 

1042 # This is a master calibration lookup, which we have to 

1043 # handle specially because the query system can't do a 

1044 # temporal join on a non-dimension-based timespan yet. 

1045 timespan = quantum.dataId.timespan 

1046 try: 

1047 prereq_refs = [ 

1048 registry.findDataset( 

1049 datasetType, quantum.dataId, collections=collections, timespan=timespan 

1050 ) 

1051 ] 

1052 except KeyError: 

1053 # This dataset type is not present in the registry, 

1054 # which just means there are no datasets here. 

1055 prereq_refs = [] 

1056 else: 

1057 # Most general case. 

1058 prereq_refs = list( 

1059 registry.queryDatasets( 

1060 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True 

1061 ).expanded() 

1062 ) 

1063 quantum.prerequisites[datasetType].update( 

1064 {ref.dataId: ref for ref in prereq_refs if ref is not None} 

1065 ) 

1066 

1067 # Resolve all quantum inputs and outputs. 

1068 if idMaker: 

1069 for datasetDict in (quantum.inputs, quantum.outputs): 

1070 for refDict in datasetDict.values(): 

1071 refDict.update(idMaker.resolveDict(refDict)) 

1072 

1073 # Resolve task initInputs and initOutputs. 

1074 if idMaker: 

1075 for datasetDict in (task.initInputs, task.initOutputs): 

1076 for refDict in datasetDict.values(): 

1077 refDict.update(idMaker.resolveDict(refDict)) 

1078 

1079 # Actually remove any quanta that we decided to skip above. 

1080 if dataIdsSucceeded: 

1081 if skipCollections is not None: 

1082 _LOG.debug( 

1083 "Pruning successful %d quanta for task with label '%s' because all of their " 

1084 "outputs exist or metadata was written successfully.", 

1085 len(dataIdsSucceeded), 

1086 task.taskDef.label, 

1087 ) 

1088 for dataId in dataIdsSucceeded: 

1089 del task.quanta[dataId] 

1090 elif clobberOutputs: 

1091 _LOG.info( 

1092 "Found %d successful quanta for task with label '%s' " 

1093 "that will need to be clobbered during execution.", 

1094 len(dataIdsSucceeded), 

1095 task.taskDef.label, 

1096 ) 

1097 else: 

1098 raise AssertionError("OutputExistsError should have already been raised.") 

1099 if dataIdsFailed: 

1100 if clobberOutputs: 

1101 _LOG.info( 

1102 "Found %d failed/incomplete quanta for task with label '%s' " 

1103 "that will need to be clobbered during execution.", 

1104 len(dataIdsFailed), 

1105 task.taskDef.label, 

1106 ) 

1107 else: 

1108 raise AssertionError("OutputExistsError should have already been raised.") 

1109 

1110 def makeQuantumGraph( 

1111 self, metadata: Optional[Mapping[str, Any]] = None, datastore: Optional[Datastore] = None 

1112 ) -> QuantumGraph: 

1113 """Create a `QuantumGraph` from the quanta already present in 

1114 the scaffolding data structure. 

1115 

1116 Parameters 

1117 ----------

1118 metadata : Optional Mapping of `str` to primitives 

1119 This is an optional parameter of extra data to carry with the 

1120 graph. Entries in this mapping should be able to be serialized in 

1121 JSON. 

1122 datastore : `Datastore`, optional 

1123 If not `None` then fill datastore records in each generated 

1124 Quantum. 

1125 

1126 Returns 

1127 ------- 

1128 graph : `QuantumGraph` 

1129 The full `QuantumGraph`. 

1130 """ 

1131 

1132 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1133 """Extract all DatasetRefs from the dictionaries""" 

1134 for ref_dict in dataset_dict.values(): 

1135 yield from ref_dict.values() 

1136 

1137 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1138 if datastore is not None: 

1139 datastore_records = datastore.export_records( 

1140 itertools.chain( 

1141 _make_refs(self.inputs), _make_refs(self.initInputs), _make_refs(self.prerequisites) 

1142 ) 

1143 ) 

1144 

1145 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

1146 for task in self.tasks: 

1147 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs, datastore_records=datastore_records) 

1148 graphInput[task.taskDef] = qset 

1149 

1150 taskInitInputs = {task.taskDef: task.initInputs.unpackSingleRefs().values() for task in self.tasks} 

1151 taskInitOutputs = {task.taskDef: task.initOutputs.unpackSingleRefs().values() for task in self.tasks} 

1152 

1153 graph = QuantumGraph( 

1154 graphInput, 

1155 metadata=metadata, 

1156 pruneRefs=self.unfoundRefs, 

1157 universe=self.dimensions.universe, 

1158 initInputs=taskInitInputs, 

1159 initOutputs=taskInitOutputs, 

1160 ) 

1161 return graph 

1162 

1163 

1164# ------------------------ 

1165# Exported definitions -- 

1166# ------------------------ 

1167 

1168 

1169class GraphBuilderError(Exception): 

1170 """Base class for exceptions generated by graph builder.""" 

1171 

1172 pass 

1173 

1174 

1175class OutputExistsError(GraphBuilderError): 

1176 """Exception generated when output datasets already exist.""" 

1177 

1178 pass 

1179 

1180 

1181class PrerequisiteMissingError(GraphBuilderError): 

1182 """Exception generated when a prerequisite dataset does not exist.""" 

1183 

1184 pass 

1185 

1186 

1187class GraphBuilder: 

1188 """GraphBuilder class is responsible for building task execution graph from 

1189 a Pipeline. 

1190 

1191 Parameters 

1192 ---------- 

1193 registry : `~lsst.daf.butler.Registry` 

1194 Registry for the data repository.

1195 skipExistingIn 

1196 Expressions representing the collections to search for existing 

1197 output datasets that should be skipped. May be any of the types 

1198 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

1199 clobberOutputs : `bool`, optional 

1200 If `True` (default), allow quanta to be created even if partial outputs

1201 exist; this requires the same behavior to be enabled when

1202 executing. 

1203 datastore : `Datastore`, optional 

1204 If not `None` then fill datastore records in each generated Quantum. 

1205 """ 

1206 

1207 def __init__( 

1208 self, 

1209 registry: Registry, 

1210 skipExistingIn: Any = None, 

1211 clobberOutputs: bool = True, 

1212 datastore: Optional[Datastore] = None, 

1213 ): 

1214 self.registry = registry 

1215 self.dimensions = registry.dimensions 

1216 self.skipExistingIn = skipExistingIn 

1217 self.clobberOutputs = clobberOutputs 

1218 self.datastore = datastore 

1219 

1220 def makeGraph( 

1221 self, 

1222 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1223 collections: Any, 

1224 run: Optional[str], 

1225 userQuery: Optional[str], 

1226 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1227 metadata: Optional[Mapping[str, Any]] = None, 

1228 resolveRefs: bool = False, 

1229 ) -> QuantumGraph: 

1230 """Create execution graph for a pipeline. 

1231 

1232 Parameters 

1233 ---------- 

1234 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1235 Pipeline definition, task names/classes and their configs. 

1236 collections 

1237 Expressions representing the collections to search for input 

1238 datasets. May be any of the types accepted by 

1239 `lsst.daf.butler.CollectionSearch.fromExpression`. 

1240 run : `str`, optional 

1241 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1242 output datasets, if it already exists. 

1243 userQuery : `str` 

1244 String which defines user-defined selection for registry; should be

1245 empty or `None` if there are no restrictions on data selection.

1246 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1247 The query constraint variant that should be used to constrain the

1248 query based on dataset existence, defaults to

1249 `DatasetQueryConstraintVariant.ALL`. 

1250 metadata : Optional Mapping of `str` to primitives 

1251 This is an optional parameter of extra data to carry with the 

1252 graph. Entries in this mapping should be able to be serialized in 

1253 JSON. 

1254 resolveRefs : `bool`, optional 

1255 If `True` then resolve all input references and generate random 

1256 dataset IDs for all output and intermediate datasets. A true value

1257 requires the ``run`` collection to be specified.

1258 

1259 Returns 

1260 ------- 

1261 graph : `QuantumGraph` 

1262 

1263 Raises 

1264 ------ 

1265 UserExpressionError 

1266 Raised when user expression cannot be parsed. 

1267 OutputExistsError 

1268 Raised when output datasets already exist. 

1269 Exception 

1270 Other exception types may be raised by underlying registry

1271 classes. 

1272 """ 

1273 if resolveRefs and run is None: 

1274 raise ValueError("`resolveRefs` requires `run` parameter.") 

1275 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1276 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1277 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1278 instrument_class: Optional[Any] = None 

1279 if isinstance(pipeline, Pipeline): 

1280 instrument_class_name = pipeline.getInstrument() 

1281 if instrument_class_name is not None: 

1282 instrument_class = doImportType(instrument_class_name) 

1283 pipeline = list(pipeline.toExpandedPipeline()) 

1284 if instrument_class is not None: 

1285 dataId = DataCoordinate.standardize( 

1286 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1287 ) 

1288 else: 

1289 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1290 with scaffolding.connectDataIds( 

1291 self.registry, collections, userQuery, dataId, datasetQueryConstraint 

1292 ) as commonDataIds: 

1293 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1294 scaffolding.resolveDatasetRefs( 

1295 self.registry, 

1296 collections, 

1297 run, 

1298 commonDataIds, 

1299 skipExistingIn=self.skipExistingIn, 

1300 clobberOutputs=self.clobberOutputs, 

1301 constrainedByAllDatasets=condition, 

1302 resolveRefs=resolveRefs, 

1303 ) 

1304 return scaffolding.makeQuantumGraph(metadata=metadata, datastore=self.datastore)
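
A minimal end-to-end sketch of using this class (illustrative only: ``butler`` is assumed to be an existing `lsst.daf.butler.Butler`, ``pipeline`` a `Pipeline` whose task classes can be imported, and the collection, run and query strings are placeholders):

    builder = GraphBuilder(butler.registry)
    qgraph = builder.makeGraph(
        pipeline,
        collections=["EXAMPLE/defaultCollection"],
        run="u/example/outputRun",
        userQuery="instrument = 'EXAMPLE' AND visit = 12345",
    )
    # qgraph is a QuantumGraph ready to be serialized or executed.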