Coverage for python/lsst/pipe/base/graphBuilder.py: 17%

346 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap 

34from contextlib import contextmanager 

35from dataclasses import dataclass 

36from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set 

37 

38from lsst.daf.butler import ( 

39 CollectionSearch, 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetRef, 

43 DatasetType, 

44 DimensionGraph, 

45 DimensionUniverse, 

46 NamedKeyDict, 

47 Quantum, 

48) 

49from lsst.utils import doImport 

50 

51from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

52from ._status import NoWorkFound 

53 

54# ----------------------------- 

55# Imports for other modules -- 

56# ----------------------------- 

57from .connections import AdjustQuantumHelper, iterConnections 

58from .graph import QuantumGraph 

59from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

60 

61# ---------------------------------- 

62# Local non-exported definitions -- 

63# ---------------------------------- 

64 

65_LOG = logging.getLogger(__name__) 

66 

67 

68class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

69 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

70 the known `DatasetRef` instances of that type. 

71 

72 Parameters 

73 ---------- 

74 args 

75 Positional arguments are forwarded to the `dict` constructor. 

76 universe : `DimensionUniverse` 

77 Universe of all possible dimensions. 

78 """ 

79 

80 def __init__(self, *args, universe: DimensionUniverse): 

81 super().__init__(*args) 

82 self.universe = universe 

83 

84 @classmethod 

85 def fromDatasetTypes( 

86 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

87 ) -> _DatasetDict: 

88 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

89 

90 Parameters 

91 ---------- 

92 datasetTypes : `iterable` of `DatasetType` 

93 DatasetTypes to use as keys for the dict. Values will be empty 

94 dictionaries. 

95 universe : `DimensionUniverse` 

96 Universe of all possible dimensions. 

97 

98 Returns 

99 ------- 

100 dictionary : `_DatasetDict` 

101 A new `_DatasetDict` instance. 

102 """ 

103 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 
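
# Example (editorial sketch, not part of the module): building a _DatasetDict
# and its nested shape.  ``registry`` (a Butler Registry) and ``someTypes``
# (an iterable of DatasetType) are illustrative assumptions.
#
#     dd = _DatasetDict.fromDatasetTypes(someTypes, universe=registry.dimensions)
#     # Each key is a DatasetType; each value starts as an empty dict that is
#     # later filled with {DataCoordinate: DatasetRef} entries.
#     for datasetType, refsByDataId in dd.items():
#         assert refsByDataId == {}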

104 

105 @classmethod 

106 def fromSubset( 

107 cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

108 ) -> _DatasetDict: 

109 """Return a new dictionary by extracting items corresponding to the 

110 given keys from one or more existing dictionaries. 

111 

112 Parameters 

113 ---------- 

114 datasetTypes : `iterable` of `DatasetType` 

115 DatasetTypes to use as keys for the dict. Values will be obtained 

116 by lookups against ``first`` and ``rest``. 

117 first : `_DatasetDict` 

118 Another dictionary from which to extract values. 

119 rest 

120 Additional dictionaries from which to extract values. 

121 

122 Returns 

123 ------- 

124 dictionary : `_DatasetDict` 

125 A new dictionary instance. 

126 """ 

127 combined = ChainMap(first, *rest) 

128 

129 # Dataset types known to match immediately can be processed 

130 # without checks. 

131 matches = combined.keys() & set(datasetTypes) 

132 _dict = {k: combined[k] for k in matches} 

133 

134 if len(_dict) < len(datasetTypes): 

135 # Work out which ones are missing. 

136 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

137 

138 # Get the known names for comparison. 

139 combined_by_name = {k.name: k for k in combined} 

140 

141 missing = set() 

142 incompatible = {} 

143 for datasetType in missing_datasetTypes: 

144 # The dataset type is not found. It may not be listed 

145 # or it may be that it is there with the same name 

146 # but different definition. 

147 if datasetType.name in combined_by_name: 

148 # This implies some inconsistency in definitions 

149 # for connections. If there is support for storage 

150 # class conversion we can let it slide. 

151 # At this point we do not know 

152 # where the inconsistency is but trust that down 

153 # stream code will be more explicit about input 

154 # vs output incompatibilities. 

155 existing = combined_by_name[datasetType.name] 

156 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing): 

157 _LOG.warning( 

158 "Dataset type mismatch (%s != %s) but continuing since they are compatible", 

159 datasetType, 

160 existing, 

161 ) 

162 _dict[datasetType] = combined[existing] 

163 else: 

164 incompatible[datasetType] = existing 

165 else: 

166 missing.add(datasetType) 

167 

168 if missing or incompatible: 

169 reasons = [] 

170 if missing: 

171 reasons.append( 

172 "DatasetTypes {'.'.join(missing)} not present in list of known types: " 

173 + ", ".join(d.name for d in combined) 

174 ) 

175 if incompatible: 

176 for x, y in incompatible.items(): 

177 reasons.append(f"{x} incompatible with {y}") 

178 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

179 

180 return cls(_dict, universe=first.universe) 
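
# Example (editorial sketch): extracting a per-task view from pipeline-level
# dictionaries, mirroring how _TaskScaffolding.__init__ uses this method.
# ``taskInputTypes``, ``parentInputs`` and ``parentIntermediates`` are assumed
# to be an iterable of DatasetType and two populated _DatasetDict instances.
#
#     taskInputs = _DatasetDict.fromSubset(taskInputTypes, parentInputs, parentIntermediates)
#     # Raises KeyError if a requested DatasetType is neither present nor
#     # compatible (via storage-class conversion) with a known definition.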

181 

182 @property 

183 def dimensions(self) -> DimensionGraph: 

184 """The union of all dimensions used by all dataset types in this 

185 dictionary, including implied dependencies (`DimensionGraph`). 

186 """ 

187 base = self.universe.empty 

188 if len(self) == 0: 

189 return base 

190 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

191 

192 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

193 """Unpack nested single-element `DatasetRef` dicts into a new 

194 mapping with `DatasetType` keys and `DatasetRef` values. 

195 

196 This method assumes that each nested dictionary contains exactly one item, as is the 

197 case for all "init" datasets. 

198 

199 Returns 

200 ------- 

201 dictionary : `NamedKeyDict` 

202 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

203 `DatasetType` instances and string names usable as keys. 

204 """ 

205 

206 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

207 (ref,) = refs.values() 

208 return ref 

209 

210 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

211 

212 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

213 """Unpack nested multi-element `DatasetRef` dicts into a new 

214 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

215 

216 Returns 

217 ------- 

218 dictionary : `NamedKeyDict` 

219 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

220 both `DatasetType` instances and string names usable as keys. 

221 """ 

222 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 
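
# Example (editorial sketch): the two unpack helpers flatten the nested
# per-data-ID dictionaries.  ``initInputs`` and ``inputs`` are assumed to be
# populated _DatasetDict instances.
#
#     single = initInputs.unpackSingleRefs()   # DatasetType -> DatasetRef
#     multi = inputs.unpackMultiRefs()         # DatasetType -> list of DatasetRef
#     # Both results are NamedKeyDicts, so string dataset type names work as
#     # keys too, e.g. multi["calexp"] (the name here is illustrative).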

223 

224 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

225 """Iterate over the contained `DatasetRef` instances that match the 

226 given `DatasetType` and data IDs. 

227 

228 Parameters 

229 ---------- 

230 datasetType : `DatasetType` 

231 Dataset type to match. 

232 dataIds : `Iterable` [ `DataCoordinate` ] 

233 Data IDs to match. 

234 

235 Returns 

236 ------- 

237 refs : `Iterator` [ `DatasetRef` ] 

238 DatasetRef instances for which ``ref.datasetType == datasetType`` 

239 and ``ref.dataId`` is in ``dataIds``. 

240 """ 

241 refs = self[datasetType] 

242 return (refs[dataId] for dataId in dataIds) 
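
# Example (editorial sketch): pulling out refs for one dataset type and a few
# data IDs, much as resolveDatasetRefs does when copying resolved refs into
# each quantum.  ``outputs`` is a populated _DatasetDict; ``someType`` and
# ``someDataIds`` are illustrative.
#
#     for ref in outputs.extract(someType, someDataIds):
#         print(ref.datasetType.name, ref.dataId)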

243 

244 

245class _QuantumScaffolding: 

246 """Helper class aggregating information about a `Quantum`, used when 

247 constructing a `QuantumGraph`. 

248 

249 See `_PipelineScaffolding` for a top-down description of the full 

250 scaffolding data structure. 

251 

252 Parameters 

253 ---------- 

254 task : _TaskScaffolding 

255 Back-reference to the helper object for the `PipelineTask` this quantum 

256 represents an execution of. 

257 dataId : `DataCoordinate` 

258 Data ID for this quantum. 

259 """ 

260 

261 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

262 self.task = task 

263 self.dataId = dataId 

264 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

265 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

266 self.prerequisites = _DatasetDict.fromDatasetTypes( 

267 task.prerequisites.keys(), universe=dataId.universe 

268 ) 

269 

270 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

271 

272 def __repr__(self): 

273 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

274 

275 task: _TaskScaffolding 

276 """Back-reference to the helper object for the `PipelineTask` this quantum 

277 represents an execution of. 

278 """ 

279 

280 dataId: DataCoordinate 

281 """Data ID for this quantum. 

282 """ 

283 

284 inputs: _DatasetDict 

285 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

286 

287 This is initialized to map each `DatasetType` to an empty dictionary at 

288 construction. Those nested dictionaries are populated (with data IDs as 

289 keys) with unresolved `DatasetRef` instances in 

290 `_PipelineScaffolding.connectDataIds`. 

291 """ 

292 

293 outputs: _DatasetDict 

294 """Nested dictionary containing `DatasetRef` outputs this quantum. 

295 """ 

296 

297 prerequisites: _DatasetDict 

298 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

299 quantum. 

300 """ 

301 

302 def makeQuantum(self) -> Quantum: 

303 """Transform the scaffolding object into a true `Quantum` instance. 

304 

305 Returns 

306 ------- 

307 quantum : `Quantum` 

308 An actual `Quantum` instance. 

309 """ 

310 allInputs = self.inputs.unpackMultiRefs() 

311 allInputs.update(self.prerequisites.unpackMultiRefs()) 

312 # Give the task's Connections class an opportunity to remove some 

313 # inputs, or complain if they are unacceptable. 

314 # This will raise if one of the check conditions is not met, which is 

315 # the intended behavior. 

316 # If it raises NoWorkFound, there is a bug in the QG algorithm 

317 # or the adjustQuantum is incorrectly trying to make a prerequisite 

318 # input behave like a regular input; adjustQuantum should only raise 

319 # NoWorkFound if a regular input is missing, and it shouldn't be 

320 # possible for us to have generated ``self`` if that's true. 

321 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

322 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

323 return Quantum( 

324 taskName=self.task.taskDef.taskName, 

325 taskClass=self.task.taskDef.taskClass, 

326 dataId=self.dataId, 

327 initInputs=self.task.initInputs.unpackSingleRefs(), 

328 inputs=helper.inputs, 

329 outputs=helper.outputs, 

330 ) 

331 

332 

333@dataclass 

334class _TaskScaffolding: 

335 """Helper class aggregating information about a `PipelineTask`, used when 

336 constructing a `QuantumGraph`. 

337 

338 See `_PipelineScaffolding` for a top-down description of the full 

339 scaffolding data structure. 

340 

341 Parameters 

342 ---------- 

343 taskDef : `TaskDef` 

344 Data structure that identifies the task class and its config. 

345 parent : `_PipelineScaffolding` 

346 The parent data structure that will hold the instance being 

347 constructed. 

348 datasetTypes : `TaskDatasetTypes` 

349 Data structure that categorizes the dataset types used by this task. 

350 """ 

351 

352 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

353 universe = parent.dimensions.universe 

354 self.taskDef = taskDef 

355 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

356 assert self.dimensions.issubset(parent.dimensions) 

357 # Initialize _DatasetDicts as subsets of the one or two 

358 # corresponding dicts in the parent _PipelineScaffolding. 

359 self.initInputs = _DatasetDict.fromSubset( 

360 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

361 ) 

362 self.initOutputs = _DatasetDict.fromSubset( 

363 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

364 ) 

365 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

366 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

367 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

368 self.dataIds = set() 

369 self.quanta = {} 

370 

371 def __repr__(self): 

372 # Default dataclass-injected __repr__ gets caught in an infinite loop 

373 # because of back-references. 

374 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

375 

376 taskDef: TaskDef 

377 """Data structure that identifies the task class and its config 

378 (`TaskDef`). 

379 """ 

380 

381 dimensions: DimensionGraph 

382 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

383 """ 

384 

385 initInputs: _DatasetDict 

386 """Dictionary containing information about datasets used to construct this 

387 task (`_DatasetDict`). 

388 """ 

389 

390 initOutputs: _DatasetDict 

391 """Dictionary containing information about datasets produced as a 

392 side-effect of constructing this task (`_DatasetDict`). 

393 """ 

394 

395 inputs: _DatasetDict 

396 """Dictionary containing information about datasets used as regular, 

397 graph-constraining inputs to this task (`_DatasetDict`). 

398 """ 

399 

400 outputs: _DatasetDict 

401 """Dictionary containing information about datasets produced by this task 

402 (`_DatasetDict`). 

403 """ 

404 

405 prerequisites: _DatasetDict 

406 """Dictionary containing information about input datasets that must be 

407 present in the repository before any Pipeline containing this task is run 

408 (`_DatasetDict`). 

409 """ 

410 

411 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

412 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

413 this task with that data ID. 

414 """ 

415 

416 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]: 

417 """Create a `set` of `Quantum` from the information in ``self``. 

418 

419 Returns 

420 ------- 

421 nodes : `set` of `Quantum` 

422 The `Quantum` elements corresponding to this task. 

423 """ 

424 if unresolvedRefs is None: 

425 unresolvedRefs = set() 

426 outputs = set() 

427 for q in self.quanta.values(): 

428 try: 

429 tmpQuantum = q.makeQuantum() 

430 outputs.add(tmpQuantum) 

431 except (NoWorkFound, FileNotFoundError) as exc: 

432 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

433 if unresolvedRefs.intersection(refs): 

434 # This means it is a node that is known to be pruned 

435 # later and should be left in even though some follow-up 

436 # queries fail. This allows the pruning to start from this 

437 # quantum with known issues, and prune other nodes it 

438 # touches. 

439 inputs = q.inputs.unpackMultiRefs() 

440 inputs.update(q.prerequisites.unpackMultiRefs()) 

441 tmpQuantum = Quantum( 

442 taskName=q.task.taskDef.taskName, 

443 taskClass=q.task.taskDef.taskClass, 

444 dataId=q.dataId, 

445 initInputs=q.task.initInputs.unpackSingleRefs(), 

446 inputs=inputs, 

447 outputs=q.outputs.unpackMultiRefs(), 

448 ) 

449 outputs.add(tmpQuantum) 

450 else: 

451 raise exc 

452 return outputs 

453 

454 

455@dataclass 

456class _PipelineScaffolding: 

457 """A helper data structure that organizes the information involved in 

458 constructing a `QuantumGraph` for a `Pipeline`. 

459 

460 Parameters 

461 ---------- 

462 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

463 Sequence of tasks from which a graph is to be constructed. Must 

464 have nested task classes already imported. 

465 universe : `DimensionUniverse` 

466 Universe of all possible dimensions. 

467 

468 Notes 

469 ----- 

470 The scaffolding data structure contains nested data structures for both 

471 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

472 data structures are shared between the pipeline-level structure (which 

473 aggregates all datasets and categorizes them from the perspective of the 

474 complete pipeline) and the individual tasks that use them as inputs and 

475 outputs. 

476 

477 `QuantumGraph` construction proceeds in four steps, with each corresponding 

478 to a different `_PipelineScaffolding` method: 

479 

480 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

481 the DatasetTypes used by the pipeline (delegating to 

482 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

483 nested `_TaskScaffolding` and `_DatasetDict` objects. 

484 

485 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

486 returns related tuples of all dimensions used to identify any regular 

487 input, output, and intermediate datasets (not prerequisites). We then 

488 iterate over these tuples of related dimensions, identifying the subsets 

489 that correspond to distinct data IDs for each task and dataset type, 

490 and then create `_QuantumScaffolding` objects. 

491 

492 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

493 dataset data IDs previously identified, transforming unresolved 

494 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

495 up prerequisite datasets for all quanta. 

496 

497 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

498 per-task `_QuantumScaffolding` objects. 

499 """ 

500 

501 def __init__(self, pipeline, *, registry): 

502 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

503 self.tasks = [] 

504 # Aggregate and categorize the DatasetTypes in the Pipeline. 

505 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

506 # Construct dictionaries that map those DatasetTypes to structures 

507 # that will (later) hold additional information about them. 

508 for attr in ( 

509 "initInputs", 

510 "initIntermediates", 

511 "initOutputs", 

512 "inputs", 

513 "intermediates", 

514 "outputs", 

515 "prerequisites", 

516 ): 

517 setattr( 

518 self, 

519 attr, 

520 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

521 ) 

522 # Aggregate all dimensions for all non-init, non-prerequisite 

523 # DatasetTypes. These are the ones we'll include in the big join 

524 # query. 

525 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

526 # Construct scaffolding nodes for each Task, sharing the dataset 

527 # dictionaries between the pipeline and the tasks that use them. 

528 # Note that there's only one scaffolding node for each DatasetType, 

529 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

530 # reference it. 

531 if isinstance(pipeline, Pipeline): 

532 pipeline = pipeline.toExpandedPipeline() 

533 self.tasks = [ 

534 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

535 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

536 ] 

537 

538 def __repr__(self): 

539 # Default dataclass-injected __repr__ gets caught in an infinite loop 

540 # because of back-references. 

541 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

542 

543 tasks: List[_TaskScaffolding] 

544 """Scaffolding data structures for each task in the pipeline 

545 (`list` of `_TaskScaffolding`). 

546 """ 

547 

548 initInputs: _DatasetDict 

549 """Datasets consumed but not produced when constructing the tasks in this 

550 pipeline (`_DatasetDict`). 

551 """ 

552 

553 initIntermediates: _DatasetDict 

554 """Datasets that are both consumed and produced when constructing the tasks 

555 in this pipeline (`_DatasetDict`). 

556 """ 

557 

558 initOutputs: _DatasetDict 

559 """Datasets produced but not consumed when constructing the tasks in this 

560 pipeline (`_DatasetDict`). 

561 """ 

562 

563 inputs: _DatasetDict 

564 """Datasets that are consumed but not produced when running this pipeline 

565 (`_DatasetDict`). 

566 """ 

567 

568 intermediates: _DatasetDict 

569 """Datasets that are both produced and consumed when running this pipeline 

570 (`_DatasetDict`). 

571 """ 

572 

573 outputs: _DatasetDict 

574 """Datasets produced but not consumed when when running this pipeline 

575 (`_DatasetDict`). 

576 """ 

577 

578 prerequisites: _DatasetDict 

579 """Datasets that are consumed when running this pipeline and looked up 

580 per-Quantum when generating the graph (`_DatasetDict`). 

581 """ 

582 

583 dimensions: DimensionGraph 

584 """All dimensions used by any regular input, intermediate, or output 

585 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

586 Query" (`DimensionGraph`). 

587 

588 This is required to be a superset of all task quantum dimensions. 

589 """ 

590 

591 @contextmanager 

592 def connectDataIds( 

593 self, 

594 registry, 

595 collections, 

596 userQuery, 

597 externalDataId, 

598 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

599 ): 

600 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

601 

602 This method populates `_TaskScaffolding.quanta` and the data-ID keys of the 

603 nested `_DatasetDict` dictionaries (except for those in `prerequisites`). 

604 

605 Parameters 

606 ---------- 

607 registry : `lsst.daf.butler.Registry` 

608 Registry for the data repository; used for all data ID queries. 

609 collections 

610 Expressions representing the collections to search for input 

611 datasets. May be any of the types accepted by 

612 `lsst.daf.butler.CollectionSearch.fromExpression`. 

613 userQuery : `str` or `None` 

614 User-provided expression to limit the data IDs processed. 

615 externalDataId : `DataCoordinate` 

616 Externally-provided data ID that should be used to restrict the 

617 results, just as if these constraints had been included via ``AND`` 

618 in ``userQuery``. This includes (at least) any instrument named 

619 in the pipeline definition. 

620 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

621 The query constraint variant that should be used to constrain the 

622 query based on dataset existence, defaults to 

623 `DatasetQueryConstraintVariant.ALL`. 

624 

625 Returns 

626 ------- 

627 commonDataIds : \ 

628 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

629 An interface to a database temporary table containing all data IDs 

630 that will appear in this `QuantumGraph`. Returned inside a 

631 context manager, which will drop the temporary table at the end of 

632 the `with` block in which this method is called. 

633 """ 

634 _LOG.debug("Building query for data IDs.") 

635 # Initialization datasets always have empty data IDs. 

636 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

637 for datasetType, refs in itertools.chain( 

638 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

639 ): 

640 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

641 # Run one big query for the data IDs for task dimensions and regular 

642 # inputs and outputs. We limit the query to only dimensions that are 

643 # associated with the input dataset types, but don't (yet) try to 

644 # obtain the dataset_ids for those inputs. 

645 _LOG.debug("Submitting data ID query and materializing results.") 

646 queryArgs = {"dimensions": self.dimensions, "where": userQuery, "dataId": externalDataId} 

647 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

648 _LOG.debug("Constraining graph query using all datasets in pipeline.") 

649 queryArgs["datasets"] = list(self.inputs) 

650 queryArgs["collections"] = collections 

651 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

652 _LOG.debug("Not using dataset existence to constrain query.") 

653 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

654 constraint = set(datasetQueryConstraint) 

655 inputs = {k.name: k for k in self.inputs.keys()} 

656 if remainder := constraint.difference(inputs.keys()): 

657 raise ValueError( 

658 f"{remainder} dataset type(s) specified as a graph constraint, but" 

659 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

660 ) 

661 _LOG.debug("Constraining graph query using %s", constraint) 

662 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

663 queryArgs["collections"] = collections 

664 else: 

665 raise ValueError( 

666 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

667 ) 

668 

669 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

670 _LOG.debug("Expanding data IDs.") 

671 commonDataIds = commonDataIds.expanded() 

672 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

673 # Iterate over query results, populating data IDs for datasets and 

674 # quanta and then connecting them to each other. 

675 n = -1 

676 for n, commonDataId in enumerate(commonDataIds): 

677 # Create DatasetRefs for all DatasetTypes from this result row, 

678 # noting that we might have created some already. 

679 # We remember both those that already existed and those that we 

680 # create now. 

681 refsForRow = {} 

682 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

683 for datasetType, refs in itertools.chain( 

684 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

685 ): 

686 if not (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)): 

687 datasetDataId = commonDataId.subset(datasetType.dimensions) 

688 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

689 ref = refs.get(datasetDataId) 

690 if ref is None: 

691 ref = DatasetRef(datasetType, datasetDataId) 

692 refs[datasetDataId] = ref 

693 refsForRow[datasetType.name] = ref 

694 # Create _QuantumScaffolding objects for all tasks from this 

695 # result row, noting that we might have created some already. 

696 for task in self.tasks: 

697 quantumDataId = commonDataId.subset(task.dimensions) 

698 quantum = task.quanta.get(quantumDataId) 

699 if quantum is None: 

700 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

701 task.quanta[quantumDataId] = quantum 

702 # Whether this is a new quantum or an existing one, we can 

703 # now associate the DatasetRefs for this row with it. The 

704 # fact that a Quantum data ID and a dataset data ID both 

705 # came from the same result row is what tells us they 

706 # should be associated. 

707 # Many of these associations will be duplicates (because 

708 # another query row that differed from this one only in 

709 # irrelevant dimensions already added them); the nested 

710 # dictionaries keyed by data ID make those duplicates harmless. 

711 for datasetType in task.inputs: 

712 ref = refsForRow[datasetType.name] 

713 quantum.inputs[datasetType.name][ref.dataId] = ref 

714 for datasetType in task.outputs: 

715 ref = refsForRow[datasetType.name] 

716 quantum.outputs[datasetType.name][ref.dataId] = ref 

717 if n < 0: 

718 emptiness_explained = False 

719 for message in commonDataIds.explain_no_results(): 

720 _LOG.warning(message) 

721 emptiness_explained = True 

722 if not emptiness_explained: 

723 _LOG.warning( 

724 "To reproduce this query for debugging purposes, run " 

725 "Registry.queryDataIds with these arguments:" 

726 ) 

727 # We could just repr() the queryArgs dict to get something 

728 # the user could make sense of, but it's friendlier to 

729 # put these args in an easier-to-construct equivalent form 

730 # so they can read it more easily and copy and paste into 

731 # a Python terminal. 

732 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

733 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName()) 

734 if queryArgs["where"]: 

735 _LOG.warning(" where=%s,", repr(queryArgs["where"])) 

736 if "datasets" in queryArgs: 

737 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

738 if "collections" in queryArgs: 

739 _LOG.warning(" collections=%s,", list(queryArgs["collections"])) 

740 _LOG.debug("Finished processing %d rows from data ID query.", n) 

741 yield commonDataIds 

742 

743 def resolveDatasetRefs( 

744 self, 

745 registry, 

746 collections, 

747 run, 

748 commonDataIds, 

749 *, 

750 skipExistingIn=None, 

751 clobberOutputs=True, 

752 constrainedByAllDatasets: bool = True, 

753 ): 

754 """Perform follow up queries for each dataset data ID produced in 

755 `fillDataIds`. 

756 

757 This method resolves the `DatasetRef` values held in the `_DatasetDict` 

758 dictionaries (except for those in `prerequisites`). 

759 

760 Parameters 

761 ---------- 

762 registry : `lsst.daf.butler.Registry` 

763 Registry for the data repository; used for all data ID queries. 

764 collections 

765 Expressions representing the collections to search for input 

766 datasets. May be any of the types accepted by 

767 `lsst.daf.butler.CollectionSearch.fromExpression`. 

768 run : `str`, optional 

769 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

770 output datasets, if it already exists. 

771 commonDataIds : \ 

772 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

773 Result of a previous call to `connectDataIds`. 

774 skipExistingIn 

775 Expressions representing the collections to search for existing 

776 output datasets that should be skipped. May be any of the types 

777 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

778 `None` or empty string/sequence disables skipping. 

779 clobberOutputs : `bool`, optional 

780 If `True` (default), allow quanta to be created even if outputs exist; 

781 this requires the same behavior to be enabled when 

782 executing. If ``skipExistingIn`` is not `None`, completed quanta 

783 (those with metadata, or all outputs if there is no metadata 

784 dataset configured) will be skipped rather than clobbered. 

785 constrainedByAllDatasets : `bool`, optional 

786 Indicates if the commonDataIds were generated with a constraint on 

787 all dataset types. 

788 

789 Raises 

790 ------ 

791 OutputExistsError 

792 Raised if an output dataset already exists in the output run 

793 and ``skipExistingIn`` does not include the output run, or if only 

794 some outputs are present and ``clobberOutputs`` is `False`. 

795 """ 

796 skipCollections: Optional[CollectionSearch] = None 

797 skipExistingInRun = False 

798 if skipExistingIn: 

799 skipCollections = CollectionSearch.fromExpression(skipExistingIn) 

800 if run: 

801 # As an optimization, check the explicit list of names first 

802 skipExistingInRun = run in skipCollections.explicitNames() 

803 if not skipExistingInRun: 

804 # need to flatten it and check again 

805 skipExistingInRun = run in registry.queryCollections( 

806 skipExistingIn, 

807 collectionTypes=CollectionType.RUN, 

808 ) 

809 

810 # Look up [init] intermediate and output datasets in the output 

811 # collection, if there is an output collection. 

812 if run is not None or skipCollections is not None: 

813 for datasetType, refs in itertools.chain( 

814 self.initIntermediates.items(), 

815 self.initOutputs.items(), 

816 self.intermediates.items(), 

817 self.outputs.items(), 

818 ): 

819 _LOG.debug( 

820 "Resolving %d datasets for intermediate and/or output dataset %s.", 

821 len(refs), 

822 datasetType.name, 

823 ) 

824 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

825 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

826 

827 # look at RUN collection first 

828 if run is not None: 

829 resolvedRefQueryResults = subset.findDatasets( 

830 datasetType, collections=run, findFirst=True 

831 ) 

832 for resolvedRef in resolvedRefQueryResults: 

833 # TODO: we could easily support per-DatasetType 

834 # skipExisting and I could imagine that being useful - 

835 # it's probably required in order to support writing 

836 # initOutputs before QuantumGraph generation. 

837 assert resolvedRef.dataId in refs 

838 if not (skipExistingInRun or isInit or clobberOutputs): 

839 raise OutputExistsError( 

840 f"Output dataset {datasetType.name} already exists in " 

841 f"output RUN collection '{run}' with data ID" 

842 f" {resolvedRef.dataId}." 

843 ) 

844 

845 # And check skipExistingIn too; the case where the RUN collection 

846 # is in it is handled above 

847 if skipCollections is not None: 

848 resolvedRefQueryResults = subset.findDatasets( 

849 datasetType, collections=skipCollections, findFirst=True 

850 ) 

851 for resolvedRef in resolvedRefQueryResults: 

852 assert resolvedRef.dataId in refs 

853 refs[resolvedRef.dataId] = resolvedRef 

854 

855 # Look up input and initInput datasets in the input collection(s). 

856 # Container to accumulate unfound refs, in case the common data IDs were 

857 # not constrained on dataset type existence. 

858 self.unfoundRefs = set() 

859 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

860 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

861 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets( 

862 datasetType, collections=collections, findFirst=True 

863 ) 

864 dataIdsNotFoundYet = set(refs.keys()) 

865 for resolvedRef in resolvedRefQueryResults: 

866 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

867 refs[resolvedRef.dataId] = resolvedRef 

868 if dataIdsNotFoundYet: 

869 if constrainedByAllDatasets: 

870 raise RuntimeError( 

871 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

872 f"'{datasetType.name}' was/were present in a previous " 

873 f"query, but could not be found now." 

874 f"This is either a logic bug in QuantumGraph generation " 

875 f"or the input collections have been modified since " 

876 f"QuantumGraph generation began." 

877 ) 

878 else: 

879 # If the common data IDs were not constrained using all the 

880 # input dataset types, it is possible that some data IDs 

881 # found don't correspond to existing datasets and they 

882 # will be left unresolved. Mark these for later pruning from 

883 # the quantum graph. 

884 for k in dataIdsNotFoundYet: 

885 self.unfoundRefs.add(refs[k]) 

886 

887 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

888 # replacing the unresolved refs there, and then look up prerequisites. 

889 for task in self.tasks: 

890 _LOG.debug( 

891 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

892 len(task.quanta), 

893 task.taskDef.label, 

894 ) 

895 lookupFunctions = { 

896 c.name: c.lookupFunction 

897 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

898 if c.lookupFunction is not None 

899 } 

900 dataIdsFailed = [] 

901 dataIdsSucceeded = [] 

902 for quantum in task.quanta.values(): 

903 # Process output datasets only if skipExistingIn is not None 

904 # or there is a run to look for outputs in and clobberOutputs 

905 # is True. Note that if skipExistingIn is None, any output 

906 # datasets that already exist would have already caused an 

907 # exception to be raised. We never update the DatasetRefs in 

908 # the quantum because those should never be resolved. 

909 if skipCollections is not None or (run is not None and clobberOutputs): 

910 resolvedRefs = [] 

911 unresolvedRefs = [] 

912 haveMetadata = False 

913 for datasetType, originalRefs in quantum.outputs.items(): 

914 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

915 if ref.id is not None: 

916 resolvedRefs.append(ref) 

917 if datasetType.name == task.taskDef.metadataDatasetName: 

918 haveMetadata = True 

919 else: 

920 unresolvedRefs.append(ref) 

921 if resolvedRefs: 

922 if haveMetadata or not unresolvedRefs: 

923 dataIdsSucceeded.append(quantum.dataId) 

924 if skipCollections is not None: 

925 continue 

926 else: 

927 dataIdsFailed.append(quantum.dataId) 

928 if not clobberOutputs: 

929 raise OutputExistsError( 

930 f"Quantum {quantum.dataId} of task with label " 

931 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

932 f"({resolvedRefs}) " 

933 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

934 "and clobbering outputs was not enabled." 

935 ) 

936 # Update the input DatasetRefs to the resolved ones we already 

937 # searched for. 

938 for datasetType, refs in quantum.inputs.items(): 

939 for ref in task.inputs.extract(datasetType, refs.keys()): 

940 refs[ref.dataId] = ref 

941 # Look up prerequisite datasets in the input collection(s). 

942 # These may have dimensions that extend beyond those we queried 

943 # for originally, because we want to permit those data ID 

944 # values to differ across quanta and dataset types. 

945 for datasetType in task.prerequisites: 

946 lookupFunction = lookupFunctions.get(datasetType.name) 

947 if lookupFunction is not None: 

948 # PipelineTask has provided its own function to do the 

949 # lookup. This always takes precedence. 

950 refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

951 elif ( 

952 datasetType.isCalibration() 

953 and datasetType.dimensions <= quantum.dataId.graph 

954 and quantum.dataId.graph.temporal 

955 ): 

956 # This is a master calibration lookup, which we have to 

957 # handle specially because the query system can't do a 

958 # temporal join on a non-dimension-based timespan yet. 

959 timespan = quantum.dataId.timespan 

960 try: 

961 refs = [ 

962 registry.findDataset( 

963 datasetType, quantum.dataId, collections=collections, timespan=timespan 

964 ) 

965 ] 

966 except KeyError: 

967 # This dataset type is not present in the registry, 

968 # which just means there are no datasets here. 

969 refs = [] 

970 else: 

971 # Most general case. 

972 refs = list( 

973 registry.queryDatasets( 

974 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True 

975 ).expanded() 

976 ) 

977 quantum.prerequisites[datasetType].update( 

978 {ref.dataId: ref for ref in refs if ref is not None} 

979 ) 

980 # Actually remove any quanta that we decided to skip above. 

981 if dataIdsSucceeded: 

982 if skipCollections is not None: 

983 _LOG.debug( 

984 "Pruning successful %d quanta for task with label '%s' because all of their " 

985 "outputs exist or metadata was written successfully.", 

986 len(dataIdsSucceeded), 

987 task.taskDef.label, 

988 ) 

989 for dataId in dataIdsSucceeded: 

990 del task.quanta[dataId] 

991 elif clobberOutputs: 

992 _LOG.info( 

993 "Found %d successful quanta for task with label '%s' " 

994 "that will need to be clobbered during execution.", 

995 len(dataIdsSucceeded), 

996 task.taskDef.label, 

997 ) 

998 else: 

999 raise AssertionError("OutputExistsError should have already been raised.") 

1000 if dataIdsFailed: 

1001 if clobberOutputs: 

1002 _LOG.info( 

1003 "Found %d failed/incomplete quanta for task with label '%s' " 

1004 "that will need to be clobbered during execution.", 

1005 len(dataIdsFailed), 

1006 task.taskDef.label, 

1007 ) 

1008 else: 

1009 raise AssertionError("OutputExistsError should have already been raised.") 

1010 

1011 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None): 

1012 """Create a `QuantumGraph` from the quanta already present in 

1013 the scaffolding data structure. 

1014 

1015 Parameters 

1016 ---------- 

1017 metadata : `Mapping` [ `str`, `Any` ], optional 

1018 Optional extra data to carry with the 

1019 graph. Entries in this mapping should be serializable to 

1020 JSON. 

1021 

1022 Returns 

1023 ------- 

1024 graph : `QuantumGraph` 

1025 The full `QuantumGraph`. 

1026 """ 

1027 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

1028 for task in self.tasks: 

1029 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs) 

1030 graphInput[task.taskDef] = qset 

1031 

1032 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs) 

1033 return graph 
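
# Example (editorial sketch): metadata passed here is attached to the graph
# as-is, so entries should be JSON-serializable (keys and values are
# illustrative):
#
#     qgraph = scaffolding.makeQuantumGraph(
#         metadata={"user": "jane", "comment": "nightly test"}
#     )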

1034 

1035 

1036# ------------------------ 

1037# Exported definitions -- 

1038# ------------------------ 

1039 

1040 

1041class GraphBuilderError(Exception): 

1042 """Base class for exceptions generated by graph builder.""" 

1043 

1044 pass 

1045 

1046 

1047class OutputExistsError(GraphBuilderError): 

1048 """Exception generated when output datasets already exist.""" 

1049 

1050 pass 

1051 

1052 

1053class PrerequisiteMissingError(GraphBuilderError): 

1054 """Exception generated when a prerequisite dataset does not exist.""" 

1055 

1056 pass 

1057 

1058 

1059class GraphBuilder(object): 

1060 """GraphBuilder class is responsible for building task execution graph from 

1061 a Pipeline. 

1062 

1063 Parameters 

1064 ---------- 

1065 registry : `~lsst.daf.butler.Registry` 

1066 Data butler instance. 

1067 skipExistingIn 

1068 Expressions representing the collections to search for existing 

1069 output datasets that should be skipped. May be any of the types 

1070 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

1071 clobberOutputs : `bool`, optional 

1072 If `True` (default), allow quanta to be created even if partial outputs 

1073 exist; this requires the same behavior to be enabled when 

1074 executing. 

1075 """ 

1076 

1077 def __init__(self, registry, skipExistingIn=None, clobberOutputs=True): 

1078 self.registry = registry 

1079 self.dimensions = registry.dimensions 

1080 self.skipExistingIn = skipExistingIn 

1081 self.clobberOutputs = clobberOutputs 

1082 

1083 def makeGraph( 

1084 self, 

1085 pipeline, 

1086 collections, 

1087 run, 

1088 userQuery, 

1089 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1090 metadata: Optional[Mapping[str, Any]] = None, 

1091 ): 

1092 """Create execution graph for a pipeline. 

1093 

1094 Parameters 

1095 ---------- 

1096 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1097 Pipeline definition, task names/classes and their configs. 

1098 collections 

1099 Expressions representing the collections to search for input 

1100 datasets. May be any of the types accepted by 

1101 `lsst.daf.butler.CollectionSearch.fromExpression`. 

1102 run : `str`, optional 

1103 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1104 output datasets, if it already exists. 

1105 userQuery : `str` or `None` 

1106 String that defines a user-provided selection for the registry; should be 

1107 empty or `None` if there are no restrictions on data selection. 

1108 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1109 The query constraint variant that should be used to constrain the 

1110 query based on dataset existence, defaults to 

1111 `DatasetQueryConstraintVariant.ALL`. 

1112 metadata : `Mapping` [ `str`, `Any` ], optional 

1113 Optional extra data to carry with the 

1114 graph. Entries in this mapping should be serializable to 

1115 JSON. 

1116 

1117 Returns 

1118 ------- 

1119 graph : `QuantumGraph` 

1120 

1121 Raises 

1122 ------ 

1123 UserExpressionError 

1124 Raised when user expression cannot be parsed. 

1125 OutputExistsError 

1126 Raised when output datasets already exist. 

1127 Exception 

1128 Other exceptions types may be raised by underlying registry 

1129 classes. 

1130 """ 

1131 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1132 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1133 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1134 instrument = None 

1135 if isinstance(pipeline, Pipeline): 

1136 instrument = pipeline.getInstrument() 

1137 if isinstance(instrument, str): 

1138 instrument = doImport(instrument) 

1139 pipeline = list(pipeline.toExpandedPipeline()) 

1140 if instrument is not None: 

1141 dataId = DataCoordinate.standardize( 

1142 instrument=instrument.getName(), universe=self.registry.dimensions 

1143 ) 

1144 else: 

1145 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1146 with scaffolding.connectDataIds( 

1147 self.registry, collections, userQuery, dataId, datasetQueryConstraint 

1148 ) as commonDataIds: 

1149 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1150 scaffolding.resolveDatasetRefs( 

1151 self.registry, 

1152 collections, 

1153 run, 

1154 commonDataIds, 

1155 skipExistingIn=self.skipExistingIn, 

1156 clobberOutputs=self.clobberOutputs, 

1157 constrainedByAllDatasets=condition, 

1158 ) 

1159 return scaffolding.makeQuantumGraph(metadata=metadata)
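
# Example (editorial sketch): end-to-end use of GraphBuilder.  The repository
# path, pipeline file, collection names, run name, and query string are all
# illustrative placeholders, and ``Pipeline.fromFile`` is assumed to be the
# available way to read a pipeline definition.
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/path/to/repo")
#     pipeline = Pipeline.fromFile("pipeline.yaml")
#     builder = GraphBuilder(butler.registry, skipExistingIn=None, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/jane/test-run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )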