Coverage for python/lsst/pipe/base/graphBuilder.py: 17%

347 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap 

34from contextlib import contextmanager 

35from dataclasses import dataclass 

36from typing import Any, Collection, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Union 

37 

38from lsst.daf.butler import ( 

39 CollectionSearch, 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetRef, 

43 DatasetType, 

44 DimensionGraph, 

45 DimensionUniverse, 

46 NamedKeyDict, 

47 Quantum, 

48 Registry, 

49) 

50from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

51from lsst.utils import doImportType 

52 

53from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

54from ._status import NoWorkFound 

55 

56# ----------------------------- 

57# Imports for other modules -- 

58# ----------------------------- 

59from .connections import AdjustQuantumHelper, iterConnections 

60from .graph import QuantumGraph 

61from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

62 

63# ---------------------------------- 

64# Local non-exported definitions -- 

65# ---------------------------------- 

66 

67_LOG = logging.getLogger(__name__) 

68 

69 

70class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

71 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

72 the known `DatasetRef` instances of that type. 

73 

74 Parameters 

75 ---------- 

76 args 

77 Positional arguments are forwarded to the `dict` constructor. 

78 universe : `DimensionUniverse` 

79 Universe of all possible dimensions. 

80 """ 

81 

82 def __init__(self, *args: Any, universe: DimensionUniverse): 

83 super().__init__(*args) 

84 self.universe = universe 

85 

86 @classmethod 

87 def fromDatasetTypes( 

88 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

89 ) -> _DatasetDict: 

90 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

91 

92 Parameters 

93 ---------- 

94 datasetTypes : `iterable` of `DatasetType` 

95 DatasetTypes to use as keys for the dict. Values will be empty 

96 dictionaries. 

97 universe : `DimensionUniverse` 

98 Universe of all possible dimensions. 

99 

100 Returns 

101 ------- 

102 dictionary : `_DatasetDict` 

103 A new `_DatasetDict` instance. 

104 """ 

105 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

106 

107 @classmethod 

108 def fromSubset( 

109 cls, datasetTypes: Collection[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

110 ) -> _DatasetDict: 

111 """Return a new dictionary by extracting items corresponding to the 

112 given keys from one or more existing dictionaries. 

113 

114 Parameters 

115 ---------- 

116 datasetTypes : `iterable` of `DatasetType` 

117 DatasetTypes to use as keys for the dict. Values will be obtained 

118 by lookups against ``first`` and ``rest``. 

119 first : `_DatasetDict` 

120 Another dictionary from which to extract values. 

121 rest 

122 Additional dictionaries from which to extract values. 

123 

124 Returns 

125 ------- 

126 dictionary : `_DatasetDict` 

127 A new dictionary instance. 

128 """ 

129 combined = ChainMap(first, *rest) 

130 

131 # Dataset types known to match immediately can be processed 

132 # without checks. 

133 matches = combined.keys() & set(datasetTypes) 

134 _dict = {k: combined[k] for k in matches} 

135 

136 if len(_dict) < len(datasetTypes): 

137 # Work out which ones are missing. 

138 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

139 

140 # Get the known names for comparison. 

141 combined_by_name = {k.name: k for k in combined} 

142 

143 missing = set() 

144 incompatible = {} 

145 for datasetType in missing_datasetTypes: 

146 # The dataset type is not found. It may not be listed 

147 # or it may be that it is there with the same name 

148 # but different definition. 

149 if datasetType.name in combined_by_name: 

150 # This implies some inconsistency in definitions 

151 # for connections. If there is support for storage 

152 # class conversion we can let it slide. 

153 # At this point we do not know 

154 # where the inconsistency is but trust that

155 # downstream code will be more explicit about input

156 # vs output incompatibilities. 

157 existing = combined_by_name[datasetType.name] 

158 if existing.is_compatible_with(datasetType) or datasetType.is_compatible_with(existing): 

159 _LOG.warning( 

160 "Dataset type mismatch (%s != %s) but continuing since they are compatible", 

161 datasetType, 

162 existing, 

163 ) 

164 _dict[datasetType] = combined[existing] 

165 else: 

166 incompatible[datasetType] = existing 

167 else: 

168 missing.add(datasetType) 

169 

170 if missing or incompatible: 

171 reasons = [] 

172 if missing: 

173 reasons.append( 

174 "DatasetTypes {'.'.join(missing)} not present in list of known types: " 

175 + ", ".join(d.name for d in combined) 

176 ) 

177 if incompatible: 

178 for x, y in incompatible.items(): 

179 reasons.append(f"{x} incompatible with {y}") 

180 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

181 

182 return cls(_dict, universe=first.universe) 

183 

184 @property 

185 def dimensions(self) -> DimensionGraph: 

186 """The union of all dimensions used by all dataset types in this 

187 dictionary, including implied dependencies (`DimensionGraph`). 

188 """ 

189 base = self.universe.empty 

190 if len(self) == 0: 

191 return base 

192 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

193 

194 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

195 """Unpack nested single-element `DatasetRef` dicts into a new 

196 mapping with `DatasetType` keys and `DatasetRef` values. 

197 

198 This method assumes that each nested dictionary contains exactly one

199 item, as is the case for all "init" datasets.

200 

201 Returns 

202 ------- 

203 dictionary : `NamedKeyDict` 

204 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

205 `DatasetType` instances and string names usable as keys. 

206 """ 

207 

208 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

209 (ref,) = refs.values() 

210 return ref 

211 

212 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

213 

214 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

215 """Unpack nested multi-element `DatasetRef` dicts into a new 

216 mapping with `DatasetType` keys and `set` of `DatasetRef` values. 

217 

218 Returns 

219 ------- 

220 dictionary : `NamedKeyDict` 

221 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

222 both `DatasetType` instances and string names usable as keys. 

223 """ 

224 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

225 

226 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

227 """Iterate over the contained `DatasetRef` instances that match the 

228 given `DatasetType` and data IDs. 

229 

230 Parameters 

231 ---------- 

232 datasetType : `DatasetType` 

233 Dataset type to match. 

234 dataIds : `Iterable` [ `DataCoordinate` ] 

235 Data IDs to match. 

236 

237 Returns 

238 ------- 

239 refs : `Iterator` [ `DatasetRef` ] 

240 DatasetRef instances for which ``ref.datasetType == datasetType`` 

241 and ``ref.dataId`` is in ``dataIds``. 

242 """ 

243 refs = self[datasetType] 

244 return (refs[dataId] for dataId in dataIds) 

245 

246 
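# ---------------------------------------------------------------------------
# Illustrative sketch (not part of graphBuilder.py): how the _DatasetDict
# helpers above are typically combined.  The dataset type name, dimensions,
# storage class, and data ID values are hypothetical, chosen for the default
# daf_butler dimension universe; they are assumptions, not part of this API.


def _example_dataset_dict() -> None:
    universe = DimensionUniverse()
    calexp = DatasetType(
        "calexp",
        dimensions=("instrument", "visit", "detector"),
        storageClass="ExposureF",
        universe=universe,
    )
    # Pipeline-level dict: one (initially empty) nested dict per DatasetType.
    pipeline_outputs = _DatasetDict.fromDatasetTypes([calexp], universe=universe)
    # Task-level dict sharing the same nested dict objects, restricted to the
    # dataset types a particular task uses.
    task_outputs = _DatasetDict.fromSubset([calexp], pipeline_outputs)
    assert task_outputs[calexp] is pipeline_outputs[calexp]
    # Populate with an unresolved DatasetRef, as connectDataIds does for
    # regular inputs and outputs.
    dataId = DataCoordinate.standardize(
        instrument="HSC", visit=903334, detector=16, universe=universe
    )
    task_outputs[calexp][dataId] = DatasetRef(calexp, dataId)
    # unpackMultiRefs flattens the nested dicts into lists of refs, keyed by
    # dataset type or name; unpackSingleRefs is the analogue for init
    # datasets, whose nested dicts hold exactly one (empty data ID) ref.
    assert task_outputs.unpackMultiRefs()["calexp"][0].dataId == dataId
    # extract() iterates over the refs of one type that match given data IDs.
    (ref,) = task_outputs.extract(calexp, [dataId])
    assert ref.dataId == dataId


# ---------------------------------------------------------------------------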

247class _QuantumScaffolding: 

248 """Helper class aggregating information about a `Quantum`, used when 

249 constructing a `QuantumGraph`. 

250 

251 See `_PipelineScaffolding` for a top-down description of the full 

252 scaffolding data structure. 

253 

254 Parameters 

255 ---------- 

256 task : _TaskScaffolding 

257 Back-reference to the helper object for the `PipelineTask` this quantum 

258 represents an execution of. 

259 dataId : `DataCoordinate` 

260 Data ID for this quantum. 

261 """ 

262 

263 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

264 self.task = task 

265 self.dataId = dataId 

266 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

267 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

268 self.prerequisites = _DatasetDict.fromDatasetTypes( 

269 task.prerequisites.keys(), universe=dataId.universe 

270 ) 

271 

272 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

273 

274 def __repr__(self) -> str: 

275 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

276 

277 task: _TaskScaffolding 

278 """Back-reference to the helper object for the `PipelineTask` this quantum 

279 represents an execution of. 

280 """ 

281 

282 dataId: DataCoordinate 

283 """Data ID for this quantum. 

284 """ 

285 

286 inputs: _DatasetDict 

287 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

288 

289 This is initialized to map each `DatasetType` to an empty dictionary at 

290 construction. Those nested dictionaries are populated (with data IDs as 

291 keys) with unresolved `DatasetRef` instances in 

292 `_PipelineScaffolding.connectDataIds`. 

293 """ 

294 

295 outputs: _DatasetDict 

296 """Nested dictionary containing `DatasetRef` outputs this quantum. 

297 """ 

298 

299 prerequisites: _DatasetDict 

300 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

301 quantum. 

302 """ 

303 

304 def makeQuantum(self) -> Quantum: 

305 """Transform the scaffolding object into a true `Quantum` instance. 

306 

307 Returns 

308 ------- 

309 quantum : `Quantum` 

310 An actual `Quantum` instance. 

311 """ 

312 allInputs = self.inputs.unpackMultiRefs() 

313 allInputs.update(self.prerequisites.unpackMultiRefs()) 

314 # Give the task's Connections class an opportunity to remove some 

315 # inputs, or complain if they are unacceptable. 

316 # This will raise if one of the check conditions is not met, which is 

317 # the intended behavior. 

318 # If it raises NoWorkFound, there is a bug in the QG algorithm

319 # or the adjustQuantum is incorrectly trying to make a prerequisite 

320 # input behave like a regular input; adjustQuantum should only raise 

321 # NoWorkFound if a regular input is missing, and it shouldn't be 

322 # possible for us to have generated ``self`` if that's true. 

323 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

324 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

325 return Quantum( 

326 taskName=self.task.taskDef.taskName, 

327 taskClass=self.task.taskDef.taskClass, 

328 dataId=self.dataId, 

329 initInputs=self.task.initInputs.unpackSingleRefs(), 

330 inputs=helper.inputs, 

331 outputs=helper.outputs, 

332 ) 

333 

334 

335@dataclass 

336class _TaskScaffolding: 

337 """Helper class aggregating information about a `PipelineTask`, used when 

338 constructing a `QuantumGraph`. 

339 

340 See `_PipelineScaffolding` for a top-down description of the full 

341 scaffolding data structure. 

342 

343 Parameters 

344 ---------- 

345 taskDef : `TaskDef` 

346 Data structure that identifies the task class and its config. 

347 parent : `_PipelineScaffolding` 

348 The parent data structure that will hold the instance being 

349 constructed. 

350 datasetTypes : `TaskDatasetTypes` 

351 Data structure that categorizes the dataset types used by this task. 

352 """ 

353 

354 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

355 universe = parent.dimensions.universe 

356 self.taskDef = taskDef 

357 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

358 assert self.dimensions.issubset(parent.dimensions) 

359 # Initialize _DatasetDicts as subsets of the one or two 

360 # corresponding dicts in the parent _PipelineScaffolding. 

361 self.initInputs = _DatasetDict.fromSubset( 

362 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

363 ) 

364 self.initOutputs = _DatasetDict.fromSubset( 

365 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

366 ) 

367 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

368 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

369 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

370 self.dataIds: Set[DataCoordinate] = set() 

371 self.quanta = {} 

372 

373 def __repr__(self) -> str: 

374 # Default dataclass-injected __repr__ gets caught in an infinite loop 

375 # because of back-references. 

376 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

377 

378 taskDef: TaskDef 

379 """Data structure that identifies the task class and its config 

380 (`TaskDef`). 

381 """ 

382 

383 dimensions: DimensionGraph 

384 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

385 """ 

386 

387 initInputs: _DatasetDict 

388 """Dictionary containing information about datasets used to construct this 

389 task (`_DatasetDict`). 

390 """ 

391 

392 initOutputs: _DatasetDict 

393 """Dictionary containing information about datasets produced as a 

394 side-effect of constructing this task (`_DatasetDict`). 

395 """ 

396 

397 inputs: _DatasetDict 

398 """Dictionary containing information about datasets used as regular, 

399 graph-constraining inputs to this task (`_DatasetDict`). 

400 """ 

401 

402 outputs: _DatasetDict 

403 """Dictionary containing information about datasets produced by this task 

404 (`_DatasetDict`). 

405 """ 

406 

407 prerequisites: _DatasetDict 

408 """Dictionary containing information about input datasets that must be 

409 present in the repository before any Pipeline containing this task is run 

410 (`_DatasetDict`). 

411 """ 

412 

413 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

414 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

415 this task with that data ID. 

416 """ 

417 

418 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]: 

419 """Create a `set` of `Quantum` from the information in ``self``. 

420 

421 Returns 

422 ------- 

423 nodes : `set` of `Quantum`

424 The `Quantum` elements corresponding to this task. 

425 """ 

426 if unresolvedRefs is None: 

427 unresolvedRefs = set() 

428 outputs = set() 

429 for q in self.quanta.values(): 

430 try: 

431 tmpQuanta = q.makeQuantum() 

432 outputs.add(tmpQuanta) 

433 except (NoWorkFound, FileNotFoundError) as exc: 

434 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

435 if unresolvedRefs.intersection(refs): 

436 # This means it is a node that is known to be pruned

437 # later and should be left in even though some follow-up

438 # queries fail. This allows the pruning to start from this

439 # quantum with known issues, and prune other nodes it

440 # touches.

441 inputs = q.inputs.unpackMultiRefs() 

442 inputs.update(q.prerequisites.unpackMultiRefs()) 

443 tmpQuantum = Quantum( 

444 taskName=q.task.taskDef.taskName, 

445 taskClass=q.task.taskDef.taskClass, 

446 dataId=q.dataId, 

447 initInputs=q.task.initInputs.unpackSingleRefs(), 

448 inputs=inputs, 

449 outputs=q.outputs.unpackMultiRefs(), 

450 ) 

451 outputs.add(tmpQuantum) 

452 else: 

453 raise exc 

454 return outputs 

455 

456 

457@dataclass 

458class _PipelineScaffolding: 

459 """A helper data structure that organizes the information involved in 

460 constructing a `QuantumGraph` for a `Pipeline`. 

461 

462 Parameters 

463 ---------- 

464 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

465 Sequence of tasks from which a graph is to be constructed. Must 

466 have nested task classes already imported. 

467 universe : `DimensionUniverse` 

468 Universe of all possible dimensions. 

469 

470 Notes 

471 ----- 

472 The scaffolding data structure contains nested data structures for both 

473 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

474 data structures are shared between the pipeline-level structure (which 

475 aggregates all datasets and categorizes them from the perspective of the 

476 complete pipeline) and the individual tasks that use them as inputs and 

477 outputs. 

478 

479 `QuantumGraph` construction proceeds in four steps, with each corresponding 

480 to a different `_PipelineScaffolding` method: 

481 

482 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

483 the DatasetTypes used by the pipeline (delegating to 

484 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

485 nested `_TaskScaffolding` and `_DatasetDict` objects. 

486 

487 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

488 returns related tuples of all dimensions used to identify any regular 

489 input, output, and intermediate datasets (not prerequisites). We then 

490 iterate over these tuples of related dimensions, identifying the subsets 

491 that correspond to distinct data IDs for each task and dataset type, 

492 and then create `_QuantumScaffolding` objects. 

493 

494 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

495 dataset data IDs previously identified, transforming unresolved 

496 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

497 up prerequisite datasets for all quanta. 

498 

499 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

500 per-task `_QuantumScaffolding` objects. 

501 """ 

502 

503 def __init__(self, pipeline: Union[Pipeline, Iterable[TaskDef]], *, registry: Registry): 

504 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

505 self.tasks = [] 

506 # Aggregate and categorize the DatasetTypes in the Pipeline. 

507 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

508 # Construct dictionaries that map those DatasetTypes to structures 

509 # that will (later) hold additional information about them.

510 for attr in ( 

511 "initInputs", 

512 "initIntermediates", 

513 "initOutputs", 

514 "inputs", 

515 "intermediates", 

516 "outputs", 

517 "prerequisites", 

518 ): 

519 setattr( 

520 self, 

521 attr, 

522 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

523 ) 

524 # Aggregate all dimensions for all non-init, non-prerequisite 

525 # DatasetTypes. These are the ones we'll include in the big join 

526 # query. 

527 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

528 # Construct scaffolding nodes for each Task, and add backreferences 

529 # to the Task from each DatasetScaffolding node. 

530 # Note that there's only one scaffolding node for each DatasetType, 

531 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

532 # reference it. 

533 if isinstance(pipeline, Pipeline): 

534 pipeline = pipeline.toExpandedPipeline() 

535 self.tasks = [ 

536 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

537 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

538 ] 

539 

540 def __repr__(self) -> str: 

541 # Default dataclass-injected __repr__ gets caught in an infinite loop 

542 # because of back-references. 

543 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

544 

545 tasks: List[_TaskScaffolding] 

546 """Scaffolding data structures for each task in the pipeline 

547 (`list` of `_TaskScaffolding`). 

548 """ 

549 

550 initInputs: _DatasetDict 

551 """Datasets consumed but not produced when constructing the tasks in this 

552 pipeline (`_DatasetDict`). 

553 """ 

554 

555 initIntermediates: _DatasetDict 

556 """Datasets that are both consumed and produced when constructing the tasks 

557 in this pipeline (`_DatasetDict`). 

558 """ 

559 

560 initOutputs: _DatasetDict 

561 """Datasets produced but not consumed when constructing the tasks in this 

562 pipeline (`_DatasetDict`). 

563 """ 

564 

565 inputs: _DatasetDict 

566 """Datasets that are consumed but not produced when running this pipeline 

567 (`_DatasetDict`). 

568 """ 

569 

570 intermediates: _DatasetDict 

571 """Datasets that are both produced and consumed when running this pipeline 

572 (`_DatasetDict`). 

573 """ 

574 

575 outputs: _DatasetDict 

576 """Datasets produced but not consumed when when running this pipeline 

577 (`_DatasetDict`). 

578 """ 

579 

580 prerequisites: _DatasetDict 

581 """Datasets that are consumed when running this pipeline and looked up 

582 per-Quantum when generating the graph (`_DatasetDict`). 

583 """ 

584 

585 dimensions: DimensionGraph 

586 """All dimensions used by any regular input, intermediate, or output 

587 (not prerequisite) dataset; the set of dimensions used in the "Big Join

588 Query" (`DimensionGraph`). 

589 

590 This is required to be a superset of all task quantum dimensions. 

591 """ 

592 

593 @contextmanager 

594 def connectDataIds( 

595 self, 

596 registry: Registry, 

597 collections: Any, 

598 userQuery: Optional[str], 

599 externalDataId: DataCoordinate, 

600 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

601 ) -> Iterator[DataCoordinateQueryResults]: 

602 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

603 

604 This method populates `_TaskScaffolding.quanta` and the nested data ID

605 dictionaries in each `_DatasetDict` (except for those in `prerequisites`).

606 

607 Parameters 

608 ---------- 

609 registry : `lsst.daf.butler.Registry` 

610 Registry for the data repository; used for all data ID queries. 

611 collections 

612 Expressions representing the collections to search for input 

613 datasets. May be any of the types accepted by 

614 `lsst.daf.butler.CollectionSearch.fromExpression`. 

615 userQuery : `str` or `None` 

616 User-provided expression to limit the data IDs processed. 

617 externalDataId : `DataCoordinate` 

618 Externally-provided data ID that should be used to restrict the 

619 results, just as if these constraints had been included via ``AND`` 

620 in ``userQuery``. This includes (at least) any instrument named 

621 in the pipeline definition. 

622 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

623 The query constraint variant that should be used to constrain the

624 query based on dataset existence, defaults to

625 `DatasetQueryConstraintVariant.ALL`. 

626 

627 Returns 

628 ------- 

629 commonDataIds : \ 

630 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

631 An interface to a database temporary table containing all data IDs 

632 that will appear in this `QuantumGraph`. Returned inside a 

633 context manager, which will drop the temporary table at the end of 

634 the `with` block in which this method is called. 

635 """ 

636 _LOG.debug("Building query for data IDs.") 

637 # Initialization datasets always have empty data IDs. 

638 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

639 for datasetType, refs in itertools.chain( 

640 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

641 ): 

642 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

643 # Run one big query for the data IDs for task dimensions and regular 

644 # inputs and outputs. We limit the query to only dimensions that are 

645 # associated with the input dataset types, but don't (yet) try to 

646 # obtain the dataset_ids for those inputs. 

647 _LOG.debug("Submitting data ID query and materializing results.") 

648 queryArgs: Dict[str, Any] = { 

649 "dimensions": self.dimensions, 

650 "where": userQuery, 

651 "dataId": externalDataId, 

652 } 

653 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

654 _LOG.debug("Constraining graph query using all datasets in pipeline.") 

655 queryArgs["datasets"] = list(self.inputs) 

656 queryArgs["collections"] = collections 

657 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

658 _LOG.debug("Not using dataset existence to constrain query.") 

659 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

660 constraint = set(datasetQueryConstraint) 

661 inputs = {k.name: k for k in self.inputs.keys()} 

662 if remainder := constraint.difference(inputs.keys()): 

663 raise ValueError( 

664 f"{remainder} dataset type(s) specified as a graph constraint, but" 

665 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

666 ) 

667 _LOG.debug(f"Constraining graph query using {constraint}") 

668 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

669 queryArgs["collections"] = collections 

670 else: 

671 raise ValueError( 

672 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

673 ) 

674 

675 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

676 _LOG.debug("Expanding data IDs.") 

677 commonDataIds = commonDataIds.expanded() 

678 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

679 # Iterate over query results, populating data IDs for datasets and 

680 # quanta and then connecting them to each other. 

681 n = -1 

682 for n, commonDataId in enumerate(commonDataIds): 

683 # Create DatasetRefs for all DatasetTypes from this result row, 

684 # noting that we might have created some already. 

685 # We remember both those that already existed and those that we 

686 # create now. 

687 refsForRow = {} 

688 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

689 for datasetType, refs in itertools.chain( 

690 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

691 ): 

692 datasetDataId: Optional[DataCoordinate] 

693 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

694 datasetDataId = commonDataId.subset(datasetType.dimensions) 

695 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

696 ref = refs.get(datasetDataId) 

697 if ref is None: 

698 ref = DatasetRef(datasetType, datasetDataId) 

699 refs[datasetDataId] = ref 

700 refsForRow[datasetType.name] = ref 

701 # Create _QuantumScaffolding objects for all tasks from this 

702 # result row, noting that we might have created some already. 

703 for task in self.tasks: 

704 quantumDataId = commonDataId.subset(task.dimensions) 

705 quantum = task.quanta.get(quantumDataId) 

706 if quantum is None: 

707 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

708 task.quanta[quantumDataId] = quantum 

709 # Whether this is a new quantum or an existing one, we can 

710 # now associate the DatasetRefs for this row with it. The 

711 # fact that a Quantum data ID and a dataset data ID both 

712 # came from the same result row is what tells us they 

713 # should be associated. 

714 # Many of these associations will be duplicates (because

715 # another query row that differed from this one only in

716 # irrelevant dimensions already added them); the data-ID-keyed

717 # dictionaries let us skip them.

718 for datasetType in task.inputs: 

719 ref = refsForRow[datasetType.name] 

720 quantum.inputs[datasetType.name][ref.dataId] = ref 

721 for datasetType in task.outputs: 

722 ref = refsForRow[datasetType.name] 

723 quantum.outputs[datasetType.name][ref.dataId] = ref 

724 if n < 0: 

725 emptiness_explained = False 

726 for message in commonDataIds.explain_no_results(): 

727 _LOG.warning(message) 

728 emptiness_explained = True 

729 if not emptiness_explained: 

730 _LOG.warning( 

731 "To reproduce this query for debugging purposes, run " 

732 "Registry.queryDataIds with these arguments:" 

733 ) 

734 # We could just repr() the queryArgs dict to get something 

735 # the user could make sense of, but it's friendlier to 

736 # put these args in an easier-to-construct equivalent form 

737 # so they can read it more easily and copy and paste into 

738 # a Python terminal. 

739 _LOG.warning(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

740 _LOG.warning(" dataId=%s,", queryArgs["dataId"].byName()) 

741 if queryArgs["where"]: 

742 _LOG.warning(" where=%s,", repr(queryArgs["where"])) 

743 if "datasets" in queryArgs: 

744 _LOG.warning(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

745 if "collections" in queryArgs: 

746 _LOG.warning(" collections=%s,", list(queryArgs["collections"])) 

747 _LOG.debug("Finished processing %d rows from data ID query.", n) 

748 yield commonDataIds 

749 

750 def resolveDatasetRefs( 

751 self, 

752 registry: Registry, 

753 collections: Any, 

754 run: Optional[str], 

755 commonDataIds: DataCoordinateQueryResults, 

756 *, 

757 skipExistingIn: Any = None, 

758 clobberOutputs: bool = True, 

759 constrainedByAllDatasets: bool = True, 

760 ) -> None: 

761 """Perform follow up queries for each dataset data ID produced in 

762 `fillDataIds`. 

763 

764 This method resolves the `DatasetRef` entries in each `_DatasetDict`

765 (except for those in `prerequisites`).

766 

767 Parameters 

768 ---------- 

769 registry : `lsst.daf.butler.Registry` 

770 Registry for the data repository; used for all data ID queries. 

771 collections 

772 Expressions representing the collections to search for input 

773 datasets. May be any of the types accepted by 

774 `lsst.daf.butler.CollectionSearch.fromExpression`. 

775 run : `str`, optional 

776 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

777 output datasets, if it already exists. 

778 commonDataIds : \ 

779 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

780 Result of a previous call to `connectDataIds`. 

781 skipExistingIn 

782 Expressions representing the collections to search for existing 

783 output datasets that should be skipped. May be any of the types 

784 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

785 `None` or empty string/sequence disables skipping. 

786 clobberOutputs : `bool`, optional 

787 If `True` (default), allow quanta to be created even if outputs exist;

788 this requires the same behavior to be enabled when

789 executing. If ``skipExistingIn`` is not `None`, completed quanta 

790 (those with metadata, or all outputs if there is no metadata 

791 dataset configured) will be skipped rather than clobbered. 

792 constrainedByAllDatasets : `bool`, optional 

793 Indicates if the commonDataIds were generated with a constraint on 

794 all dataset types. 

795 

796 Raises 

797 ------ 

798 OutputExistsError 

799 Raised if an output dataset already exists in the output run 

800 and ``skipExistingIn`` does not include the output run, or if only

801 some outputs are present and ``clobberOutputs`` is `False`. 

802 """ 

803 skipCollections: Optional[CollectionSearch] = None 

804 skipExistingInRun = False 

805 if skipExistingIn: 

806 skipCollections = CollectionSearch.fromExpression(skipExistingIn) 

807 if run: 

808 # as optimization check in the explicit list of names first 

809 skipExistingInRun = run in skipCollections.explicitNames() 

810 if not skipExistingInRun: 

811 # need to flatten it and check again 

812 skipExistingInRun = run in registry.queryCollections( 

813 skipExistingIn, 

814 collectionTypes=CollectionType.RUN, 

815 ) 

816 

817 # Look up [init] intermediate and output datasets in the output 

818 # collection, if there is an output collection. 

819 if run is not None or skipCollections is not None: 

820 for datasetType, refs in itertools.chain( 

821 self.initIntermediates.items(), 

822 self.initOutputs.items(), 

823 self.intermediates.items(), 

824 self.outputs.items(), 

825 ): 

826 _LOG.debug( 

827 "Resolving %d datasets for intermediate and/or output dataset %s.", 

828 len(refs), 

829 datasetType.name, 

830 ) 

831 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

832 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

833 

834 # look at RUN collection first 

835 if run is not None: 

836 resolvedRefQueryResults = subset.findDatasets( 

837 datasetType, collections=run, findFirst=True 

838 ) 

839 for resolvedRef in resolvedRefQueryResults: 

840 # TODO: we could easily support per-DatasetType 

841 # skipExisting and I could imagine that being useful - 

842 # it's probably required in order to support writing 

843 # initOutputs before QuantumGraph generation. 

844 assert resolvedRef.dataId in refs 

845 if not (skipExistingInRun or isInit or clobberOutputs): 

846 raise OutputExistsError( 

847 f"Output dataset {datasetType.name} already exists in " 

848 f"output RUN collection '{run}' with data ID" 

849 f" {resolvedRef.dataId}." 

850 ) 

851 

852 # Also check skipExistingIn; the case where the RUN collection

853 # is in it is handled above.

854 if skipCollections is not None: 

855 resolvedRefQueryResults = subset.findDatasets( 

856 datasetType, collections=skipCollections, findFirst=True 

857 ) 

858 for resolvedRef in resolvedRefQueryResults: 

859 assert resolvedRef.dataId in refs 

860 refs[resolvedRef.dataId] = resolvedRef 

861 

862 # Look up input and initInput datasets in the input collection(s). 

863 # Container to accumulate unfound refs, used if the common data IDs

864 # were not constrained on dataset type existence.

865 self.unfoundRefs = set() 

866 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

867 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

868 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets( 

869 datasetType, collections=collections, findFirst=True 

870 ) 

871 dataIdsNotFoundYet = set(refs.keys()) 

872 for resolvedRef in resolvedRefQueryResults: 

873 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

874 refs[resolvedRef.dataId] = resolvedRef 

875 if dataIdsNotFoundYet: 

876 if constrainedByAllDatasets: 

877 raise RuntimeError( 

878 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

879 f"'{datasetType.name}' was/were present in a previous " 

880 f"query, but could not be found now." 

881 f"This is either a logic bug in QuantumGraph generation " 

882 f"or the input collections have been modified since " 

883 f"QuantumGraph generation began." 

884 ) 

885 else: 

886 # If the common data IDs were not constrained using all the

887 # input dataset types, it is possible that some data IDs

888 # found don't correspond to existing datasets and will

889 # remain unresolved. Mark these for later pruning from

890 # the quantum graph.

891 for k in dataIdsNotFoundYet: 

892 self.unfoundRefs.add(refs[k]) 

893 

894 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

895 # replacing the unresolved refs there, and then look up prerequisites. 

896 for task in self.tasks: 

897 _LOG.debug( 

898 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

899 len(task.quanta), 

900 task.taskDef.label, 

901 ) 

902 # The way iterConnections is designed makes it impossible to 

903 # annotate precisely enough to satisfy MyPy here. 

904 lookupFunctions = { 

905 c.name: c.lookupFunction # type: ignore 

906 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

907 if c.lookupFunction is not None # type: ignore 

908 } 

909 dataIdsFailed = [] 

910 dataIdsSucceeded = [] 

911 for quantum in task.quanta.values(): 

912 # Process output datasets only if skipExistingIn is not None

913 # or there is a run to look for outputs in and clobberOutputs 

914 # is True. Note that if skipExistingIn is None, any output 

915 # datasets that already exist would have already caused an 

916 # exception to be raised. We never update the DatasetRefs in 

917 # the quantum because those should never be resolved. 

918 if skipCollections is not None or (run is not None and clobberOutputs): 

919 resolvedRefs = [] 

920 unresolvedRefs = [] 

921 haveMetadata = False 

922 for datasetType, originalRefs in quantum.outputs.items(): 

923 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

924 if ref.id is not None: 

925 resolvedRefs.append(ref) 

926 if datasetType.name == task.taskDef.metadataDatasetName: 

927 haveMetadata = True 

928 else: 

929 unresolvedRefs.append(ref) 

930 if resolvedRefs: 

931 if haveMetadata or not unresolvedRefs: 

932 dataIdsSucceeded.append(quantum.dataId) 

933 if skipCollections is not None: 

934 continue 

935 else: 

936 dataIdsFailed.append(quantum.dataId) 

937 if not clobberOutputs: 

938 raise OutputExistsError( 

939 f"Quantum {quantum.dataId} of task with label " 

940 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

941 f"({resolvedRefs}) " 

942 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

943 "and clobbering outputs was not enabled." 

944 ) 

945 # Update the input DatasetRefs to the resolved ones we already 

946 # searched for. 

947 for datasetType, input_refs in quantum.inputs.items(): 

948 for ref in task.inputs.extract(datasetType, input_refs.keys()): 

949 input_refs[ref.dataId] = ref 

950 # Look up prerequisite datasets in the input collection(s). 

951 # These may have dimensions that extend beyond those we queried 

952 # for originally, because we want to permit those data ID 

953 # values to differ across quanta and dataset types. 

954 for datasetType in task.prerequisites: 

955 lookupFunction = lookupFunctions.get(datasetType.name) 

956 if lookupFunction is not None: 

957 # PipelineTask has provided its own function to do the 

958 # lookup. This always takes precedence. 

959 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

960 elif ( 

961 datasetType.isCalibration() 

962 and datasetType.dimensions <= quantum.dataId.graph 

963 and quantum.dataId.graph.temporal 

964 ): 

965 # This is a master calibration lookup, which we have to 

966 # handle specially because the query system can't do a 

967 # temporal join on a non-dimension-based timespan yet. 

968 timespan = quantum.dataId.timespan 

969 try: 

970 prereq_refs = [ 

971 registry.findDataset( 

972 datasetType, quantum.dataId, collections=collections, timespan=timespan 

973 ) 

974 ] 

975 except KeyError: 

976 # This dataset type is not present in the registry, 

977 # which just means there are no datasets here. 

978 prereq_refs = [] 

979 else: 

980 # Most general case. 

981 prereq_refs = list( 

982 registry.queryDatasets( 

983 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True 

984 ).expanded() 

985 ) 

986 quantum.prerequisites[datasetType].update( 

987 {ref.dataId: ref for ref in prereq_refs if ref is not None} 

988 ) 

989 # Actually remove any quanta that we decided to skip above. 

990 if dataIdsSucceeded: 

991 if skipCollections is not None: 

992 _LOG.debug( 

993 "Pruning successful %d quanta for task with label '%s' because all of their " 

994 "outputs exist or metadata was written successfully.", 

995 len(dataIdsSucceeded), 

996 task.taskDef.label, 

997 ) 

998 for dataId in dataIdsSucceeded: 

999 del task.quanta[dataId] 

1000 elif clobberOutputs: 

1001 _LOG.info( 

1002 "Found %d successful quanta for task with label '%s' " 

1003 "that will need to be clobbered during execution.", 

1004 len(dataIdsSucceeded), 

1005 task.taskDef.label, 

1006 ) 

1007 else: 

1008 raise AssertionError("OutputExistsError should have already been raised.") 

1009 if dataIdsFailed: 

1010 if clobberOutputs: 

1011 _LOG.info( 

1012 "Found %d failed/incomplete quanta for task with label '%s' " 

1013 "that will need to be clobbered during execution.", 

1014 len(dataIdsFailed), 

1015 task.taskDef.label, 

1016 ) 

1017 else: 

1018 raise AssertionError("OutputExistsError should have already been raised.") 

1019 

1020 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None) -> QuantumGraph: 

1021 """Create a `QuantumGraph` from the quanta already present in 

1022 the scaffolding data structure. 

1023 

1024 Parameters 

1025 ----------

1026 metadata : Optional Mapping of `str` to primitives 

1027 This is an optional parameter of extra data to carry with the 

1028 graph. Entries in this mapping should be able to be serialized in 

1029 JSON. 

1030 

1031 Returns 

1032 ------- 

1033 graph : `QuantumGraph` 

1034 The full `QuantumGraph`. 

1035 """ 

1036 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

1037 for task in self.tasks: 

1038 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs) 

1039 graphInput[task.taskDef] = qset 

1040 

1041 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs) 

1042 return graph 

1043 

1044 
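# ---------------------------------------------------------------------------
# Illustrative sketch (not part of graphBuilder.py): the four construction
# steps described in the _PipelineScaffolding notes, written out explicitly.
# It mirrors what GraphBuilder.makeGraph below does; the collection names,
# run name, and user query are hypothetical placeholders.


def _example_scaffolding_flow(registry: Registry, pipeline: Pipeline) -> QuantumGraph:
    collections = ["HSC/defaults"]  # hypothetical input collections
    run = "u/someone/example-run"  # hypothetical output RUN collection
    userQuery = "instrument = 'HSC' AND visit = 903334"
    # Step 1: categorize the pipeline's dataset types and build the nested
    # task/dataset scaffolding.
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    # Step 2: run the "Big Join Query"; the materialized results are only
    # valid inside the context manager.
    externalDataId = DataCoordinate.makeEmpty(registry.dimensions)
    with scaffolding.connectDataIds(registry, collections, userQuery, externalDataId) as commonDataIds:
        # Step 3: resolve DatasetRefs and look up prerequisites while the
        # temporary data ID table still exists.
        scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
    # Step 4: assemble the QuantumGraph from the per-task quanta.
    return scaffolding.makeQuantumGraph(metadata={"comment": "example"})


# ---------------------------------------------------------------------------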

1045# ------------------------ 

1046# Exported definitions -- 

1047# ------------------------ 

1048 

1049 

1050class GraphBuilderError(Exception): 

1051 """Base class for exceptions generated by graph builder.""" 

1052 

1053 pass 

1054 

1055 

1056class OutputExistsError(GraphBuilderError): 

1057 """Exception generated when output datasets already exist.""" 

1058 

1059 pass 

1060 

1061 

1062class PrerequisiteMissingError(GraphBuilderError): 

1063 """Exception generated when a prerequisite dataset does not exist.""" 

1064 

1065 pass 

1066 

1067 

1068class GraphBuilder(object): 

1069 """GraphBuilder class is responsible for building task execution graph from 

1070 a Pipeline. 

1071 

1072 Parameters 

1073 ---------- 

1074 registry : `~lsst.daf.butler.Registry` 

1075 Registry instance for the data repository.

1076 skipExistingIn 

1077 Expressions representing the collections to search for existing 

1078 output datasets that should be skipped. May be any of the types 

1079 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

1080 clobberOutputs : `bool`, optional 

1081 If `True` (default), allow quanta to be created even if partial outputs

1082 exist; this requires the same behavior to be enabled when

1083 executing. 

1084 """ 

1085 

1086 def __init__(self, registry: Registry, skipExistingIn: Any = None, clobberOutputs: bool = True): 

1087 self.registry = registry 

1088 self.dimensions = registry.dimensions 

1089 self.skipExistingIn = skipExistingIn 

1090 self.clobberOutputs = clobberOutputs 

1091 

1092 def makeGraph( 

1093 self, 

1094 pipeline: Union[Pipeline, Iterable[TaskDef]], 

1095 collections: Any, 

1096 run: Optional[str], 

1097 userQuery: Optional[str], 

1098 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1099 metadata: Optional[Mapping[str, Any]] = None, 

1100 ) -> QuantumGraph: 

1101 """Create execution graph for a pipeline. 

1102 

1103 Parameters 

1104 ---------- 

1105 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1106 Pipeline definition, task names/classes and their configs. 

1107 collections 

1108 Expressions representing the collections to search for input 

1109 datasets. May be any of the types accepted by 

1110 `lsst.daf.butler.CollectionSearch.fromExpression`. 

1111 run : `str`, optional 

1112 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1113 output datasets, if it already exists. 

1114 userQuery : `str` 

1115 String defining a user selection expression for the registry; should be

1116 empty or `None` if there are no restrictions on data selection.

1117 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1118 The query constraint variant that should be used to constrain the

1119 query based on dataset existence, defaults to

1120 `DatasetQueryConstraintVariant.ALL`. 

1121 metadata : Optional Mapping of `str` to primitives 

1122 This is an optional parameter of extra data to carry with the 

1123 graph. Entries in this mapping should be able to be serialized in 

1124 JSON. 

1125 

1126 Returns 

1127 ------- 

1128 graph : `QuantumGraph` 

1129 

1130 Raises 

1131 ------ 

1132 UserExpressionError 

1133 Raised when user expression cannot be parsed. 

1134 OutputExistsError 

1135 Raised when output datasets already exist. 

1136 Exception 

1137 Other exception types may be raised by underlying registry

1138 classes. 

1139 """ 

1140 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1141 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1142 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1143 instrument_class: Optional[Any] = None 

1144 if isinstance(pipeline, Pipeline): 

1145 instrument_class_name = pipeline.getInstrument() 

1146 if instrument_class_name is not None: 

1147 instrument_class = doImportType(instrument_class_name) 

1148 pipeline = list(pipeline.toExpandedPipeline()) 

1149 if instrument_class is not None: 

1150 dataId = DataCoordinate.standardize( 

1151 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1152 ) 

1153 else: 

1154 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1155 with scaffolding.connectDataIds( 

1156 self.registry, collections, userQuery, dataId, datasetQueryConstraint 

1157 ) as commonDataIds: 

1158 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1159 scaffolding.resolveDatasetRefs( 

1160 self.registry, 

1161 collections, 

1162 run, 

1163 commonDataIds, 

1164 skipExistingIn=self.skipExistingIn, 

1165 clobberOutputs=self.clobberOutputs, 

1166 constrainedByAllDatasets=condition, 

1167 ) 

1168 return scaffolding.makeQuantumGraph(metadata=metadata)
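# ---------------------------------------------------------------------------
# Illustrative sketch (not part of graphBuilder.py): typical end-to-end use
# of GraphBuilder against an existing data repository.  The repository path,
# pipeline file, collection names, run name, and query are hypothetical, and
# Pipeline.fromFile / QuantumGraph.saveUri are used on the assumption that
# those usual pipe_base entry points are available in this version.


def _example_make_graph() -> None:
    from lsst.daf.butler import Butler

    butler = Butler("/repo/main")  # hypothetical repository root
    pipeline = Pipeline.fromFile("my_pipeline.yaml")  # hypothetical pipeline
    builder = GraphBuilder(
        butler.registry,
        skipExistingIn=["u/someone/previous-run"],  # skip completed quanta
        clobberOutputs=True,
    )
    qgraph = builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],
        run="u/someone/new-run",
        userQuery="instrument = 'HSC' AND visit = 903334",
    )
    qgraph.saveUri("my_graph.qgraph")


# ---------------------------------------------------------------------------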