Coverage for python/lsst/pipe/base/graphBuilder.py: 17%


311 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ['GraphBuilder'] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32from collections import ChainMap 

33from contextlib import contextmanager 

34from dataclasses import dataclass 

35from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping 

36import logging 

37 

38 

39# ----------------------------- 

40# Imports for other modules -- 

41# ----------------------------- 

42from .connections import iterConnections, AdjustQuantumHelper 

43from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline 

44from .graph import QuantumGraph 

45from lsst.daf.butler import ( 

46 CollectionSearch, 

47 CollectionType, 

48 DataCoordinate, 

49 DatasetRef, 

50 DatasetType, 

51 DimensionGraph, 

52 DimensionUniverse, 

53 NamedKeyDict, 

54 Quantum, 

55) 

56from lsst.utils import doImport 

57from ._status import NoWorkFound 

58from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

59 

60# ---------------------------------- 

61# Local non-exported definitions -- 

62# ---------------------------------- 

63 

64_LOG = logging.getLogger(__name__.partition(".")[2]) 

65 

66 

67class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

68 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

69 the known `DatasetRef` instances of that type. 

70 

71 Parameters 

72 ---------- 

73 args 

74 Positional arguments are forwarded to the `dict` constructor. 

75 universe : `DimensionUniverse` 

76 Universe of all possible dimensions. 

77 """ 

78 def __init__(self, *args, universe: DimensionUniverse): 

79 super().__init__(*args) 

80 self.universe = universe 

81 

82 @classmethod 

83 def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *, 

84 universe: DimensionUniverse) -> _DatasetDict: 

85 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

86 

87 Parameters 

88 ---------- 

89 datasetTypes : `iterable` of `DatasetType` 

90 DatasetTypes to use as keys for the dict. Values will be empty 

91 dictionaries. 

92 universe : `DimensionUniverse` 

93 Universe of all possible dimensions. 

94 

95 Returns 

96 ------- 

97 dictionary : `_DatasetDict` 

98 A new `_DatasetDict` instance. 

99 """ 

100 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

101 

102 @classmethod 

103 def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

104 ) -> _DatasetDict: 

105 """Return a new dictionary by extracting items corresponding to the 

106 given keys from one or more existing dictionaries. 

107 

108 Parameters 

109 ---------- 

110 datasetTypes : `iterable` of `DatasetType` 

111 DatasetTypes to use as keys for the dict. Values will be obtained 

112 by lookups against ``first`` and ``rest``. 

113 first : `_DatasetDict` 

114 Another dictionary from which to extract values. 

115 rest 

116 Additional dictionaries from which to extract values. 

117 

118 Returns 

119 ------- 

120 dictionary : `_DatasetDict` 

121 A new dictionary instance. 

122 """ 

123 combined = ChainMap(first, *rest) 

124 return cls({datasetType: combined[datasetType] for datasetType in datasetTypes}, 

125 universe=first.universe) 

126 

127 @property 

128 def dimensions(self) -> DimensionGraph: 

129 """The union of all dimensions used by all dataset types in this 

130 dictionary, including implied dependencies (`DimensionGraph`). 

131 """ 

132 base = self.universe.empty 

133 if len(self) == 0: 

134 return base 

135 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

136 

137 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

138 """Unpack nested single-element `DatasetRef` dicts into a new 

139 mapping with `DatasetType` keys and `DatasetRef` values. 

140 

141 This method assumes that each nest contains exactly one item, as is the 

142 case for all "init" datasets. 

143 

144 Returns 

145 ------- 

146 dictionary : `NamedKeyDict` 

147 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

148 `DatasetType` instances and string names usable as keys. 

149 """ 

150 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

151 ref, = refs.values() 

152 return ref 

153 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

154 

155 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

156 """Unpack nested multi-element `DatasetRef` dicts into a new 

157 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

158 

159 Returns 

160 ------- 

161 dictionary : `NamedKeyDict` 

162 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

163 both `DatasetType` instances and string names usable as keys. 

164 """ 

165 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

166 

167 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

168 ) -> Iterator[DatasetRef]: 

169 """Iterate over the contained `DatasetRef` instances that match the 

170 given `DatasetType` and data IDs. 

171 

172 Parameters 

173 ---------- 

174 datasetType : `DatasetType` 

175 Dataset type to match. 

176 dataIds : `Iterable` [ `DataCoordinate` ] 

177 Data IDs to match. 

178 

179 Returns 

180 ------- 

181 refs : `Iterator` [ `DatasetRef` ] 

182 DatasetRef instances for which ``ref.datasetType == datasetType`` 

183 and ``ref.dataId`` is in ``dataIds``. 

184 """ 

185 refs = self[datasetType] 

186 return (refs[dataId] for dataId in dataIds) 

187 

188 

189class _QuantumScaffolding: 

190 """Helper class aggregating information about a `Quantum`, used when 

191 constructing a `QuantumGraph`. 

192 

193 See `_PipelineScaffolding` for a top-down description of the full 

194 scaffolding data structure. 

195 

196 Parameters 

197 ---------- 

198 task : _TaskScaffolding 

199 Back-reference to the helper object for the `PipelineTask` this quantum 

200 represents an execution of. 

201 dataId : `DataCoordinate` 

202 Data ID for this quantum. 

203 """ 

204 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

205 self.task = task 

206 self.dataId = dataId 

207 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

208 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

209 self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(), 

210 universe=dataId.universe) 

211 

212 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

213 

214 def __repr__(self): 

215 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

216 

217 task: _TaskScaffolding 

218 """Back-reference to the helper object for the `PipelineTask` this quantum 

219 represents an execution of. 

220 """ 

221 

222 dataId: DataCoordinate 

223 """Data ID for this quantum. 

224 """ 

225 

226 inputs: _DatasetDict 

227 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

228 

229 This is initialized to map each `DatasetType` to an empty dictionary at 

230 construction. Those nested dictionaries are populated (with data IDs as 

231 keys) with unresolved `DatasetRef` instances in 

232 `_PipelineScaffolding.connectDataIds`. 

233 """ 

234 

235 outputs: _DatasetDict 

236 """Nested dictionary containing `DatasetRef` outputs this quantum. 

237 """ 

238 

239 prerequisites: _DatasetDict 

240 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

241 quantum. 

242 """ 

243 

244 def makeQuantum(self) -> Quantum: 

245 """Transform the scaffolding object into a true `Quantum` instance. 

246 

247 Returns 

248 ------- 

249 quantum : `Quantum` 

250 An actual `Quantum` instance. 

251 """ 

252 allInputs = self.inputs.unpackMultiRefs() 

253 allInputs.update(self.prerequisites.unpackMultiRefs()) 

254 # Give the task's Connections class an opportunity to remove some 

255 # inputs, or complain if they are unacceptable. 

256 # This will raise if one of the check conditions is not met, which is 

257 # the intended behavior. 

258 # If it raises NoWorkFound, there is a bug in the QG algorithm 

259 # or the adjustQuantum is incorrectly trying to make a prerequisite 

260 # input behave like a regular input; adjustQuantum should only raise 

261 # NoWorkFound if a regular input is missing, and it shouldn't be 

262 # possible for us to have generated ``self`` if that's true. 

263 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

264 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

265 return Quantum( 

266 taskName=self.task.taskDef.taskName, 

267 taskClass=self.task.taskDef.taskClass, 

268 dataId=self.dataId, 

269 initInputs=self.task.initInputs.unpackSingleRefs(), 

270 inputs=helper.inputs, 

271 outputs=helper.outputs, 

272 ) 

273 

274 

275@dataclass 

276class _TaskScaffolding: 

277 """Helper class aggregating information about a `PipelineTask`, used when 

278 constructing a `QuantumGraph`. 

279 

280 See `_PipelineScaffolding` for a top-down description of the full 

281 scaffolding data structure. 

282 

283 Parameters 

284 ---------- 

285 taskDef : `TaskDef` 

286 Data structure that identifies the task class and its config. 

287 parent : `_PipelineScaffolding` 

288 The parent data structure that will hold the instance being 

289 constructed. 

290 datasetTypes : `TaskDatasetTypes` 

291 Data structure that categorizes the dataset types used by this task. 

292 """ 

293 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

294 universe = parent.dimensions.universe 

295 self.taskDef = taskDef 

296 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

297 assert self.dimensions.issubset(parent.dimensions) 

298 # Initialize _DatasetDicts as subsets of the one or two 

299 # corresponding dicts in the parent _PipelineScaffolding. 

300 self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs, 

301 parent.initIntermediates) 

302 self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates, 

303 parent.initOutputs) 

304 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

305 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

306 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

307 self.dataIds = set() 

308 self.quanta = {} 

309 

310 def __repr__(self): 

311 # Default dataclass-injected __repr__ gets caught in an infinite loop 

312 # because of back-references. 

313 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

314 

315 taskDef: TaskDef 

316 """Data structure that identifies the task class and its config 

317 (`TaskDef`). 

318 """ 

319 

320 dimensions: DimensionGraph 

321 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

322 """ 

323 

324 initInputs: _DatasetDict 

325 """Dictionary containing information about datasets used to construct this 

326 task (`_DatasetDict`). 

327 """ 

328 

329 initOutputs: _DatasetDict 

330 """Dictionary containing information about datasets produced as a 

331 side-effect of constructing this task (`_DatasetDict`). 

332 """ 

333 

334 inputs: _DatasetDict 

335 """Dictionary containing information about datasets used as regular, 

336 graph-constraining inputs to this task (`_DatasetDict`). 

337 """ 

338 

339 outputs: _DatasetDict 

340 """Dictionary containing information about datasets produced by this task 

341 (`_DatasetDict`). 

342 """ 

343 

344 prerequisites: _DatasetDict 

345 """Dictionary containing information about input datasets that must be 

346 present in the repository before any Pipeline containing this task is run 

347 (`_DatasetDict`). 

348 """ 

349 

350 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

351 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

352 this task with that data ID. 

353 """ 

354 

355 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]: 

356 """Create a `set` of `Quantum` from the information in ``self``. 

357 

358 Returns 

359 ------- 

360 nodes : `set` of `Quantum` 

361 The `Quantum` elements corresponding to this task. 

362 """ 

363 if unresolvedRefs is None: 

364 unresolvedRefs = set() 

365 outputs = set() 

366 for q in self.quanta.values(): 

367 try: 

368 tmpQuanta = q.makeQuantum() 

369 outputs.add(tmpQuanta) 

370 except (NoWorkFound, FileNotFoundError) as exc: 

371 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

372 if unresolvedRefs.intersection(refs): 

373 # This means it is a node that is known to be pruned 

374 # later and should be left in even though some follow up 

375 # queries fail. This allows the pruning to start from this 

376 # quantum with known issues, and prune other nodes it 

377 # touches 

378 inputs = q.inputs.unpackMultiRefs() 

379 inputs.update(q.prerequisites.unpackMultiRefs()) 

380 tmpQuantum = Quantum(taskName=q.task.taskDef.taskName, 

381 taskClass=q.task.taskDef.taskClass, 

382 dataId=q.dataId, 

383 initInputs=q.task.initInputs.unpackSingleRefs(), 

384 inputs=inputs, 

385 outputs=q.outputs.unpackMultiRefs(),) 

386 outputs.add(tmpQuantum) 

387 else: 

388 raise exc 

389 return outputs 

390 

391 

392@dataclass 

393class _PipelineScaffolding: 

394 """A helper data structure that organizes the information involved in 

395 constructing a `QuantumGraph` for a `Pipeline`. 

396 

397 Parameters 

398 ---------- 

399 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

400 Sequence of tasks from which a graph is to be constructed. Must 

401 have nested task classes already imported. 

402 universe : `DimensionUniverse` 

403 Universe of all possible dimensions. 

404 

405 Notes 

406 ----- 

407 The scaffolding data structure contains nested data structures for both 

408 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

409 data structures are shared between the pipeline-level structure (which 

410 aggregates all datasets and categorizes them from the perspective of the 

411 complete pipeline) and the individual tasks that use them as inputs and 

412 outputs. 

413 

414 `QuantumGraph` construction proceeds in four steps, with each corresponding 

415 to a different `_PipelineScaffolding` method: 

416 

417 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

418 the DatasetTypes used by the pipeline (delegating to 

419 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

420 nested `_TaskScaffolding` and `_DatasetDict` objects. 

421 

422 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

423 returns related tuples of all dimensions used to identify any regular 

424 input, output, and intermediate datasets (not prerequisites). We then 

425 iterate over these tuples of related dimensions, identifying the subsets 

426 that correspond to distinct data IDs for each task and dataset type, 

427 and then create `_QuantumScaffolding` objects. 

428 

429 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

430 dataset data IDs previously identified, transforming unresolved 

431 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

432 up prerequisite datasets for all quanta. 

433 

434 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

435 per-task `_QuantumScaffolding` objects. 

436 """ 

437 def __init__(self, pipeline, *, registry): 

438 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

439 self.tasks = [] 

440 # Aggregate and categorize the DatasetTypes in the Pipeline. 

441 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

442 # Construct dictionaries that map those DatasetTypes to structures 

443 # that will (later) hold additional information about them. 

444 for attr in ("initInputs", "initIntermediates", "initOutputs", 

445 "inputs", "intermediates", "outputs", "prerequisites"): 

446 setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), 

447 universe=registry.dimensions)) 

448 # Aggregate all dimensions for all non-init, non-prerequisite 

449 # DatasetTypes. These are the ones we'll include in the big join 

450 # query. 

451 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, 

452 self.outputs.dimensions) 

453 # Construct scaffolding nodes for each Task, and add backreferences 

454 # to the Task from each DatasetScaffolding node. 

455 # Note that there's only one scaffolding node for each DatasetType, 

456 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

457 # reference it. 

458 if isinstance(pipeline, Pipeline): 

459 pipeline = pipeline.toExpandedPipeline() 

460 self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

461 for taskDef, taskDatasetTypes in zip(pipeline, 

462 datasetTypes.byTask.values())] 

463 

464 def __repr__(self): 

465 # Default dataclass-injected __repr__ gets caught in an infinite loop 

466 # because of back-references. 

467 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

468 

469 tasks: List[_TaskScaffolding] 

470 """Scaffolding data structures for each task in the pipeline 

471 (`list` of `_TaskScaffolding`). 

472 """ 

473 

474 initInputs: _DatasetDict 

475 """Datasets consumed but not produced when constructing the tasks in this 

476 pipeline (`_DatasetDict`). 

477 """ 

478 

479 initIntermediates: _DatasetDict 

480 """Datasets that are both consumed and produced when constructing the tasks 

481 in this pipeline (`_DatasetDict`). 

482 """ 

483 

484 initOutputs: _DatasetDict 

485 """Datasets produced but not consumed when constructing the tasks in this 

486 pipeline (`_DatasetDict`). 

487 """ 

488 

489 inputs: _DatasetDict 

490 """Datasets that are consumed but not produced when running this pipeline 

491 (`_DatasetDict`). 

492 """ 

493 

494 intermediates: _DatasetDict 

495 """Datasets that are both produced and consumed when running this pipeline 

496 (`_DatasetDict`). 

497 """ 

498 

499 outputs: _DatasetDict 

500 """Datasets produced but not consumed when when running this pipeline 

501 (`_DatasetDict`). 

502 """ 

503 

504 prerequisites: _DatasetDict 

505 """Datasets that are consumed when running this pipeline and looked up 

506 per-Quantum when generating the graph (`_DatasetDict`). 

507 """ 

508 

509 dimensions: DimensionGraph 

510 """All dimensions used by any regular input, intermediate, or output 

511 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

512 Query" (`DimensionGraph`). 

513 

514 This is required to be a superset of all task quantum dimensions. 

515 """ 

516 

517 @contextmanager 

518 def connectDataIds(self, registry, collections, userQuery, externalDataId, 

519 datasetQueryConstraint: DatasetQueryConstraintVariant = 

520 DatasetQueryConstraintVariant.ALL): 

521 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

522 

523 This method populates `_TaskScaffolding.dataIds` and 

524 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

525 

526 Parameters 

527 ---------- 

528 registry : `lsst.daf.butler.Registry` 

529 Registry for the data repository; used for all data ID queries. 

530 collections 

531 Expressions representing the collections to search for input 

532 datasets. May be any of the types accepted by 

533 `lsst.daf.butler.CollectionSearch.fromExpression`. 

534 userQuery : `str` or `None` 

535 User-provided expression to limit the data IDs processed. 

536 externalDataId : `DataCoordinate` 

537 Externally-provided data ID that should be used to restrict the 

538 results, just as if these constraints had been included via ``AND`` 

539 in ``userQuery``. This includes (at least) any instrument named 

540 in the pipeline definition. 

541 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

542 The query constraint variant that should be used to constrain the 

543 query based on dataset existence, defaults to 

544 `DatasetQueryConstraintVariant.ALL`. 

545 

546 Returns 

547 ------- 

548 commonDataIds : \ 

549 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

550 An interface to a database temporary table containing all data IDs 

551 that will appear in this `QuantumGraph`. Returned inside a 

552 context manager, which will drop the temporary table at the end of 

553 the `with` block in which this method is called. 

554 """ 

555 _LOG.debug("Building query for data IDs.") 

556 # Initialization datasets always have empty data IDs. 

557 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

558 for datasetType, refs in itertools.chain(self.initInputs.items(), 

559 self.initIntermediates.items(), 

560 self.initOutputs.items()): 

561 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

562 # Run one big query for the data IDs for task dimensions and regular 

563 # inputs and outputs. We limit the query to only dimensions that are 

564 # associated with the input dataset types, but don't (yet) try to 

565 # obtain the dataset_ids for those inputs. 

566 _LOG.debug("Submitting data ID query and materializing results.") 

567 queryArgs = {'dimensions': self.dimensions, 'where': userQuery, 'dataId': externalDataId} 

568 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

569 _LOG.debug("Constraining graph query using all datasets in pipeline.") 

570 queryArgs['datasets'] = list(self.inputs) 

571 queryArgs['collections'] = collections 

572 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

573 _LOG.debug("Not using dataset existance to constrain query.") 

574 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

575 constraint = set(datasetQueryConstraint) 

576 inputs = {k.name: k for k in self.inputs.keys()} 

577 if (remainder := constraint.difference(inputs.keys())): 

578 raise ValueError(f"{remainder} dataset type(s) specified as a graph constraint, but" 

579 f" do not appear as an input to the specified pipeline: {inputs.keys()}") 

580 _LOG.debug(f"Constraining graph query using {constraint}") 

581 queryArgs['datasets'] = [typ for name, typ in inputs.items() if name in constraint] 

582 queryArgs['collections'] = collections 

583 else: 

584 raise ValueError(f"Unable to handle type {datasetQueryConstraint} given as " 

585 "datasetQueryConstraint.") 

586 

587 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

588 _LOG.debug("Expanding data IDs.") 

589 commonDataIds = commonDataIds.expanded() 

590 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

591 # Iterate over query results, populating data IDs for datasets and 

592 # quanta and then connecting them to each other. 

593 n = 0 

594 for n, commonDataId in enumerate(commonDataIds): 

595 # Create DatasetRefs for all DatasetTypes from this result row, 

596 # noting that we might have created some already. 

597 # We remember both those that already existed and those that we 

598 # create now. 

599 refsForRow = {} 

600 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

601 for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(), 

602 self.outputs.items()): 

603 if not (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)): 

604 datasetDataId = commonDataId.subset(datasetType.dimensions) 

605 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

606 ref = refs.get(datasetDataId) 

607 if ref is None: 

608 ref = DatasetRef(datasetType, datasetDataId) 

609 refs[datasetDataId] = ref 

610 refsForRow[datasetType.name] = ref 

611 # Create _QuantumScaffolding objects for all tasks from this 

612 # result row, noting that we might have created some already. 

613 for task in self.tasks: 

614 quantumDataId = commonDataId.subset(task.dimensions) 

615 quantum = task.quanta.get(quantumDataId) 

616 if quantum is None: 

617 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

618 task.quanta[quantumDataId] = quantum 

619 # Whether this is a new quantum or an existing one, we can 

620 # now associate the DatasetRefs for this row with it. The 

621 # fact that a Quantum data ID and a dataset data ID both 

622 # came from the same result row is what tells us they 

623 # should be associated. 

624 # Many of these associations will be duplicates (because 

625 # another query row that differed from this one only in 

626 # irrelevant dimensions already added them), and we use 

627 # sets to skip. 

628 for datasetType in task.inputs: 

629 ref = refsForRow[datasetType.name] 

630 quantum.inputs[datasetType.name][ref.dataId] = ref 

631 for datasetType in task.outputs: 

632 ref = refsForRow[datasetType.name] 

633 quantum.outputs[datasetType.name][ref.dataId] = ref 

634 if n == 0: 

635 for message in commonDataIds.explain_no_results(): 

636 _LOG.warning(message) 

637 _LOG.debug("Finished processing %d rows from data ID query.", n) 

638 yield commonDataIds 
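    # Hedged illustration of the ``datasetQueryConstraint`` argument handled
    # above; the scaffolding/registry objects are hypothetical:
    #
    #     # Constrain the big join query on existence of every pipeline input:
    #     with scaffolding.connectDataIds(registry, collections, userQuery, dataId,
    #                                     DatasetQueryConstraintVariant.ALL) as ids:
    #         ...
    #     # Or query purely on dimensions, ignoring dataset existence:
    #     with scaffolding.connectDataIds(registry, collections, userQuery, dataId,
    #                                     DatasetQueryConstraintVariant.OFF) as ids:
    #         ...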

639 

640 def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExistingIn=None, 

641 clobberOutputs=True, constrainedByAllDatasets: bool = True): 

642 """Perform follow up queries for each dataset data ID produced in 

643 `fillDataIds`. 

644 

645 This method populates `_DatasetScaffolding.refs` (except for those in 

646 `prerequisites`). 

647 

648 Parameters 

649 ---------- 

650 registry : `lsst.daf.butler.Registry` 

651 Registry for the data repository; used for all data ID queries. 

652 collections 

653 Expressions representing the collections to search for input 

654 datasets. May be any of the types accepted by 

655 `lsst.daf.butler.CollectionSearch.fromExpression`. 

656 run : `str`, optional 

657 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

658 output datasets, if it already exists. 

659 commonDataIds : \ 

660 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

661 Result of a previous call to `connectDataIds`. 

662 skipExistingIn 

663 Expressions representing the collections to search for existing 

664 output datasets that should be skipped. May be any of the types 

665 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

666 `None` or empty string/sequence disables skipping. 

667 clobberOutputs : `bool`, optional 

668 If `True` (default), allow quanta to be created even if outputs exist; 

669 this requires the same behavior to be enabled when 

670 executing. If ``skipExistingIn`` is not `None`, completed quanta 

671 (those with metadata, or all outputs if there is no metadata 

672 dataset configured) will be skipped rather than clobbered. 

673 constrainedByAllDatasets : `bool`, optional 

674 Indicates if the commonDataIds were generated with a constraint on 

675 all dataset types. 

676 

677 Raises 

678 ------ 

679 OutputExistsError 

680 Raised if an output dataset already exists in the output run 

681 and ``skipExistingIn`` does not include output run, or if only 

682 some outputs are present and ``clobberOutputs`` is `False`. 

683 """ 

684 skipCollections: Optional[CollectionSearch] = None 

685 skipExistingInRun = False 

686 if skipExistingIn: 

687 skipCollections = CollectionSearch.fromExpression(skipExistingIn) 

688 if run: 

689 # As an optimization, check the explicit list of names first 

690 skipExistingInRun = run in skipCollections.explicitNames() 

691 if not skipExistingInRun: 

692 # need to flatten it and check again 

693 skipExistingInRun = run in registry.queryCollections( 

694 skipExistingIn, 

695 collectionTypes=CollectionType.RUN, 

696 ) 

697 

698 # Look up [init] intermediate and output datasets in the output 

699 # collection, if there is an output collection. 

700 if run is not None or skipCollections is not None: 

701 for datasetType, refs in itertools.chain(self.initIntermediates.items(), 

702 self.initOutputs.items(), 

703 self.intermediates.items(), 

704 self.outputs.items()): 

705 _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.", 

706 len(refs), datasetType.name) 

707 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

708 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

709 

710 # look at RUN collection first 

711 if run is not None: 

712 resolvedRefQueryResults = subset.findDatasets( 

713 datasetType, 

714 collections=run, 

715 findFirst=True 

716 ) 

717 for resolvedRef in resolvedRefQueryResults: 

718 # TODO: we could easily support per-DatasetType 

719 # skipExisting and I could imagine that being useful - 

720 # it's probably required in order to support writing 

721 # initOutputs before QuantumGraph generation. 

722 assert resolvedRef.dataId in refs 

723 if not (skipExistingInRun or isInit or clobberOutputs): 

724 raise OutputExistsError(f"Output dataset {datasetType.name} already exists in " 

725 f"output RUN collection '{run}' with data ID" 

726 f" {resolvedRef.dataId}.") 

727 

728 # Also check skipExistingIn; if the RUN collection is in it, 

729 # that case is handled above 

730 if skipCollections is not None: 

731 resolvedRefQueryResults = subset.findDatasets( 

732 datasetType, 

733 collections=skipCollections, 

734 findFirst=True 

735 ) 

736 for resolvedRef in resolvedRefQueryResults: 

737 assert resolvedRef.dataId in refs 

738 refs[resolvedRef.dataId] = resolvedRef 

739 

740 # Look up input and initInput datasets in the input collection(s). 

741 # Container to accumulate unfound refs, if the common data IDs were not 

742 # constrained on dataset type existence. 

743 self.unfoundRefs = set() 

744 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

745 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

746 resolvedRefQueryResults = commonDataIds.subset( 

747 datasetType.dimensions, 

748 unique=True 

749 ).findDatasets( 

750 datasetType, 

751 collections=collections, 

752 findFirst=True 

753 ) 

754 dataIdsNotFoundYet = set(refs.keys()) 

755 for resolvedRef in resolvedRefQueryResults: 

756 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

757 refs[resolvedRef.dataId] = resolvedRef 

758 if dataIdsNotFoundYet: 

759 if constrainedByAllDatasets: 

760 raise RuntimeError( 

761 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

762 f"'{datasetType.name}' was/were present in a previous " 

763 f"query, but could not be found now." 

764 f"This is either a logic bug in QuantumGraph generation " 

765 f"or the input collections have been modified since " 

766 f"QuantumGraph generation began." 

767 ) 

768 else: 

769 # if the common dataIds were not constrained using all the 

770 # input dataset types, it is possible that some data ids 

771 # found don't correspond to existing dataset types and they 

772 # will be un-resolved. Mark these for later pruning from 

773 # the quantum graph. 

774 for k in dataIdsNotFoundYet: 

775 self.unfoundRefs.add(refs[k]) 

776 

777 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

778 # replacing the unresolved refs there, and then look up prerequisites. 

779 for task in self.tasks: 

780 _LOG.debug( 

781 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

782 len(task.quanta), 

783 task.taskDef.label 

784 ) 

785 lookupFunctions = { 

786 c.name: c.lookupFunction 

787 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

788 if c.lookupFunction is not None 

789 } 

790 dataIdsFailed = [] 

791 dataIdsSucceeded = [] 

792 for quantum in task.quanta.values(): 

793 # Process output datasets only if skipExistingIn is not None 

794 # or there is a run to look for outputs in and clobberOutputs 

795 # is True. Note that if skipExistingIn is None, any output 

796 # datasets that already exist would have already caused an 

797 # exception to be raised. We never update the DatasetRefs in 

798 # the quantum because those should never be resolved. 

799 if skipCollections is not None or (run is not None and clobberOutputs): 

800 resolvedRefs = [] 

801 unresolvedRefs = [] 

802 haveMetadata = False 

803 for datasetType, originalRefs in quantum.outputs.items(): 

804 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

805 if ref.id is not None: 

806 resolvedRefs.append(ref) 

807 if datasetType.name == task.taskDef.metadataDatasetName: 

808 haveMetadata = True 

809 else: 

810 unresolvedRefs.append(ref) 

811 if resolvedRefs: 

812 if haveMetadata or not unresolvedRefs: 

813 dataIdsSucceeded.append(quantum.dataId) 

814 if skipCollections is not None: 

815 continue 

816 else: 

817 dataIdsFailed.append(quantum.dataId) 

818 if not clobberOutputs: 

819 raise OutputExistsError( 

820 f"Quantum {quantum.dataId} of task with label " 

821 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

822 f"({resolvedRefs}) " 

823 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

824 "and clobbering outputs was not enabled." 

825 ) 

826 # Update the input DatasetRefs to the resolved ones we already 

827 # searched for. 

828 for datasetType, refs in quantum.inputs.items(): 

829 for ref in task.inputs.extract(datasetType, refs.keys()): 

830 refs[ref.dataId] = ref 

831 # Look up prerequisite datasets in the input collection(s). 

832 # These may have dimensions that extend beyond those we queried 

833 # for originally, because we want to permit those data ID 

834 # values to differ across quanta and dataset types. 

835 for datasetType in task.prerequisites: 

836 lookupFunction = lookupFunctions.get(datasetType.name) 

837 if lookupFunction is not None: 

838 # PipelineTask has provided its own function to do the 

839 # lookup. This always takes precedence. 

840 refs = list( 

841 lookupFunction(datasetType, registry, quantum.dataId, collections) 

842 ) 

843 elif (datasetType.isCalibration() 

844 and datasetType.dimensions <= quantum.dataId.graph 

845 and quantum.dataId.graph.temporal): 

846 # This is a master calibration lookup, which we have to 

847 # handle specially because the query system can't do a 

848 # temporal join on a non-dimension-based timespan yet. 

849 timespan = quantum.dataId.timespan 

850 try: 

851 refs = [registry.findDataset(datasetType, quantum.dataId, 

852 collections=collections, 

853 timespan=timespan)] 

854 except KeyError: 

855 # This dataset type is not present in the registry, 

856 # which just means there are no datasets here. 

857 refs = [] 

858 else: 

859 # Most general case. 

860 refs = list(registry.queryDatasets(datasetType, 

861 collections=collections, 

862 dataId=quantum.dataId, 

863 findFirst=True).expanded()) 

864 quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs 

865 if ref is not None}) 

866 # Actually remove any quanta that we decided to skip above. 

867 if dataIdsSucceeded: 

868 if skipCollections is not None: 

869 _LOG.debug("Pruning successful %d quanta for task with label '%s' because all of their " 

870 "outputs exist or metadata was written successfully.", 

871 len(dataIdsSucceeded), task.taskDef.label) 

872 for dataId in dataIdsSucceeded: 

873 del task.quanta[dataId] 

874 elif clobberOutputs: 

875 _LOG.info("Found %d successful quanta for task with label '%s' " 

876 "that will need to be clobbered during execution.", 

877 len(dataIdsSucceeded), 

878 task.taskDef.label) 

879 else: 

880 raise AssertionError("OutputExistsError should have already been raised.") 

881 if dataIdsFailed: 

882 if clobberOutputs: 

883 _LOG.info("Found %d failed/incomplete quanta for task with label '%s' " 

884 "that will need to be clobbered during execution.", 

885 len(dataIdsFailed), 

886 task.taskDef.label) 

887 else: 

888 raise AssertionError("OutputExistsError should have already been raised.") 
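        # Sketch of how ``skipExistingIn`` is interpreted above; internally it is
        # normalized with CollectionSearch.fromExpression, and quanta whose metadata
        # dataset (or all outputs, if no metadata dataset is configured) already
        # exist in those collections are dropped from the graph. The collection
        # names here are hypothetical:
        #
        #     scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds,
        #                                    skipExistingIn=["u/someone/rerun", "HSC/defaults"])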

889 

890 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None): 

891 """Create a `QuantumGraph` from the quanta already present in 

892 the scaffolding data structure. 

893 

894 Parameters 

895 ---------- 

896 metadata : Optional Mapping of `str` to primitives 

897 This is an optional parameter of extra data to carry with the 

898 graph. Entries in this mapping should be able to be serialized in 

899 JSON. 

900 

901 Returns 

902 ------- 

903 graph : `QuantumGraph` 

904 The full `QuantumGraph`. 

905 """ 

906 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

907 for task in self.tasks: 

908 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs) 

909 graphInput[task.taskDef] = qset 

910 

911 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs) 

912 return graph 

913 

914 

915# ------------------------ 

916# Exported definitions -- 

917# ------------------------ 

918 

919 

920class GraphBuilderError(Exception): 

921 """Base class for exceptions generated by graph builder. 

922 """ 

923 pass 

924 

925 

926class OutputExistsError(GraphBuilderError): 

927 """Exception generated when output datasets already exist. 

928 """ 

929 pass 

930 

931 

932class PrerequisiteMissingError(GraphBuilderError): 

933 """Exception generated when a prerequisite dataset does not exist. 

934 """ 

935 pass 

936 

937 

938class GraphBuilder(object): 

939 """GraphBuilder class is responsible for building task execution graph from 

940 a Pipeline. 

941 

942 Parameters 

943 ---------- 

944 registry : `~lsst.daf.butler.Registry` 

945 Data butler instance. 

946 skipExistingIn 

947 Expressions representing the collections to search for existing 

948 output datasets that should be skipped. May be any of the types 

949 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

950 clobberOutputs : `bool`, optional 

951 If `True` (default), allow quanta to be created even if partial outputs 

952 exist; this requires the same behavior to be enabled when 

953 executing. 

954 """ 

955 

956 def __init__(self, registry, skipExistingIn=None, clobberOutputs=True): 

957 self.registry = registry 

958 self.dimensions = registry.dimensions 

959 self.skipExistingIn = skipExistingIn 

960 self.clobberOutputs = clobberOutputs 

961 

962 def makeGraph(self, pipeline, collections, run, userQuery, 

963 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

964 metadata: Optional[Mapping[str, Any]] = None): 

965 """Create execution graph for a pipeline. 

966 

967 Parameters 

968 ---------- 

969 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

970 Pipeline definition, task names/classes and their configs. 

971 collections 

972 Expressions representing the collections to search for input 

973 datasets. May be any of the types accepted by 

974 `lsst.daf.butler.CollectionSearch.fromExpression`. 

975 run : `str`, optional 

976 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

977 output datasets, if it already exists. 

978 userQuery : `str` 

979 String which defines user-defined selection for registry, should be 

980 empty or `None` if there are no restrictions on data selection. 

981 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

982 The query constraint variant that should be used to constrain the 

983 query based on dataset existence, defaults to 

984 `DatasetQueryConstraintVariant.ALL`. 

985 metadata : Optional Mapping of `str` to primitives 

986 This is an optional parameter of extra data to carry with the 

987 graph. Entries in this mapping should be able to be serialized in 

988 JSON. 

989 

990 Returns 

991 ------- 

992 graph : `QuantumGraph` 

993 

994 Raises 

995 ------ 

996 UserExpressionError 

997 Raised when user expression cannot be parsed. 

998 OutputExistsError 

999 Raised when output datasets already exist. 

1000 Exception 

1001 Other exceptions types may be raised by underlying registry 

1002 classes. 

1003 """ 

1004 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1005 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1006 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1007 instrument = None 

1008 if isinstance(pipeline, Pipeline): 

1009 instrument = pipeline.getInstrument() 

1010 if isinstance(instrument, str): 

1011 instrument = doImport(instrument) 

1012 pipeline = list(pipeline.toExpandedPipeline()) 

1013 if instrument is not None: 

1014 dataId = DataCoordinate.standardize(instrument=instrument.getName(), 

1015 universe=self.registry.dimensions) 

1016 else: 

1017 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1018 with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId, 

1019 datasetQueryConstraint) as commonDataIds: 

1020 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1021 scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds, 

1022 skipExistingIn=self.skipExistingIn, 

1023 clobberOutputs=self.clobberOutputs, 

1024 constrainedByAllDatasets=condition) 

1025 return scaffolding.makeQuantumGraph(metadata=metadata)
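
# Hedged end-to-end usage sketch (the butler/registry, collection names and the
# query below are hypothetical, not part of this module):
#
#     builder = GraphBuilder(butler.registry, skipExistingIn=None, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/someone/run1",
#         userQuery="instrument = 'HSC' AND visit = 1228",
#     )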