Coverage for python/lsst/pipe/base/graphBuilder.py: 16%

323 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ['GraphBuilder'] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32from collections import ChainMap 

33from contextlib import contextmanager 

34from dataclasses import dataclass 

35from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping 

36import logging 

37 

38 

39# ----------------------------- 

40# Imports for other modules -- 

41# ----------------------------- 

42from .connections import iterConnections, AdjustQuantumHelper 

43from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline 

44from .graph import QuantumGraph 

45from lsst.daf.butler import ( 

46 CollectionSearch, 

47 CollectionType, 

48 DataCoordinate, 

49 DatasetRef, 

50 DatasetType, 

51 DimensionGraph, 

52 DimensionUniverse, 

53 NamedKeyDict, 

54 Quantum, 

55) 

56from lsst.utils import doImport 

57from ._status import NoWorkFound 

58from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

59 

60# ---------------------------------- 

61# Local non-exported definitions -- 

62# ---------------------------------- 

63 

64_LOG = logging.getLogger(__name__) 

65 

66 

67class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

68 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

69 the known `DatasetRef` instances of that type. 

70 

71 Parameters 

72 ---------- 

73 args 

74 Positional arguments are forwarded to the `dict` constructor. 

75 universe : `DimensionUniverse` 

76 Universe of all possible dimensions. 

77 """ 

78 def __init__(self, *args, universe: DimensionUniverse):

79 super().__init__(*args) 

80 self.universe = universe 

81 

82 @classmethod 

83 def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *, 

84 universe: DimensionUniverse) -> _DatasetDict: 

85 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

86 

87 Parameters 

88 ---------- 

89 datasetTypes : `iterable` of `DatasetType` 

90 DatasetTypes to use as keys for the dict. Values will be empty 

91 dictionaries. 

92 universe : `DimensionUniverse` 

93 Universe of all possible dimensions. 

94 

95 Returns 

96 ------- 

97 dictionary : `_DatasetDict` 

98 A new `_DatasetDict` instance. 

99 """ 

100 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

101 

102 @classmethod 

103 def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

104 ) -> _DatasetDict: 

105 """Return a new dictionary by extracting items corresponding to the 

106 given keys from one or more existing dictionaries. 

107 

108 Parameters 

109 ---------- 

110 datasetTypes : `iterable` of `DatasetType` 

111 DatasetTypes to use as keys for the dict. Values will be obtained 

112 by lookups against ``first`` and ``rest``. 

113 first : `_DatasetDict` 

114 Another dictionary from which to extract values. 

115 rest 

116 Additional dictionaries from which to extract values. 

117 

118 Returns 

119 ------- 

120 dictionary : `_DatasetDict` 

121 A new dictionary instance. 

122 """ 

123 combined = ChainMap(first, *rest) 

124 return cls({datasetType: combined[datasetType] for datasetType in datasetTypes}, 

125 universe=first.universe) 

126 

127 @property 

128 def dimensions(self) -> DimensionGraph: 

129 """The union of all dimensions used by all dataset types in this 

130 dictionary, including implied dependencies (`DimensionGraph`). 

131 """ 

132 base = self.universe.empty 

133 if len(self) == 0: 

134 return base 

135 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

136 

137 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

138 """Unpack nested single-element `DatasetRef` dicts into a new 

139 mapping with `DatasetType` keys and `DatasetRef` values. 

140 

141 This method assumes that each nest contains exactly one item, as is the 

142 case for all "init" datasets. 

143 

144 Returns 

145 ------- 

146 dictionary : `NamedKeyDict` 

147 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

148 `DatasetType` instances and string names usable as keys. 

149 """ 

150 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

151 ref, = refs.values() 

152 return ref 

153 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

154 

155 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

156 """Unpack nested multi-element `DatasetRef` dicts into a new 

157 mapping with `DatasetType` keys and `list` of `DatasetRef` values.

158 

159 Returns 

160 ------- 

161 dictionary : `NamedKeyDict` 

162 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

163 both `DatasetType` instances and string names usable as keys. 

164 """ 

165 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

166 

167 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

168 ) -> Iterator[DatasetRef]: 

169 """Iterate over the contained `DatasetRef` instances that match the 

170 given `DatasetType` and data IDs. 

171 

172 Parameters 

173 ---------- 

174 datasetType : `DatasetType` 

175 Dataset type to match. 

176 dataIds : `Iterable` [ `DataCoordinate` ] 

177 Data IDs to match. 

178 

179 Returns 

180 ------- 

181 refs : `Iterator` [ `DatasetRef` ] 

182 DatasetRef instances for which ``ref.datasetType == datasetType`` 

183 and ``ref.dataId`` is in ``dataIds``. 

184 """ 

185 refs = self[datasetType] 

186 return (refs[dataId] for dataId in dataIds) 

187 

188 
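
# A minimal usage sketch for _DatasetDict, assuming the default dimension
# universe; the dataset type name and storage class below are hypothetical and
# chosen only to illustrate how the scaffolding classes use this container
# (keys are DatasetTypes, values map data IDs to initially-unresolved refs).
def _exampleDatasetDictUsage() -> None:
    universe = DimensionUniverse()
    configType = DatasetType("example_init_output", universe.empty, "StructuredDataDict")
    datasets = _DatasetDict.fromDatasetTypes([configType], universe=universe)
    # Init datasets always use the empty data ID, so each nested dict holds
    # exactly one ref and unpackSingleRefs() applies.
    emptyDataId = DataCoordinate.makeEmpty(universe)
    datasets[configType][emptyDataId] = DatasetRef(configType, emptyDataId)
    assert datasets.unpackSingleRefs()[configType].dataId == emptyDataId
    refs = datasets.unpackMultiRefs()[configType]
    assert len(refs) == 1 and refs[0].dataId == emptyDataId
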

189class _QuantumScaffolding: 

190 """Helper class aggregating information about a `Quantum`, used when 

191 constructing a `QuantumGraph`. 

192 

193 See `_PipelineScaffolding` for a top-down description of the full 

194 scaffolding data structure. 

195 

196 Parameters 

197 ---------- 

198 task : _TaskScaffolding 

199 Back-reference to the helper object for the `PipelineTask` this quantum 

200 represents an execution of. 

201 dataId : `DataCoordinate` 

202 Data ID for this quantum. 

203 """ 

204 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

205 self.task = task 

206 self.dataId = dataId 

207 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

208 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

209 self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(), 

210 universe=dataId.universe) 

211 

212 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

213 

214 def __repr__(self): 

215 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

216 

217 task: _TaskScaffolding 

218 """Back-reference to the helper object for the `PipelineTask` this quantum 

219 represents an execution of. 

220 """ 

221 

222 dataId: DataCoordinate 

223 """Data ID for this quantum. 

224 """ 

225 

226 inputs: _DatasetDict 

227 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

228 

229 This is initialized to map each `DatasetType` to an empty dictionary at 

230 construction. Those nested dictionaries are populated (with data IDs as 

231 keys) with unresolved `DatasetRef` instances in 

232 `_PipelineScaffolding.connectDataIds`. 

233 """ 

234 

235 outputs: _DatasetDict 

236 """Nested dictionary containing `DatasetRef` outputs this quantum. 

237 """ 

238 

239 prerequisites: _DatasetDict 

240 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

241 quantum. 

242 """ 

243 

244 def makeQuantum(self) -> Quantum: 

245 """Transform the scaffolding object into a true `Quantum` instance. 

246 

247 Returns 

248 ------- 

249 quantum : `Quantum` 

250 An actual `Quantum` instance. 

251 """ 

252 allInputs = self.inputs.unpackMultiRefs() 

253 allInputs.update(self.prerequisites.unpackMultiRefs()) 

254 # Give the task's Connections class an opportunity to remove some 

255 # inputs, or complain if they are unacceptable. 

256 # This will raise if one of the check conditions is not met, which is 

257 # the intended behavior. 

258 # If it raises NoWorkFound, there is a bug in the QG algorithm

259 # or the adjustQuantum is incorrectly trying to make a prerequisite 

260 # input behave like a regular input; adjustQuantum should only raise 

261 # NoWorkFound if a regular input is missing, and it shouldn't be 

262 # possible for us to have generated ``self`` if that's true. 

263 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

264 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

265 return Quantum( 

266 taskName=self.task.taskDef.taskName, 

267 taskClass=self.task.taskDef.taskClass, 

268 dataId=self.dataId, 

269 initInputs=self.task.initInputs.unpackSingleRefs(), 

270 inputs=helper.inputs, 

271 outputs=helper.outputs, 

272 ) 

273 

274 
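
# A small sketch of what downstream code sees in the Quantum returned by
# _QuantumScaffolding.makeQuantum above; the ``quantum`` argument is
# hypothetical (e.g. one element of a task's makeQuantumSet result below).
def _exampleInspectQuantum(quantum: Quantum) -> None:
    # Regular and prerequisite inputs end up merged into a single mapping
    # keyed by DatasetType (string dataset type names also work as keys).
    for datasetType, refs in quantum.inputs.items():
        _LOG.debug("input %s: %d ref(s)", datasetType.name, len(refs))
    # Outputs remain unresolved here; they only acquire dataset IDs when the
    # quantum is actually executed and its outputs are written.
    for datasetType, refs in quantum.outputs.items():
        _LOG.debug("output %s: %d ref(s)", datasetType.name, len(refs))
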

275@dataclass 

276class _TaskScaffolding: 

277 """Helper class aggregating information about a `PipelineTask`, used when 

278 constructing a `QuantumGraph`. 

279 

280 See `_PipelineScaffolding` for a top-down description of the full 

281 scaffolding data structure. 

282 

283 Parameters 

284 ---------- 

285 taskDef : `TaskDef` 

286 Data structure that identifies the task class and its config. 

287 parent : `_PipelineScaffolding` 

288 The parent data structure that will hold the instance being 

289 constructed. 

290 datasetTypes : `TaskDatasetTypes` 

291 Data structure that categorizes the dataset types used by this task. 

292 """ 

293 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

294 universe = parent.dimensions.universe 

295 self.taskDef = taskDef 

296 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

297 assert self.dimensions.issubset(parent.dimensions) 

298 # Initialize _DatasetDicts as subsets of the one or two 

299 # corresponding dicts in the parent _PipelineScaffolding. 

300 self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs, 

301 parent.initIntermediates) 

302 self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates, 

303 parent.initOutputs) 

304 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

305 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

306 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

307 self.dataIds = set() 

308 self.quanta = {} 

309 

310 def __repr__(self): 

311 # Default dataclass-injected __repr__ gets caught in an infinite loop 

312 # because of back-references. 

313 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

314 

315 taskDef: TaskDef 

316 """Data structure that identifies the task class and its config 

317 (`TaskDef`). 

318 """ 

319 

320 dimensions: DimensionGraph 

321 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

322 """ 

323 

324 initInputs: _DatasetDict 

325 """Dictionary containing information about datasets used to construct this 

326 task (`_DatasetDict`). 

327 """ 

328 

329 initOutputs: _DatasetDict 

330 """Dictionary containing information about datasets produced as a 

331 side-effect of constructing this task (`_DatasetDict`). 

332 """ 

333 

334 inputs: _DatasetDict 

335 """Dictionary containing information about datasets used as regular, 

336 graph-constraining inputs to this task (`_DatasetDict`). 

337 """ 

338 

339 outputs: _DatasetDict 

340 """Dictionary containing information about datasets produced by this task 

341 (`_DatasetDict`). 

342 """ 

343 

344 prerequisites: _DatasetDict 

345 """Dictionary containing information about input datasets that must be 

346 present in the repository before any Pipeline containing this task is run 

347 (`_DatasetDict`). 

348 """ 

349 

350 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

351 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

352 this task with that data ID. 

353 """ 

354 

355 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]: 

356 """Create a `set` of `Quantum` from the information in ``self``. 

357 

358 Returns 

359 ------- 

360 nodes : `set` of `Quantum`

361 The `Quantum` elements corresponding to this task. 

362 """ 

363 if unresolvedRefs is None: 

364 unresolvedRefs = set() 

365 outputs = set() 

366 for q in self.quanta.values(): 

367 try: 

368 tmpQuanta = q.makeQuantum() 

369 outputs.add(tmpQuanta) 

370 except (NoWorkFound, FileNotFoundError) as exc: 

371 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

372 if unresolvedRefs.intersection(refs): 

373 # This means it is a node that is known to be pruned

374 # later and should be left in even though some follow-up

375 # queries fail. This allows the pruning to start from this

376 # quantum with known issues, and to prune other nodes it

377 # touches.

378 inputs = q.inputs.unpackMultiRefs() 

379 inputs.update(q.prerequisites.unpackMultiRefs()) 

380 tmpQuantum = Quantum(taskName=q.task.taskDef.taskName, 

381 taskClass=q.task.taskDef.taskClass, 

382 dataId=q.dataId, 

383 initInputs=q.task.initInputs.unpackSingleRefs(), 

384 inputs=inputs, 

385 outputs=q.outputs.unpackMultiRefs(),) 

386 outputs.add(tmpQuantum) 

387 else: 

388 raise exc 

389 return outputs 

390 

391 
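
# Sketch of how makeQuantumSet is driven per task when the graph is assembled;
# this mirrors _PipelineScaffolding.makeQuantumGraph below, and the ``tasks``
# and ``unfoundRefs`` arguments are hypothetical caller-supplied values.
def _exampleCollectQuanta(tasks, unfoundRefs):
    quantaByTask = {}
    for task in tasks:
        # Quanta whose inputs overlap the known-unresolved refs are kept here
        # and pruned later by QuantumGraph, so follow-up lookup failures for
        # them are tolerated rather than treated as errors.
        quantaByTask[task.taskDef] = task.makeQuantumSet(unresolvedRefs=unfoundRefs)
    return quantaByTask
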

392@dataclass 

393class _PipelineScaffolding: 

394 """A helper data structure that organizes the information involved in 

395 constructing a `QuantumGraph` for a `Pipeline`. 

396 

397 Parameters 

398 ---------- 

399 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

400 Sequence of tasks from which a graph is to be constructed. Must 

401 have nested task classes already imported. 

402 universe : `DimensionUniverse` 

403 Universe of all possible dimensions. 

404 

405 Notes 

406 ----- 

407 The scaffolding data structure contains nested data structures for both 

408 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

409 data structures are shared between the pipeline-level structure (which 

410 aggregates all datasets and categorizes them from the perspective of the 

411 complete pipeline) and the individual tasks that use them as inputs and 

412 outputs. 

413 

414 `QuantumGraph` construction proceeds in four steps, with each corresponding 

415 to a different `_PipelineScaffolding` method: 

416 

417 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

418 the DatasetTypes used by the pipeline (delegating to 

419 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

420 nested `_TaskScaffolding` and `_DatasetDict` objects. 

421 

422 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

423 returns related tuples of all dimensions used to identify any regular 

424 input, output, and intermediate datasets (not prerequisites). We then 

425 iterate over these tuples of related dimensions, identifying the subsets 

426 that correspond to distinct data IDs for each task and dataset type, 

427 and then create `_QuantumScaffolding` objects. 

428 

429 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

430 dataset data IDs previously identified, transforming unresolved 

431 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

432 up prerequisite datasets for all quanta. 

433 

434 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

435 per-task `_QuantumScaffolding` objects. 

436 """ 

437 def __init__(self, pipeline, *, registry): 

438 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

439 self.tasks = [] 

440 # Aggregate and categorize the DatasetTypes in the Pipeline. 

441 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

442 # Construct dictionaries that map those DatasetTypes to structures 

443 # that will (later) hold additional information about them.

444 for attr in ("initInputs", "initIntermediates", "initOutputs", 

445 "inputs", "intermediates", "outputs", "prerequisites"): 

446 setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), 

447 universe=registry.dimensions)) 

448 # Aggregate all dimensions for all non-init, non-prerequisite 

449 # DatasetTypes. These are the ones we'll include in the big join 

450 # query. 

451 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, 

452 self.outputs.dimensions) 

453 # Construct scaffolding nodes for each Task, and add backreferences 

454 # to the Task from each DatasetScaffolding node. 

455 # Note that there's only one scaffolding node for each DatasetType, 

456 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

457 # reference it. 

458 if isinstance(pipeline, Pipeline): 

459 pipeline = pipeline.toExpandedPipeline() 

460 self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

461 for taskDef, taskDatasetTypes in zip(pipeline, 

462 datasetTypes.byTask.values())] 

463 

464 def __repr__(self): 

465 # Default dataclass-injected __repr__ gets caught in an infinite loop 

466 # because of back-references. 

467 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

468 

469 tasks: List[_TaskScaffolding] 

470 """Scaffolding data structures for each task in the pipeline 

471 (`list` of `_TaskScaffolding`). 

472 """ 

473 

474 initInputs: _DatasetDict 

475 """Datasets consumed but not produced when constructing the tasks in this 

476 pipeline (`_DatasetDict`). 

477 """ 

478 

479 initIntermediates: _DatasetDict 

480 """Datasets that are both consumed and produced when constructing the tasks 

481 in this pipeline (`_DatasetDict`). 

482 """ 

483 

484 initOutputs: _DatasetDict 

485 """Datasets produced but not consumed when constructing the tasks in this 

486 pipeline (`_DatasetDict`). 

487 """ 

488 

489 inputs: _DatasetDict 

490 """Datasets that are consumed but not produced when running this pipeline 

491 (`_DatasetDict`). 

492 """ 

493 

494 intermediates: _DatasetDict 

495 """Datasets that are both produced and consumed when running this pipeline 

496 (`_DatasetDict`). 

497 """ 

498 

499 outputs: _DatasetDict 

500 """Datasets produced but not consumed when when running this pipeline 

501 (`_DatasetDict`). 

502 """ 

503 

504 prerequisites: _DatasetDict 

505 """Datasets that are consumed when running this pipeline and looked up 

506 per-Quantum when generating the graph (`_DatasetDict`). 

507 """ 

508 

509 dimensions: DimensionGraph 

510 """All dimensions used by any regular input, intermediate, or output 

511 (not prerequisite) dataset; the set of dimensions used in the "Big Join

512 Query" (`DimensionGraph`). 

513 

514 This is required to be a superset of all task quantum dimensions. 

515 """ 

516 

517 @contextmanager 

518 def connectDataIds(self, registry, collections, userQuery, externalDataId, 

519 datasetQueryConstraint: DatasetQueryConstraintVariant = 

520 DatasetQueryConstraintVariant.ALL): 

521 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

522 

523 This method populates `_TaskScaffolding.dataIds` and the data ID keys of

524 the nested `_DatasetDict` dictionaries (except for those in `prerequisites`).

525 

526 Parameters 

527 ---------- 

528 registry : `lsst.daf.butler.Registry` 

529 Registry for the data repository; used for all data ID queries. 

530 collections 

531 Expressions representing the collections to search for input 

532 datasets. May be any of the types accepted by 

533 `lsst.daf.butler.CollectionSearch.fromExpression`. 

534 userQuery : `str` or `None` 

535 User-provided expression to limit the data IDs processed. 

536 externalDataId : `DataCoordinate` 

537 Externally-provided data ID that should be used to restrict the 

538 results, just as if these constraints had been included via ``AND`` 

539 in ``userQuery``. This includes (at least) any instrument named 

540 in the pipeline definition. 

541 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

542 The query constraint variant that should be used to constrain the

543 query based on dataset existence, defaults to

544 `DatasetQueryConstraintVariant.ALL`. 

545 

546 Returns 

547 ------- 

548 commonDataIds : \ 

549 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

550 An interface to a database temporary table containing all data IDs 

551 that will appear in this `QuantumGraph`. Returned inside a 

552 context manager, which will drop the temporary table at the end of 

553 the `with` block in which this method is called. 

554 """ 

555 _LOG.debug("Building query for data IDs.") 

556 # Initialization datasets always have empty data IDs. 

557 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

558 for datasetType, refs in itertools.chain(self.initInputs.items(), 

559 self.initIntermediates.items(), 

560 self.initOutputs.items()): 

561 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

562 # Run one big query for the data IDs for task dimensions and regular 

563 # inputs and outputs. We limit the query to only dimensions that are 

564 # associated with the input dataset types, but don't (yet) try to 

565 # obtain the dataset_ids for those inputs. 

566 _LOG.debug("Submitting data ID query and materializing results.") 

567 queryArgs = {'dimensions': self.dimensions, 'where': userQuery, 'dataId': externalDataId} 

568 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

569 _LOG.debug("Constraining graph query using all datasets in pipeline.") 

570 queryArgs['datasets'] = list(self.inputs) 

571 queryArgs['collections'] = collections 

572 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

573 _LOG.debug("Not using dataset existence to constrain query.") 

574 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

575 constraint = set(datasetQueryConstraint) 

576 inputs = {k.name: k for k in self.inputs.keys()} 

577 if (remainder := constraint.difference(inputs.keys())): 

578 raise ValueError(f"{remainder} dataset type(s) specified as a graph constraint, but" 

579 f" do not appear as an input to the specified pipeline: {inputs.keys()}") 

580 _LOG.debug(f"Constraining graph query using {constraint}") 

581 queryArgs['datasets'] = [typ for name, typ in inputs.items() if name in constraint] 

582 queryArgs['collections'] = collections 

583 else: 

584 raise ValueError(f"Unable to handle type {datasetQueryConstraint} given as " 

585 "datasetQueryConstraint.") 

586 

587 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

588 _LOG.debug("Expanding data IDs.") 

589 commonDataIds = commonDataIds.expanded() 

590 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

591 # Iterate over query results, populating data IDs for datasets and 

592 # quanta and then connecting them to each other. 

593 n = -1 

594 for n, commonDataId in enumerate(commonDataIds): 

595 # Create DatasetRefs for all DatasetTypes from this result row, 

596 # noting that we might have created some already. 

597 # We remember both those that already existed and those that we 

598 # create now. 

599 refsForRow = {} 

600 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {}

601 for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(), 

602 self.outputs.items()): 

603 if not (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)): 

604 datasetDataId = commonDataId.subset(datasetType.dimensions) 

605 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

606 ref = refs.get(datasetDataId) 

607 if ref is None: 

608 ref = DatasetRef(datasetType, datasetDataId) 

609 refs[datasetDataId] = ref 

610 refsForRow[datasetType.name] = ref 

611 # Create _QuantumScaffolding objects for all tasks from this 

612 # result row, noting that we might have created some already. 

613 for task in self.tasks: 

614 quantumDataId = commonDataId.subset(task.dimensions) 

615 quantum = task.quanta.get(quantumDataId) 

616 if quantum is None: 

617 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

618 task.quanta[quantumDataId] = quantum 

619 # Whether this is a new quantum or an existing one, we can 

620 # now associate the DatasetRefs for this row with it. The 

621 # fact that a Quantum data ID and a dataset data ID both 

622 # came from the same result row is what tells us they 

623 # should be associated. 

624 # Many of these associations will be duplicates (because

625 # another query row that differed from this one only in

626 # irrelevant dimensions already added them); the dictionaries

627 # used here simply deduplicate them.

628 for datasetType in task.inputs: 

629 ref = refsForRow[datasetType.name] 

630 quantum.inputs[datasetType.name][ref.dataId] = ref 

631 for datasetType in task.outputs: 

632 ref = refsForRow[datasetType.name] 

633 quantum.outputs[datasetType.name][ref.dataId] = ref 

634 if n < 0: 

635 emptiness_explained = False 

636 for message in commonDataIds.explain_no_results(): 

637 _LOG.warning(message)

638 emptiness_explained = True 

639 if not emptiness_explained: 

640 _LOG.warn("To reproduce this query for debugging purposes, run " 

641 "Registry.queryDataIds with these arguments:") 

642 # We could just repr() the queryArgs dict to get something 

643 # the user could make sense of, but it's friendlier to 

644 # put these args in an easier-to-construct equivalent form 

645 # so they can read it more easily and copy and paste into 

646 # a Python terminal. 

647 _LOG.warn(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

648 _LOG.warn(" dataId=%s,", queryArgs["dataId"].byName()) 

649 if queryArgs["where"]: 

650 _LOG.warn(" where=%s,", repr(queryArgs["where"])) 

651 if "datasets" in queryArgs: 

652 _LOG.warn(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

653 if "collections" in queryArgs: 

654 _LOG.warn(" collections=%s,", list(queryArgs["collections"])) 

655 _LOG.debug("Finished processing %d rows from data ID query.", n) 

656 yield commonDataIds 

657 

658 def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExistingIn=None, 

659 clobberOutputs=True, constrainedByAllDatasets: bool = True): 

660 """Perform follow up queries for each dataset data ID produced in 

661 `fillDataIds`. 

662 

663 This method populates `_DatasetScaffolding.refs` (except for those in 

664 `prerequisites`). 

665 

666 Parameters 

667 ---------- 

668 registry : `lsst.daf.butler.Registry` 

669 Registry for the data repository; used for all data ID queries. 

670 collections 

671 Expressions representing the collections to search for input 

672 datasets. May be any of the types accepted by 

673 `lsst.daf.butler.CollectionSearch.fromExpression`. 

674 run : `str`, optional 

675 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

676 output datasets, if it already exists. 

677 commonDataIds : \ 

678 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

679 Result of a previous call to `connectDataIds`. 

680 skipExistingIn 

681 Expressions representing the collections to search for existing 

682 output datasets that should be skipped. May be any of the types 

683 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

684 `None` or empty string/sequence disables skipping. 

685 clobberOutputs : `bool`, optional 

686 If `True` (default), allow quanta to be created even if outputs exist;

687 this requires the same behavior to be enabled when

688 executing. If ``skipExistingIn`` is not `None`, completed quanta 

689 (those with metadata, or all outputs if there is no metadata 

690 dataset configured) will be skipped rather than clobbered. 

691 constrainedByAllDatasets : `bool`, optional 

692 Indicates if the commonDataIds were generated with a constraint on 

693 all dataset types. 

694 

695 Raises 

696 ------ 

697 OutputExistsError 

698 Raised if an output dataset already exists in the output run 

699 and ``skipExistingIn`` does not include the output run, or if only

700 some outputs are present and ``clobberOutputs`` is `False`. 

701 """ 

702 skipCollections: Optional[CollectionSearch] = None 

703 skipExistingInRun = False 

704 if skipExistingIn: 

705 skipCollections = CollectionSearch.fromExpression(skipExistingIn) 

706 if run: 

707 # As an optimization, check the explicit list of names first.

708 skipExistingInRun = run in skipCollections.explicitNames() 

709 if not skipExistingInRun: 

710 # need to flatten it and check again 

711 skipExistingInRun = run in registry.queryCollections( 

712 skipExistingIn, 

713 collectionTypes=CollectionType.RUN, 

714 ) 

715 

716 # Look up [init] intermediate and output datasets in the output 

717 # collection, if there is an output collection. 

718 if run is not None or skipCollections is not None: 

719 for datasetType, refs in itertools.chain(self.initIntermediates.items(), 

720 self.initOutputs.items(), 

721 self.intermediates.items(), 

722 self.outputs.items()): 

723 _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.", 

724 len(refs), datasetType.name) 

725 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

726 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

727 

728 # look at RUN collection first 

729 if run is not None: 

730 resolvedRefQueryResults = subset.findDatasets( 

731 datasetType, 

732 collections=run, 

733 findFirst=True 

734 ) 

735 for resolvedRef in resolvedRefQueryResults: 

736 # TODO: we could easily support per-DatasetType 

737 # skipExisting and I could imagine that being useful - 

738 # it's probably required in order to support writing 

739 # initOutputs before QuantumGraph generation. 

740 assert resolvedRef.dataId in refs 

741 if not (skipExistingInRun or isInit or clobberOutputs): 

742 raise OutputExistsError(f"Output dataset {datasetType.name} already exists in " 

743 f"output RUN collection '{run}' with data ID" 

744 f" {resolvedRef.dataId}.") 

745 

746 # Also check skipExistingIn; the case where the RUN collection

747 # is part of it is handled above.

748 if skipCollections is not None: 

749 resolvedRefQueryResults = subset.findDatasets( 

750 datasetType, 

751 collections=skipCollections, 

752 findFirst=True 

753 ) 

754 for resolvedRef in resolvedRefQueryResults: 

755 assert resolvedRef.dataId in refs 

756 refs[resolvedRef.dataId] = resolvedRef 

757 

758 # Look up input and initInput datasets in the input collection(s). 

759 # Container to accumulate unfound refs when the common data IDs were

760 # not constrained on dataset existence.

761 self.unfoundRefs = set() 

762 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

763 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

764 resolvedRefQueryResults = commonDataIds.subset( 

765 datasetType.dimensions, 

766 unique=True 

767 ).findDatasets( 

768 datasetType, 

769 collections=collections, 

770 findFirst=True 

771 ) 

772 dataIdsNotFoundYet = set(refs.keys()) 

773 for resolvedRef in resolvedRefQueryResults: 

774 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

775 refs[resolvedRef.dataId] = resolvedRef 

776 if dataIdsNotFoundYet: 

777 if constrainedByAllDatasets: 

778 raise RuntimeError( 

779 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

780 f"'{datasetType.name}' was/were present in a previous " 

781 f"query, but could not be found now." 

782 f"This is either a logic bug in QuantumGraph generation " 

783 f"or the input collections have been modified since " 

784 f"QuantumGraph generation began." 

785 ) 

786 else: 

787 # If the common data IDs were not constrained using all the

788 # input dataset types, it is possible that some data IDs

789 # found don't correspond to existing datasets and they

790 # will be left unresolved. Mark these for later pruning from

791 # the quantum graph.

792 for k in dataIdsNotFoundYet: 

793 self.unfoundRefs.add(refs[k]) 

794 

795 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

796 # replacing the unresolved refs there, and then look up prerequisites. 

797 for task in self.tasks: 

798 _LOG.debug( 

799 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

800 len(task.quanta), 

801 task.taskDef.label 

802 ) 

803 lookupFunctions = { 

804 c.name: c.lookupFunction 

805 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

806 if c.lookupFunction is not None 

807 } 

808 dataIdsFailed = [] 

809 dataIdsSucceeded = [] 

810 for quantum in task.quanta.values(): 

811 # Process output datasets only if skipExistingIn is not None

812 # or there is a run to look for outputs in and clobberOutputs 

813 # is True. Note that if skipExistingIn is None, any output 

814 # datasets that already exist would have already caused an 

815 # exception to be raised. We never update the DatasetRefs in 

816 # the quantum because those should never be resolved. 

817 if skipCollections is not None or (run is not None and clobberOutputs): 

818 resolvedRefs = [] 

819 unresolvedRefs = [] 

820 haveMetadata = False 

821 for datasetType, originalRefs in quantum.outputs.items(): 

822 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

823 if ref.id is not None: 

824 resolvedRefs.append(ref) 

825 if datasetType.name == task.taskDef.metadataDatasetName: 

826 haveMetadata = True 

827 else: 

828 unresolvedRefs.append(ref) 

829 if resolvedRefs: 

830 if haveMetadata or not unresolvedRefs: 

831 dataIdsSucceeded.append(quantum.dataId) 

832 if skipCollections is not None: 

833 continue 

834 else: 

835 dataIdsFailed.append(quantum.dataId) 

836 if not clobberOutputs: 

837 raise OutputExistsError( 

838 f"Quantum {quantum.dataId} of task with label " 

839 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

840 f"({resolvedRefs}) " 

841 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

842 "and clobbering outputs was not enabled." 

843 ) 

844 # Update the input DatasetRefs to the resolved ones we already 

845 # searched for. 

846 for datasetType, refs in quantum.inputs.items(): 

847 for ref in task.inputs.extract(datasetType, refs.keys()): 

848 refs[ref.dataId] = ref 

849 # Look up prerequisite datasets in the input collection(s). 

850 # These may have dimensions that extend beyond those we queried 

851 # for originally, because we want to permit those data ID 

852 # values to differ across quanta and dataset types. 

853 for datasetType in task.prerequisites: 

854 lookupFunction = lookupFunctions.get(datasetType.name) 

855 if lookupFunction is not None: 

856 # PipelineTask has provided its own function to do the 

857 # lookup. This always takes precedence. 

858 refs = list( 

859 lookupFunction(datasetType, registry, quantum.dataId, collections) 

860 ) 

861 elif (datasetType.isCalibration() 

862 and datasetType.dimensions <= quantum.dataId.graph 

863 and quantum.dataId.graph.temporal): 

864 # This is a master calibration lookup, which we have to 

865 # handle specially because the query system can't do a 

866 # temporal join on a non-dimension-based timespan yet. 

867 timespan = quantum.dataId.timespan 

868 try: 

869 refs = [registry.findDataset(datasetType, quantum.dataId, 

870 collections=collections, 

871 timespan=timespan)] 

872 except KeyError: 

873 # This dataset type is not present in the registry, 

874 # which just means there are no datasets here. 

875 refs = [] 

876 else: 

877 # Most general case. 

878 refs = list(registry.queryDatasets(datasetType, 

879 collections=collections, 

880 dataId=quantum.dataId, 

881 findFirst=True).expanded()) 

882 quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs 

883 if ref is not None}) 

884 # Actually remove any quanta that we decided to skip above. 

885 if dataIdsSucceeded: 

886 if skipCollections is not None: 

887 _LOG.debug("Pruning successful %d quanta for task with label '%s' because all of their " 

888 "outputs exist or metadata was written successfully.", 

889 len(dataIdsSucceeded), task.taskDef.label) 

890 for dataId in dataIdsSucceeded: 

891 del task.quanta[dataId] 

892 elif clobberOutputs: 

893 _LOG.info("Found %d successful quanta for task with label '%s' " 

894 "that will need to be clobbered during execution.", 

895 len(dataIdsSucceeded), 

896 task.taskDef.label) 

897 else: 

898 raise AssertionError("OutputExistsError should have already been raised.") 

899 if dataIdsFailed: 

900 if clobberOutputs: 

901 _LOG.info("Found %d failed/incomplete quanta for task with label '%s' " 

902 "that will need to be clobbered during execution.", 

903 len(dataIdsFailed), 

904 task.taskDef.label) 

905 else: 

906 raise AssertionError("OutputExistsError should have already been raised.") 

907 

908 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None): 

909 """Create a `QuantumGraph` from the quanta already present in 

910 the scaffolding data structure. 

911 

912 Parameters 

913 ----------

914 metadata : Optional Mapping of `str` to primitives 

915 This is an optional parameter of extra data to carry with the 

916 graph. Entries in this mapping should be able to be serialized in 

917 JSON. 

918 

919 Returns 

920 ------- 

921 graph : `QuantumGraph` 

922 The full `QuantumGraph`. 

923 """ 

924 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

925 for task in self.tasks: 

926 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs) 

927 graphInput[task.taskDef] = qset 

928 

929 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs) 

930 return graph 

931 

932 
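
# Hypothetical end-to-end sketch of the four construction steps described in
# the _PipelineScaffolding docstring; GraphBuilder.makeGraph below is the real
# entry point and adds instrument handling and error checks.  The registry,
# pipeline, collections, run, and query arguments are assumed to be supplied
# by the caller.
def _exampleScaffoldingFlow(registry, pipeline, collections, run, userQuery):
    # Step 1: categorize dataset types and build the nested scaffolding.
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    externalDataId = DataCoordinate.makeEmpty(registry.dimensions)
    # Step 2: run the "Big Join Query"; the temporary table of data IDs lives
    # only inside this context manager.
    with scaffolding.connectDataIds(registry, collections, userQuery,
                                    externalDataId) as commonDataIds:
        # Step 3: resolve existing datasets and look up prerequisites.
        scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
        # Step 4: emit the QuantumGraph while the query results are still open.
        return scaffolding.makeQuantumGraph()
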

933# ------------------------ 

934# Exported definitions -- 

935# ------------------------ 

936 

937 

938class GraphBuilderError(Exception): 

939 """Base class for exceptions generated by graph builder. 

940 """ 

941 pass 

942 

943 

944class OutputExistsError(GraphBuilderError): 

945 """Exception generated when output datasets already exist. 

946 """ 

947 pass 

948 

949 

950class PrerequisiteMissingError(GraphBuilderError): 

951 """Exception generated when a prerequisite dataset does not exist. 

952 """ 

953 pass 

954 

955 
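
# Sketch of how a caller might distinguish the exceptions defined above; the
# builder and its arguments are hypothetical and mirror GraphBuilder.makeGraph
# below.
def _exampleHandleGraphBuilderErrors(builder, pipeline, collections, run, userQuery):
    try:
        return builder.makeGraph(pipeline, collections, run, userQuery)
    except OutputExistsError:
        # Outputs already exist in the output RUN collection and neither
        # skipExistingIn nor clobberOutputs allows them to be reused.
        _LOG.exception("Existing outputs prevented QuantumGraph generation.")
        raise
    except GraphBuilderError:
        # Base class for the other graph-generation failures in this module.
        _LOG.exception("QuantumGraph generation failed.")
        raise
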

956class GraphBuilder(object): 

957 """GraphBuilder class is responsible for building task execution graph from 

958 a Pipeline. 

959 

960 Parameters 

961 ---------- 

962 registry : `~lsst.daf.butler.Registry` 

963 Registry for the data repository.

964 skipExistingIn 

965 Expressions representing the collections to search for existing 

966 output datasets that should be skipped. May be any of the types 

967 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

968 clobberOutputs : `bool`, optional 

969 If `True` (default), allow quanta to be created even if partial outputs

970 exist; this requires the same behavior to be enabled when

971 executing. 

972 """ 

973 

974 def __init__(self, registry, skipExistingIn=None, clobberOutputs=True): 

975 self.registry = registry 

976 self.dimensions = registry.dimensions 

977 self.skipExistingIn = skipExistingIn 

978 self.clobberOutputs = clobberOutputs 

979 

980 def makeGraph(self, pipeline, collections, run, userQuery, 

981 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

982 metadata: Optional[Mapping[str, Any]] = None): 

983 """Create execution graph for a pipeline. 

984 

985 Parameters 

986 ---------- 

987 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

988 Pipeline definition, task names/classes and their configs. 

989 collections 

990 Expressions representing the collections to search for input 

991 datasets. May be any of the types accepted by 

992 `lsst.daf.butler.CollectionSearch.fromExpression`. 

993 run : `str`, optional 

994 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

995 output datasets, if it already exists. 

996 userQuery : `str` 

997 String which defines user-defined selection for registry; should be

998 empty or `None` if there are no restrictions on data selection.

999 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1000 The query constraint variant that should be used to constrain the

1001 query based on dataset existence, defaults to

1002 `DatasetQueryConstraintVariant.ALL`. 

1003 metadata : Optional Mapping of `str` to primitives 

1004 This is an optional parameter of extra data to carry with the 

1005 graph. Entries in this mapping should be able to be serialized in 

1006 JSON. 

1007 

1008 Returns 

1009 ------- 

1010 graph : `QuantumGraph` 

1011 

1012 Raises 

1013 ------ 

1014 UserExpressionError 

1015 Raised when user expression cannot be parsed. 

1016 OutputExistsError 

1017 Raised when output datasets already exist. 

1018 Exception 

1019 Other exceptions types may be raised by underlying registry 

1020 classes. 

1021 """ 

1022 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1023 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1024 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1025 instrument = None 

1026 if isinstance(pipeline, Pipeline): 

1027 instrument = pipeline.getInstrument() 

1028 if isinstance(instrument, str): 

1029 instrument = doImport(instrument) 

1030 pipeline = list(pipeline.toExpandedPipeline()) 

1031 if instrument is not None: 

1032 dataId = DataCoordinate.standardize(instrument=instrument.getName(), 

1033 universe=self.registry.dimensions) 

1034 else: 

1035 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1036 with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId, 

1037 datasetQueryConstraint) as commonDataIds: 

1038 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1039 scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds, 

1040 skipExistingIn=self.skipExistingIn, 

1041 clobberOutputs=self.clobberOutputs, 

1042 constrainedByAllDatasets=condition) 

1043 return scaffolding.makeQuantumGraph(metadata=metadata)
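
# A minimal usage sketch for GraphBuilder, assuming an existing butler
# repository; the repository path, pipeline file, collection names, query
# string, and run name below are all hypothetical.
def _exampleMakeGraph():
    from lsst.daf.butler import Butler

    butler = Butler("/path/to/repo")                         # hypothetical repo
    pipeline = Pipeline.fromFile("my_pipeline.yaml")         # hypothetical file
    builder = GraphBuilder(butler.registry, skipExistingIn=None, clobberOutputs=True)
    return builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],                        # hypothetical inputs
        run="u/someone/example-run",                         # hypothetical output RUN
        userQuery="instrument = 'HSC' AND visit = 12345",    # hypothetical selection
        metadata={"comment": "example graph"},
    )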