Coverage for python/lsst/pipe/base/graphBuilder.py: 19%

323 statements  

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap 

34from contextlib import contextmanager 

35from dataclasses import dataclass 

36from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set 

37 

38from lsst.daf.butler import ( 

39 CollectionSearch, 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetRef, 

43 DatasetType, 

44 DimensionGraph, 

45 DimensionUniverse, 

46 NamedKeyDict, 

47 Quantum, 

48) 

49from lsst.utils import doImport 

50 

51from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

52from ._status import NoWorkFound 

53 

54# ----------------------------- 

55# Imports for other modules -- 

56# ----------------------------- 

57from .connections import AdjustQuantumHelper, iterConnections 

58from .graph import QuantumGraph 

59from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

60 

61# ---------------------------------- 

62# Local non-exported definitions -- 

63# ---------------------------------- 

64 

65_LOG = logging.getLogger(__name__) 

66 

67 

68class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

69 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

70 the known `DatasetRef` instances of that type. 

71 

72 Parameters 

73 ---------- 

74 args 

75 Positional arguments are forwarded to the `dict` constructor. 

76 universe : `DimensionUniverse` 

77 Universe of all possible dimensions. 

78 """ 

79 

80 def __init__(self, *args, universe: DimensionGraph): 

81 super().__init__(*args) 

82 self.universe = universe 

83 

84 @classmethod 

85 def fromDatasetTypes( 

86 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

87 ) -> _DatasetDict: 

88 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

89 

90 Parameters 

91 ---------- 

92 datasetTypes : `iterable` of `DatasetType` 

93 DatasetTypes to use as keys for the dict. Values will be empty 

94 dictionaries. 

95 universe : `DimensionUniverse` 

96 Universe of all possible dimensions. 

97 

98 Returns 

99 ------- 

100 dictionary : `_DatasetDict` 

101 A new `_DatasetDict` instance. 

102 """ 

103 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

104 

105 @classmethod 

106 def fromSubset( 

107 cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

108 ) -> _DatasetDict: 

109 """Return a new dictionary by extracting items corresponding to the 

110 given keys from one or more existing dictionaries. 

111 

112 Parameters 

113 ---------- 

114 datasetTypes : `iterable` of `DatasetType` 

115 DatasetTypes to use as keys for the dict. Values will be obtained 

116 by lookups against ``first`` and ``rest``. 

117 first : `_DatasetDict` 

118 Another dictionary from which to extract values. 

119 rest 

120 Additional dictionaries from which to extract values. 

121 

122 Returns 

123 ------- 

124 dictionary : `_DatasetDict` 

125 A new dictionary instance. 

126 """ 

127 combined = ChainMap(first, *rest) 

128 return cls( 

129 {datasetType: combined[datasetType] for datasetType in datasetTypes}, universe=first.universe 

130 ) 

131 

132 @property 

133 def dimensions(self) -> DimensionGraph: 

134 """The union of all dimensions used by all dataset types in this 

135 dictionary, including implied dependencies (`DimensionGraph`). 

136 """ 

137 base = self.universe.empty 

138 if len(self) == 0: 

139 return base 

140 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

141 

142 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

143 """Unpack nested single-element `DatasetRef` dicts into a new 

144 mapping with `DatasetType` keys and `DatasetRef` values. 

145 

146 This method assumes that each nested dictionary contains exactly one item, as is the 

147 case for all "init" datasets. 

148 

149 Returns 

150 ------- 

151 dictionary : `NamedKeyDict` 

152 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

153 `DatasetType` instances and string names usable as keys. 

154 """ 

155 

156 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

157 (ref,) = refs.values() 

158 return ref 

159 

160 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

161 

162 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

163 """Unpack nested multi-element `DatasetRef` dicts into a new 

164 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

165 

166 Returns 

167 ------- 

168 dictionary : `NamedKeyDict` 

169 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

170 both `DatasetType` instances and string names usable as keys. 

171 """ 

172 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

173 

174 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]: 

175 """Iterate over the contained `DatasetRef` instances that match the 

176 given `DatasetType` and data IDs. 

177 

178 Parameters 

179 ---------- 

180 datasetType : `DatasetType` 

181 Dataset type to match. 

182 dataIds : `Iterable` [ `DataCoordinate` ] 

183 Data IDs to match. 

184 

185 Returns 

186 ------- 

187 refs : `Iterator` [ `DatasetRef` ] 

188 DatasetRef instances for which ``ref.datasetType == datasetType`` 

189 and ``ref.dataId`` is in ``dataIds``. 

190 """ 

191 refs = self[datasetType] 

192 return (refs[dataId] for dataId in dataIds) 

193 

194 

195class _QuantumScaffolding: 

196 """Helper class aggregating information about a `Quantum`, used when 

197 constructing a `QuantumGraph`. 

198 

199 See `_PipelineScaffolding` for a top-down description of the full 

200 scaffolding data structure. 

201 

202 Parameters 

203 ---------- 

204 task : _TaskScaffolding 

205 Back-reference to the helper object for the `PipelineTask` this quantum 

206 represents an execution of. 

207 dataId : `DataCoordinate` 

208 Data ID for this quantum. 

209 """ 

210 

211 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

212 self.task = task 

213 self.dataId = dataId 

214 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

215 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

216 self.prerequisites = _DatasetDict.fromDatasetTypes( 

217 task.prerequisites.keys(), universe=dataId.universe 

218 ) 

219 

220 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

221 

222 def __repr__(self): 

223 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

224 

225 task: _TaskScaffolding 

226 """Back-reference to the helper object for the `PipelineTask` this quantum 

227 represents an execution of. 

228 """ 

229 

230 dataId: DataCoordinate 

231 """Data ID for this quantum. 

232 """ 

233 

234 inputs: _DatasetDict 

235 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

236 

237 This is initialized to map each `DatasetType` to an empty dictionary at 

238 construction. Those nested dictionaries are populated (with data IDs as 

239 keys) with unresolved `DatasetRef` instances in 

240 `_PipelineScaffolding.connectDataIds`. 

241 """ 

242 

243 outputs: _DatasetDict 

244 """Nested dictionary containing `DatasetRef` outputs this quantum. 

245 """ 

246 

247 prerequisites: _DatasetDict 

248 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

249 quantum. 

250 """ 

251 

252 def makeQuantum(self) -> Quantum: 

253 """Transform the scaffolding object into a true `Quantum` instance. 

254 

255 Returns 

256 ------- 

257 quantum : `Quantum` 

258 An actual `Quantum` instance. 

259 """ 

260 allInputs = self.inputs.unpackMultiRefs() 

261 allInputs.update(self.prerequisites.unpackMultiRefs()) 

262 # Give the task's Connections class an opportunity to remove some 

263 # inputs, or complain if they are unacceptable. 

264 # This will raise if one of the check conditions is not met, which is 

265 # the intended behavior. 

266 # If it raises NoWorkFound, there is a bug in the QG algorithm 

267 # or the adjustQuantum is incorrectly trying to make a prerequisite 

268 # input behave like a regular input; adjustQuantum should only raise 

269 # NoWorkFound if a regular input is missing, and it shouldn't be 

270 # possible for us to have generated ``self`` if that's true. 

271 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

272 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

273 return Quantum( 

274 taskName=self.task.taskDef.taskName, 

275 taskClass=self.task.taskDef.taskClass, 

276 dataId=self.dataId, 

277 initInputs=self.task.initInputs.unpackSingleRefs(), 

278 inputs=helper.inputs, 

279 outputs=helper.outputs, 

280 ) 

281 

282 

283@dataclass 

284class _TaskScaffolding: 

285 """Helper class aggregating information about a `PipelineTask`, used when 

286 constructing a `QuantumGraph`. 

287 

288 See `_PipelineScaffolding` for a top-down description of the full 

289 scaffolding data structure. 

290 

291 Parameters 

292 ---------- 

293 taskDef : `TaskDef` 

294 Data structure that identifies the task class and its config. 

295 parent : `_PipelineScaffolding` 

296 The parent data structure that will hold the instance being 

297 constructed. 

298 datasetTypes : `TaskDatasetTypes` 

299 Data structure that categorizes the dataset types used by this task. 

300 """ 

301 

302 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

303 universe = parent.dimensions.universe 

304 self.taskDef = taskDef 

305 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

306 assert self.dimensions.issubset(parent.dimensions) 

307 # Initialize _DatasetDicts as subsets of the one or two 

308 # corresponding dicts in the parent _PipelineScaffolding. 

309 self.initInputs = _DatasetDict.fromSubset( 

310 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

311 ) 

312 self.initOutputs = _DatasetDict.fromSubset( 

313 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

314 ) 

315 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

316 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

317 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

318 self.dataIds = set() 

319 self.quanta = {} 

320 

321 def __repr__(self): 

322 # Default dataclass-injected __repr__ gets caught in an infinite loop 

323 # because of back-references. 

324 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

325 

326 taskDef: TaskDef 

327 """Data structure that identifies the task class and its config 

328 (`TaskDef`). 

329 """ 

330 

331 dimensions: DimensionGraph 

332 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

333 """ 

334 

335 initInputs: _DatasetDict 

336 """Dictionary containing information about datasets used to construct this 

337 task (`_DatasetDict`). 

338 """ 

339 

340 initOutputs: _DatasetDict 

341 """Dictionary containing information about datasets produced as a 

342 side-effect of constructing this task (`_DatasetDict`). 

343 """ 

344 

345 inputs: _DatasetDict 

346 """Dictionary containing information about datasets used as regular, 

347 graph-constraining inputs to this task (`_DatasetDict`). 

348 """ 

349 

350 outputs: _DatasetDict 

351 """Dictionary containing information about datasets produced by this task 

352 (`_DatasetDict`). 

353 """ 

354 

355 prerequisites: _DatasetDict 

356 """Dictionary containing information about input datasets that must be 

357 present in the repository before any Pipeline containing this task is run 

358 (`_DatasetDict`). 

359 """ 

360 

361 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

362 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

363 this task with that data ID. 

364 """ 

365 

366 def makeQuantumSet(self, unresolvedRefs: Optional[Set[DatasetRef]] = None) -> Set[Quantum]: 

367 """Create a `set` of `Quantum` from the information in ``self``. 

368 

369 Returns 

370 ------- 

371 nodes : `set` of `Quantum` 

372 The `Quantum` elements corresponding to this task. 

373 """ 

374 if unresolvedRefs is None: 

375 unresolvedRefs = set() 

376 outputs = set() 

377 for q in self.quanta.values(): 

378 try: 

379 tmpQuanta = q.makeQuantum() 

380 outputs.add(tmpQuanta) 

381 except (NoWorkFound, FileNotFoundError) as exc: 

382 refs = itertools.chain.from_iterable(self.inputs.unpackMultiRefs().values()) 

383 if unresolvedRefs.intersection(refs): 

384 # This means it is a node that is known to be pruned 

385 # later and should be left in even though some follow up 

386 # queries fail. This allows the pruning to start from this 

387 # quantum with known issues, and prune other nodes it 

388 # touches 

389 inputs = q.inputs.unpackMultiRefs() 

390 inputs.update(q.prerequisites.unpackMultiRefs()) 

391 tmpQuantum = Quantum( 

392 taskName=q.task.taskDef.taskName, 

393 taskClass=q.task.taskDef.taskClass, 

394 dataId=q.dataId, 

395 initInputs=q.task.initInputs.unpackSingleRefs(), 

396 inputs=inputs, 

397 outputs=q.outputs.unpackMultiRefs(), 

398 ) 

399 outputs.add(tmpQuantum) 

400 else: 

401 raise exc 

402 return outputs 

403 

404 

405@dataclass 

406class _PipelineScaffolding: 

407 """A helper data structure that organizes the information involved in 

408 constructing a `QuantumGraph` for a `Pipeline`. 

409 

410 Parameters 

411 ---------- 

412 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

413 Sequence of tasks from which a graph is to be constructed. Must 

414 have nested task classes already imported. 

415 universe : `DimensionUniverse` 

416 Universe of all possible dimensions. 

417 

418 Notes 

419 ----- 

420 The scaffolding data structure contains nested data structures for both 

421 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

422 data structures are shared between the pipeline-level structure (which 

423 aggregates all datasets and categorizes them from the perspective of the 

424 complete pipeline) and the individual tasks that use them as inputs and 

425 outputs. 

426 

427 `QuantumGraph` construction proceeds in four steps, with each corresponding 

428 to a different `_PipelineScaffolding` method: 

429 

430 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

431 the DatasetTypes used by the pipeline (delegating to 

432 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

433 nested `_TaskScaffolding` and `_DatasetDict` objects. 

434 

435 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

436 returns related tuples of all dimensions used to identify any regular 

437 input, output, and intermediate datasets (not prerequisites). We then 

438 iterate over these tuples of related dimensions, identifying the subsets 

439 that correspond to distinct data IDs for each task and dataset type, 

440 and then create `_QuantumScaffolding` objects. 

441 

442 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

443 dataset data IDs previously identified, transforming unresolved 

444 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

445 up prerequisite datasets for all quanta. 

446 

447 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

448 per-task `_QuantumScaffolding` objects. 

449 """ 

450 

451 def __init__(self, pipeline, *, registry): 

452 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

453 self.tasks = [] 

454 # Aggregate and categorize the DatasetTypes in the Pipeline. 

455 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

456 # Construct dictionaries that map those DatasetTypes to structures 

457 # that will (later) hold additional information about them. 

458 for attr in ( 

459 "initInputs", 

460 "initIntermediates", 

461 "initOutputs", 

462 "inputs", 

463 "intermediates", 

464 "outputs", 

465 "prerequisites", 

466 ): 

467 setattr( 

468 self, 

469 attr, 

470 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

471 ) 

472 # Aggregate all dimensions for all non-init, non-prerequisite 

473 # DatasetTypes. These are the ones we'll include in the big join 

474 # query. 

475 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

476 # Construct scaffolding nodes for each Task, and add backreferences 

477 # to the Task from each DatasetScaffolding node. 

478 # Note that there's only one scaffolding node for each DatasetType, 

479 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

480 # reference it. 

481 if isinstance(pipeline, Pipeline): 

482 pipeline = pipeline.toExpandedPipeline() 

483 self.tasks = [ 

484 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

485 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

486 ] 

487 

488 def __repr__(self): 

489 # Default dataclass-injected __repr__ gets caught in an infinite loop 

490 # because of back-references. 

491 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

492 

493 tasks: List[_TaskScaffolding] 

494 """Scaffolding data structures for each task in the pipeline 

495 (`list` of `_TaskScaffolding`). 

496 """ 

497 

498 initInputs: _DatasetDict 

499 """Datasets consumed but not produced when constructing the tasks in this 

500 pipeline (`_DatasetDict`). 

501 """ 

502 

503 initIntermediates: _DatasetDict 

504 """Datasets that are both consumed and produced when constructing the tasks 

505 in this pipeline (`_DatasetDict`). 

506 """ 

507 

508 initOutputs: _DatasetDict 

509 """Datasets produced but not consumed when constructing the tasks in this 

510 pipeline (`_DatasetDict`). 

511 """ 

512 

513 inputs: _DatasetDict 

514 """Datasets that are consumed but not produced when running this pipeline 

515 (`_DatasetDict`). 

516 """ 

517 

518 intermediates: _DatasetDict 

519 """Datasets that are both produced and consumed when running this pipeline 

520 (`_DatasetDict`). 

521 """ 

522 

523 outputs: _DatasetDict 

524 """Datasets produced but not consumed when when running this pipeline 

525 (`_DatasetDict`). 

526 """ 

527 

528 prerequisites: _DatasetDict 

529 """Datasets that are consumed when running this pipeline and looked up 

530 per-Quantum when generating the graph (`_DatasetDict`). 

531 """ 

532 

533 dimensions: DimensionGraph 

534 """All dimensions used by any regular input, intermediate, or output 

535 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

536 Query" (`DimensionGraph`). 

537 

538 This is required to be a superset of all task quantum dimensions. 

539 """ 

540 

541 @contextmanager 

542 def connectDataIds( 

543 self, 

544 registry, 

545 collections, 

546 userQuery, 

547 externalDataId, 

548 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

549 ): 

550 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

551 

552 This method populates `_TaskScaffolding.dataIds` and 

553 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

554 

555 Parameters 

556 ---------- 

557 registry : `lsst.daf.butler.Registry` 

558 Registry for the data repository; used for all data ID queries. 

559 collections 

560 Expressions representing the collections to search for input 

561 datasets. May be any of the types accepted by 

562 `lsst.daf.butler.CollectionSearch.fromExpression`. 

563 userQuery : `str` or `None` 

564 User-provided expression to limit the data IDs processed. 

565 externalDataId : `DataCoordinate` 

566 Externally-provided data ID that should be used to restrict the 

567 results, just as if these constraints had been included via ``AND`` 

568 in ``userQuery``. This includes (at least) any instrument named 

569 in the pipeline definition. 

570 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

571 The query constraint variant that should be used to constrain the 

572 query based on dataset existence; defaults to 

573 `DatasetQueryConstraintVariant.ALL`. 

574 

575 Returns 

576 ------- 

577 commonDataIds : \ 

578 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

579 An interface to a database temporary table containing all data IDs 

580 that will appear in this `QuantumGraph`. Returned inside a 

581 context manager, which will drop the temporary table at the end of 

582 the `with` block in which this method is called. 

583 """ 

584 _LOG.debug("Building query for data IDs.") 

585 # Initialization datasets always have empty data IDs. 

586 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

587 for datasetType, refs in itertools.chain( 

588 self.initInputs.items(), self.initIntermediates.items(), self.initOutputs.items() 

589 ): 

590 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

591 # Run one big query for the data IDs for task dimensions and regular 

592 # inputs and outputs. We limit the query to only dimensions that are 

593 # associated with the input dataset types, but don't (yet) try to 

594 # obtain the dataset_ids for those inputs. 

595 _LOG.debug("Submitting data ID query and materializing results.") 

596 queryArgs = {"dimensions": self.dimensions, "where": userQuery, "dataId": externalDataId} 

597 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

598 _LOG.debug("Constraining graph query using all datasets in pipeline.") 

599 queryArgs["datasets"] = list(self.inputs) 

600 queryArgs["collections"] = collections 

601 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

602 _LOG.debug("Not using dataset existence to constrain query.") 

603 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

604 constraint = set(datasetQueryConstraint) 

605 inputs = {k.name: k for k in self.inputs.keys()} 

606 if remainder := constraint.difference(inputs.keys()): 

607 raise ValueError( 

608 f"{remainder} dataset type(s) specified as a graph constraint, but" 

609 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

610 ) 

611 _LOG.debug(f"Constraining graph query using {constraint}") 

612 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

613 queryArgs["collections"] = collections 

614 else: 

615 raise ValueError( 

616 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

617 ) 

618 

619 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

620 _LOG.debug("Expanding data IDs.") 

621 commonDataIds = commonDataIds.expanded() 

622 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

623 # Iterate over query results, populating data IDs for datasets and 

624 # quanta and then connecting them to each other. 

625 n = -1 

626 for n, commonDataId in enumerate(commonDataIds): 

627 # Create DatasetRefs for all DatasetTypes from this result row, 

628 # noting that we might have created some already. 

629 # We remember both those that already existed and those that we 

630 # create now. 

631 refsForRow = {} 

632 dataIdCacheForRow: Dict[DimensionGraph, DataCoordinate] = {} 

633 for datasetType, refs in itertools.chain( 

634 self.inputs.items(), self.intermediates.items(), self.outputs.items() 

635 ): 

636 if not (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)): 

637 datasetDataId = commonDataId.subset(datasetType.dimensions) 

638 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

639 ref = refs.get(datasetDataId) 

640 if ref is None: 

641 ref = DatasetRef(datasetType, datasetDataId) 

642 refs[datasetDataId] = ref 

643 refsForRow[datasetType.name] = ref 

644 # Create _QuantumScaffolding objects for all tasks from this 

645 # result row, noting that we might have created some already. 

646 for task in self.tasks: 

647 quantumDataId = commonDataId.subset(task.dimensions) 

648 quantum = task.quanta.get(quantumDataId) 

649 if quantum is None: 

650 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

651 task.quanta[quantumDataId] = quantum 

652 # Whether this is a new quantum or an existing one, we can 

653 # now associate the DatasetRefs for this row with it. The 

654 # fact that a Quantum data ID and a dataset data ID both 

655 # came from the same result row is what tells us they 

656 # should be associated. 

657 # Many of these associations will be duplicates (because 

658 # another query row that differed from this one only in 

659 # irrelevant dimensions already added them), and we use 

660 # sets to skip them. 

661 for datasetType in task.inputs: 

662 ref = refsForRow[datasetType.name] 

663 quantum.inputs[datasetType.name][ref.dataId] = ref 

664 for datasetType in task.outputs: 

665 ref = refsForRow[datasetType.name] 

666 quantum.outputs[datasetType.name][ref.dataId] = ref 

667 if n < 0: 

668 emptiness_explained = False 

669 for message in commonDataIds.explain_no_results(): 

670 _LOG.warn(message) 

671 emptiness_explained = True 

672 if not emptiness_explained: 

673 _LOG.warn( 

674 "To reproduce this query for debugging purposes, run " 

675 "Registry.queryDataIds with these arguments:" 

676 ) 

677 # We could just repr() the queryArgs dict to get something 

678 # the user could make sense of, but it's friendlier to 

679 # put these args in an easier-to-construct equivalent form 

680 # so they can read it more easily and copy and paste into 

681 # a Python terminal. 

682 _LOG.warn(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

683 _LOG.warn(" dataId=%s,", queryArgs["dataId"].byName()) 

684 if queryArgs["where"]: 

685 _LOG.warn(" where=%s,", repr(queryArgs["where"])) 

686 if "datasets" in queryArgs: 

687 _LOG.warn(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

688 if "collections" in queryArgs: 

689 _LOG.warn(" collections=%s,", list(queryArgs["collections"])) 

690 _LOG.debug("Finished processing %d rows from data ID query.", n) 

691 yield commonDataIds 

692 

693 def resolveDatasetRefs( 

694 self, 

695 registry, 

696 collections, 

697 run, 

698 commonDataIds, 

699 *, 

700 skipExistingIn=None, 

701 clobberOutputs=True, 

702 constrainedByAllDatasets: bool = True, 

703 ): 

704 """Perform follow up queries for each dataset data ID produced in 

705 `fillDataIds`. 

706 

707 This method populates `_DatasetScaffolding.refs` (except for those in 

708 `prerequisites`). 

709 

710 Parameters 

711 ---------- 

712 registry : `lsst.daf.butler.Registry` 

713 Registry for the data repository; used for all data ID queries. 

714 collections 

715 Expressions representing the collections to search for input 

716 datasets. May be any of the types accepted by 

717 `lsst.daf.butler.CollectionSearch.fromExpression`. 

718 run : `str`, optional 

719 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

720 output datasets, if it already exists. 

721 commonDataIds : \ 

722 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

723 Result of a previous call to `connectDataIds`. 

724 skipExistingIn 

725 Expressions representing the collections to search for existing 

726 output datasets that should be skipped. May be any of the types 

727 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

728 `None` or empty string/sequence disables skipping. 

729 clobberOutputs : `bool`, optional 

730 If `True` (default), allow quanta to be created even if outputs exist; 

731 this requires the same behavior to be enabled when 

732 executing. If ``skipExistingIn`` is not `None`, completed quanta 

733 (those with metadata, or all outputs if there is no metadata 

734 dataset configured) will be skipped rather than clobbered. 

735 constrainedByAllDatasets : `bool`, optional 

736 Indicates if the commonDataIds were generated with a constraint on 

737 all dataset types. 

738 

739 Raises 

740 ------ 

741 OutputExistsError 

742 Raised if an output dataset already exists in the output run 

743 and ``skipExistingIn`` does not include the output run, or if only 

744 some outputs are present and ``clobberOutputs`` is `False`. 

745 """ 

746 skipCollections: Optional[CollectionSearch] = None 

747 skipExistingInRun = False 

748 if skipExistingIn: 

749 skipCollections = CollectionSearch.fromExpression(skipExistingIn) 

750 if run: 

751 # as optimization check in the explicit list of names first 

752 skipExistingInRun = run in skipCollections.explicitNames() 

753 if not skipExistingInRun: 

754 # need to flatten it and check again 

755 skipExistingInRun = run in registry.queryCollections( 

756 skipExistingIn, 

757 collectionTypes=CollectionType.RUN, 

758 ) 

759 

760 # Look up [init] intermediate and output datasets in the output 

761 # collection, if there is an output collection. 

762 if run is not None or skipCollections is not None: 

763 for datasetType, refs in itertools.chain( 

764 self.initIntermediates.items(), 

765 self.initOutputs.items(), 

766 self.intermediates.items(), 

767 self.outputs.items(), 

768 ): 

769 _LOG.debug( 

770 "Resolving %d datasets for intermediate and/or output dataset %s.", 

771 len(refs), 

772 datasetType.name, 

773 ) 

774 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

775 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

776 

777 # look at RUN collection first 

778 if run is not None: 

779 resolvedRefQueryResults = subset.findDatasets( 

780 datasetType, collections=run, findFirst=True 

781 ) 

782 for resolvedRef in resolvedRefQueryResults: 

783 # TODO: we could easily support per-DatasetType 

784 # skipExisting and I could imagine that being useful - 

785 # it's probably required in order to support writing 

786 # initOutputs before QuantumGraph generation. 

787 assert resolvedRef.dataId in refs 

788 if not (skipExistingInRun or isInit or clobberOutputs): 

789 raise OutputExistsError( 

790 f"Output dataset {datasetType.name} already exists in " 

791 f"output RUN collection '{run}' with data ID" 

792 f" {resolvedRef.dataId}." 

793 ) 

794 

795 # Also check skipExistingIn; if the RUN collection is in it, 

796 # that case was handled above. 

797 if skipCollections is not None: 

798 resolvedRefQueryResults = subset.findDatasets( 

799 datasetType, collections=skipCollections, findFirst=True 

800 ) 

801 for resolvedRef in resolvedRefQueryResults: 

802 assert resolvedRef.dataId in refs 

803 refs[resolvedRef.dataId] = resolvedRef 

804 

805 # Look up input and initInput datasets in the input collection(s). 

806 # Container to accumulate unfound refs, if the common data IDs were not 

807 # constrained on dataset type existence. 

808 self.unfoundRefs = set() 

809 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

810 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

811 resolvedRefQueryResults = commonDataIds.subset(datasetType.dimensions, unique=True).findDatasets( 

812 datasetType, collections=collections, findFirst=True 

813 ) 

814 dataIdsNotFoundYet = set(refs.keys()) 

815 for resolvedRef in resolvedRefQueryResults: 

816 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

817 refs[resolvedRef.dataId] = resolvedRef 

818 if dataIdsNotFoundYet: 

819 if constrainedByAllDatasets: 

820 raise RuntimeError( 

821 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

822 f"'{datasetType.name}' was/were present in a previous " 

823 f"query, but could not be found now." 

824 f"This is either a logic bug in QuantumGraph generation " 

825 f"or the input collections have been modified since " 

826 f"QuantumGraph generation began." 

827 ) 

828 else: 

829 # If the common data IDs were not constrained using all the 

830 # input dataset types, it is possible that some data IDs 

831 # found don't correspond to existing datasets and the refs 

832 # will remain unresolved. Mark these for later pruning from 

833 # the quantum graph. 

834 for k in dataIdsNotFoundYet: 

835 self.unfoundRefs.add(refs[k]) 

836 

837 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

838 # replacing the unresolved refs there, and then look up prerequisites. 

839 for task in self.tasks: 

840 _LOG.debug( 

841 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

842 len(task.quanta), 

843 task.taskDef.label, 

844 ) 

845 lookupFunctions = { 

846 c.name: c.lookupFunction 

847 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

848 if c.lookupFunction is not None 

849 } 

850 dataIdsFailed = [] 

851 dataIdsSucceeded = [] 

852 for quantum in task.quanta.values(): 

853 # Process outputs datasets only if skipExistingIn is not None 

854 # or there is a run to look for outputs in and clobberOutputs 

855 # is True. Note that if skipExistingIn is None, any output 

856 # datasets that already exist would have already caused an 

857 # exception to be raised. We never update the DatasetRefs in 

858 # the quantum because those should never be resolved. 

859 if skipCollections is not None or (run is not None and clobberOutputs): 

860 resolvedRefs = [] 

861 unresolvedRefs = [] 

862 haveMetadata = False 

863 for datasetType, originalRefs in quantum.outputs.items(): 

864 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

865 if ref.id is not None: 

866 resolvedRefs.append(ref) 

867 if datasetType.name == task.taskDef.metadataDatasetName: 

868 haveMetadata = True 

869 else: 

870 unresolvedRefs.append(ref) 

871 if resolvedRefs: 

872 if haveMetadata or not unresolvedRefs: 

873 dataIdsSucceeded.append(quantum.dataId) 

874 if skipCollections is not None: 

875 continue 

876 else: 

877 dataIdsFailed.append(quantum.dataId) 

878 if not clobberOutputs: 

879 raise OutputExistsError( 

880 f"Quantum {quantum.dataId} of task with label " 

881 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

882 f"({resolvedRefs}) " 

883 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

884 "and clobbering outputs was not enabled." 

885 ) 

886 # Update the input DatasetRefs to the resolved ones we already 

887 # searched for. 

888 for datasetType, refs in quantum.inputs.items(): 

889 for ref in task.inputs.extract(datasetType, refs.keys()): 

890 refs[ref.dataId] = ref 

891 # Look up prerequisite datasets in the input collection(s). 

892 # These may have dimensions that extend beyond those we queried 

893 # for originally, because we want to permit those data ID 

894 # values to differ across quanta and dataset types. 

895 for datasetType in task.prerequisites: 

896 lookupFunction = lookupFunctions.get(datasetType.name) 

897 if lookupFunction is not None: 

898 # PipelineTask has provided its own function to do the 

899 # lookup. This always takes precedence. 

900 refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

901 elif ( 

902 datasetType.isCalibration() 

903 and datasetType.dimensions <= quantum.dataId.graph 

904 and quantum.dataId.graph.temporal 

905 ): 

906 # This is a master calibration lookup, which we have to 

907 # handle specially because the query system can't do a 

908 # temporal join on a non-dimension-based timespan yet. 

909 timespan = quantum.dataId.timespan 

910 try: 

911 refs = [ 

912 registry.findDataset( 

913 datasetType, quantum.dataId, collections=collections, timespan=timespan 

914 ) 

915 ] 

916 except KeyError: 

917 # This dataset type is not present in the registry, 

918 # which just means there are no datasets here. 

919 refs = [] 

920 else: 

921 # Most general case. 

922 refs = list( 

923 registry.queryDatasets( 

924 datasetType, collections=collections, dataId=quantum.dataId, findFirst=True 

925 ).expanded() 

926 ) 

927 quantum.prerequisites[datasetType].update( 

928 {ref.dataId: ref for ref in refs if ref is not None} 

929 ) 

930 # Actually remove any quanta that we decided to skip above. 

931 if dataIdsSucceeded: 

932 if skipCollections is not None: 

933 _LOG.debug( 

934 "Pruning successful %d quanta for task with label '%s' because all of their " 

935 "outputs exist or metadata was written successfully.", 

936 len(dataIdsSucceeded), 

937 task.taskDef.label, 

938 ) 

939 for dataId in dataIdsSucceeded: 

940 del task.quanta[dataId] 

941 elif clobberOutputs: 

942 _LOG.info( 

943 "Found %d successful quanta for task with label '%s' " 

944 "that will need to be clobbered during execution.", 

945 len(dataIdsSucceeded), 

946 task.taskDef.label, 

947 ) 

948 else: 

949 raise AssertionError("OutputExistsError should have already been raised.") 

950 if dataIdsFailed: 

951 if clobberOutputs: 

952 _LOG.info( 

953 "Found %d failed/incomplete quanta for task with label '%s' " 

954 "that will need to be clobbered during execution.", 

955 len(dataIdsFailed), 

956 task.taskDef.label, 

957 ) 

958 else: 

959 raise AssertionError("OutputExistsError should have already been raised.") 

960 

961 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None): 

962 """Create a `QuantumGraph` from the quanta already present in 

963 the scaffolding data structure. 

964 

965 Parameters 

966 ---------- 

967 metadata : Optional Mapping of `str` to primitives 

968 Optional extra data to carry with the graph. Entries in this 

969 mapping should be able to be serialized in 

970 JSON. 

971 

972 Returns 

973 ------- 

974 graph : `QuantumGraph` 

975 The full `QuantumGraph`. 

976 """ 

977 graphInput: Dict[TaskDef, Set[Quantum]] = {} 

978 for task in self.tasks: 

979 qset = task.makeQuantumSet(unresolvedRefs=self.unfoundRefs) 

980 graphInput[task.taskDef] = qset 

981 

982 graph = QuantumGraph(graphInput, metadata=metadata, pruneRefs=self.unfoundRefs) 

983 return graph 

984 

985 

986# ------------------------ 

987# Exported definitions -- 

988# ------------------------ 

989 

990 

991class GraphBuilderError(Exception): 

992 """Base class for exceptions generated by graph builder.""" 

993 

994 pass 

995 

996 

997class OutputExistsError(GraphBuilderError): 

998 """Exception generated when output datasets already exist.""" 

999 

1000 pass 

1001 

1002 

1003class PrerequisiteMissingError(GraphBuilderError): 

1004 """Exception generated when a prerequisite dataset does not exist.""" 

1005 

1006 pass 

1007 

1008 

1009class GraphBuilder(object): 

1010 """GraphBuilder class is responsible for building task execution graph from 

1011 a Pipeline. 

1012 

1013 Parameters 

1014 ---------- 

1015 registry : `~lsst.daf.butler.Registry` 

1016 Registry for the data repository. 

1017 skipExistingIn 

1018 Expressions representing the collections to search for existing 

1019 output datasets that should be skipped. May be any of the types 

1020 accepted by `lsst.daf.butler.CollectionSearch.fromExpression`. 

1021 clobberOutputs : `bool`, optional 

1022 If `True` (default), allow quanta to be created even if partial outputs 

1023 exist; this requires the same behavior to be enabled when 

1024 executing. 

1025 """ 

1026 

1027 def __init__(self, registry, skipExistingIn=None, clobberOutputs=True): 

1028 self.registry = registry 

1029 self.dimensions = registry.dimensions 

1030 self.skipExistingIn = skipExistingIn 

1031 self.clobberOutputs = clobberOutputs 

1032 

1033 def makeGraph( 

1034 self, 

1035 pipeline, 

1036 collections, 

1037 run, 

1038 userQuery, 

1039 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1040 metadata: Optional[Mapping[str, Any]] = None, 

1041 ): 

1042 """Create execution graph for a pipeline. 

1043 

1044 Parameters 

1045 ---------- 

1046 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1047 Pipeline definition, task names/classes and their configs. 

1048 collections 

1049 Expressions representing the collections to search for input 

1050 datasets. May be any of the types accepted by 

1051 `lsst.daf.butler.CollectionSearch.fromExpression`. 

1052 run : `str`, optional 

1053 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1054 output datasets, if it already exists. 

1055 userQuery : `str` 

1056 User expression that selects the data to process in the registry; should be 

1057 empty or `None` if there are no restrictions on data selection. 

1058 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1059 The query constraint variant that should be used to constrain the 

1060 query based on dataset existence; defaults to 

1061 `DatasetQueryConstraintVariant.ALL`. 

1062 metadata : Optional Mapping of `str` to primitives 

1063 Optional extra data to carry with the graph. Entries in this 

1064 mapping should be able to be serialized in 

1065 JSON. 

1066 

1067 Returns 

1068 ------- 

1069 graph : `QuantumGraph` 

1070 

1071 Raises 

1072 ------ 

1073 UserExpressionError 

1074 Raised when user expression cannot be parsed. 

1075 OutputExistsError 

1076 Raised when output datasets already exist. 

1077 Exception 

1078 Other exceptions types may be raised by underlying registry 

1079 classes. 

1080 """ 

1081 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1082 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1083 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1084 instrument = None 

1085 if isinstance(pipeline, Pipeline): 

1086 instrument = pipeline.getInstrument() 

1087 if isinstance(instrument, str): 

1088 instrument = doImport(instrument) 

1089 pipeline = list(pipeline.toExpandedPipeline()) 

1090 if instrument is not None: 

1091 dataId = DataCoordinate.standardize( 

1092 instrument=instrument.getName(), universe=self.registry.dimensions 

1093 ) 

1094 else: 

1095 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1096 with scaffolding.connectDataIds( 

1097 self.registry, collections, userQuery, dataId, datasetQueryConstraint 

1098 ) as commonDataIds: 

1099 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1100 scaffolding.resolveDatasetRefs( 

1101 self.registry, 

1102 collections, 

1103 run, 

1104 commonDataIds, 

1105 skipExistingIn=self.skipExistingIn, 

1106 clobberOutputs=self.clobberOutputs, 

1107 constrainedByAllDatasets=condition, 

1108 ) 

1109 return scaffolding.makeQuantumGraph(metadata=metadata)
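
For orientation, a minimal usage sketch of the public `GraphBuilder` API follows. The repository path, pipeline file, collection names, run name, and query string are hypothetical placeholders, and the import locations are assumed rather than taken from this file.

from lsst.daf.butler import Butler
from lsst.pipe.base import GraphBuilder, Pipeline

# Hypothetical repository and pipeline definition.
butler = Butler("/repo/example")
pipeline = Pipeline.fromFile("pipeline.yaml")

builder = GraphBuilder(butler.registry, skipExistingIn=None, clobberOutputs=True)
qgraph = builder.makeGraph(
    pipeline,
    collections=["HSC/raw/all", "HSC/calib"],       # hypothetical input collections
    run="u/someone/demo",                           # hypothetical output RUN collection
    userQuery="instrument='HSC' AND visit=12345",   # hypothetical data ID constraint
)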