# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionGraph):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to a `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)
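
# A hedged illustration of how _DatasetDict is typically used (the dataset
# type, data ID, and universe names below are hypothetical; in practice they
# come from the registry and the pipeline's connections):
#
#     refs = _DatasetDict.fromDatasetTypes([calexpType], universe=universe)
#     refs[calexpType][dataId] = DatasetRef(calexpType, dataId)
#     refs.unpackSingleRefs()   # NamedKeyDict: {calexpType: DatasetRef(...)}
#     refs.unpackMultiRefs()    # NamedKeyDict: {calexpType: [DatasetRef(...)]}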


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : `_TaskScaffolding`
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        config = self.task.taskDef.config
        connections = config.connections.ConnectionsClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        allInputs = connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )
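
# ``makeQuantum`` above gives the task's connections class a chance to prune
# or veto its inputs via ``adjustQuantum``.  A minimal, hedged sketch of such
# an override (the class name and connection contents are hypothetical, and
# the single-argument signature is inferred from the call site above):
#
#     class ExampleConnections(PipelineTaskConnections, dimensions=("visit",)):
#         def adjustQuantum(self, datasetRefMap):
#             datasetRefMap = super().adjustQuantum(datasetRefMap)
#             # Drop unwanted refs from datasetRefMap here, or raise an
#             # exception to reject the quantum outright.
#             return datasetRefMap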


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )
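
# Because _DatasetDict.fromSubset extracts values from the parent dicts, the
# nested per-data-ID dictionaries above are shared with the parent
# _PipelineScaffolding rather than copied.  A hedged illustration (``parent``,
# ``task``, and ``someInputType`` are hypothetical names):
#
#     # For a dataset type that is an overall input of the pipeline:
#     task.inputs[someInputType] is parent.inputs[someInputType]   # True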


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to categorize the pipeline's
        dataset types and to define the universe of all possible dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task.  Note that there is only
        # one dictionary of refs for each DatasetType, shared by
        # _PipelineScaffolding and all _TaskScaffoldings that reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the nested
        `_DatasetDict` dictionaries with unresolved `DatasetRef` instances
        (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.
                # We remember both those that already existed and those that
                # we create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(),
                                                         self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them), and the
                    # dictionary assignments below simply overwrite them.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method replaces the unresolved `DatasetRef` instances created by
        `connectDataIds` with resolved ones where appropriate, and looks up
        prerequisite datasets for all quanta.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled by the
            per-quantum checks later in this method instead.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful -
                    # it's probably required in order to support writing
                    # initOutputs before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now.  "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True.  Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the output DatasetRefs in the quantum
                # because those should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those data
                # ID values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their outputs "
                           "exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph
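
# A hedged sketch of the intended calling sequence for _PipelineScaffolding;
# this mirrors GraphBuilder.makeGraph below, and the registry, collections,
# run, and query-string arguments are assumed to come from the caller:
#
#     scaffolding = _PipelineScaffolding(pipeline, registry=registry)
#     with scaffolding.connectDataIds(registry, collections, userQuery) as ids:
#         scaffolding.resolveDatasetRefs(registry, collections, run, ids,
#                                        skipExisting=True)
#     graph = scaffolding.makeQuantumGraph()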


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String that defines a user-provided selection expression for the
            registry; should be empty or `None` if there are no restrictions
            on data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()
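
# Example usage (a minimal, hedged sketch; the butler, pipeline, collection
# search, and run name below are hypothetical and would normally be set up by
# command-line tooling such as pipetask):
#
#     builder = GraphBuilder(butler.registry, skipExisting=True)
#     graph = builder.makeGraph(pipeline, collections, run="u/someone/run",
#                               userQuery="instrument = 'HSC' AND visit = 12345")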