# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])

class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to a `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


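# The nested layout of _DatasetDict, and the difference between the two
# unpack helpers, as a hedged illustrative sketch (the dataset type and
# data ID names below are hypothetical, not taken from any real repository):
#
#     d = _DatasetDict.fromDatasetTypes([calexpType], universe=universe)
#     d[calexpType][dataId] = DatasetRef(calexpType, dataId)
#     d.unpackSingleRefs()   # {calexpType: DatasetRef(...)} - one ref per type
#     d.unpackMultiRefs()    # {calexpType: [DatasetRef(...), ...]} - all refs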

class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        config = self.task.taskDef.config
        connections = config.connections.ConnectionsClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        allInputs = connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to categorize the dataset
        types in the pipeline.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
    """
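    # The four steps above map onto the methods below; `GraphBuilder.makeGraph`
    # at the bottom of this module drives them in order, roughly:
    #
    #     scaffolding = _PipelineScaffolding(pipeline, registry=registry)   # step 1
    #     scaffolding.connectDataIds(registry, collections, userQuery)      # step 2
    #     scaffolding.resolveDatasetRefs(registry, collections, run)        # step 3
    #     graph = scaffolding.makeQuantumGraph()                            # step 4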

    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates the nested dictionaries of the non-prerequisite
        `_DatasetDict` attributes with data IDs and unresolved `DatasetRef`
        instances, and creates the `_QuantumScaffolding` objects in
        `_TaskScaffolding.quanta`.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and processing results.")
        resultIter = registry.queryDimensions(
            self.dimensions,
            datasets=list(self.inputs),
            collections=collections,
            where=userQuery,
        )
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other.
        n = -1  # In case the query returns no results at all.
        for n, commonDataId in enumerate(resultIter):
            # Create DatasetRefs for all DatasetTypes from this result row,
            # noting that we might have created some already.
            # We remember both those that already existed and those that we
            # create now.
            refsForRow = {}
            for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                     self.outputs.items()):
                datasetDataId = commonDataId.subset(datasetType.dimensions)
                ref = refs.get(datasetDataId)
                if ref is None:
                    ref = DatasetRef(datasetType, datasetDataId)
                    refs[datasetDataId] = ref
                refsForRow[datasetType.name] = ref
            # Create _QuantumScaffolding objects for all tasks from this
            # result row, noting that we might have created some already.
            for task in self.tasks:
                quantumDataId = commonDataId.subset(task.dimensions)
                quantum = task.quanta.get(quantumDataId)
                if quantum is None:
                    quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                    task.quanta[quantumDataId] = quantum
                # Whether this is a new quantum or an existing one, we can
                # now associate the DatasetRefs for this row with it.  The
                # fact that a Quantum data ID and a dataset data ID both came
                # from the same result row is what tells us they should be
                # associated.
                # Many of these associations will be duplicates (because
                # another query row that differed from this one only in
                # irrelevant dimensions already added them); the nested
                # dictionaries are keyed on data ID, so duplicates simply
                # overwrite.
                for datasetType in task.inputs:
                    ref = refsForRow[datasetType.name]
                    quantum.inputs[datasetType.name][ref.dataId] = ref
                for datasetType in task.outputs:
                    ref = refsForRow[datasetType.name]
                    quantum.outputs[datasetType.name][ref.dataId] = ref
        if n >= 0:
            _LOG.debug("Finished processing %d rows from data ID query.", n + 1)
        else:
            _LOG.debug("Received no rows from data ID query.")

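    # A hedged sketch of how one query row is shared out in connectDataIds
    # above (the dimension names are hypothetical, just for illustration):
    # a row carrying values for {instrument, visit, detector, patch, ...} is
    # subset to each dataset type's and each task's own dimensions, e.g.
    #
    #     datasetDataId = commonDataId.subset(datasetType.dimensions)
    #     quantumDataId = commonDataId.subset(task.dimensions)
    #
    # so every dataset and quantum whose data ID can be derived from the same
    # row ends up connected to each other.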

    def resolveDatasetRefs(self, registry, collections, run, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method resolves the `DatasetRef` instances in the non-prerequisite
        `_DatasetDict` attributes and looks up prerequisite datasets for each
        quantum.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``.  Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`.  The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled later in this
            method instead.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                for dataId, unresolvedRef in refs.items():
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful - it's
                    # probably required in order to support writing initOutputs
                    # before QuantumGraph generation.
                    ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run)
                    if ref is not None:
                        if skipExisting:
                            refs[dataId] = ref
                        else:
                            raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                    f"output RUN collection '{run}' with data ID "
                                                    f"{dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            for dataId in refs:
                refs[dataId] = registry.findDataset(datasetType, dataId=dataId, collections=collections)
            if any(ref is None for ref in refs.values()):
                raise RuntimeError(
                    f"One or more datasets of type '{datasetType.name}' were "
                    f"present in a previous query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation, "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True.  Note that if
                # skipExisting is False, any output datasets that already
                # exist would have already caused an exception to be raised.
                # We never update the DatasetRefs in the quantum because
                # those should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                f"({resolvedRefs}) and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we
                # already searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we
                # queried for originally, because we want to permit those
                # data ID values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(
                            registry.queryDatasets(
                                datasetType,
                                collections=collections,
                                dataId=quantum.dataId,
                                deduplicate=True,
                                expand=True,
                            )
                        )
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

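    # A hedged sketch of the per-connection lookup-function hook used above:
    # a prerequisite connection may provide a ``lookupFunction`` that replaces
    # the default registry.queryDatasets call, and it is invoked as
    #
    #     refs = lookupFunction(datasetType, registry, quantum.dataId, collections)
    #
    # returning an iterable of DatasetRef for that quantum (the variable names
    # mirror the call in resolveDatasetRefs; any concrete implementation is up
    # to the task author).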

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository; used for all dataset and data ID
        queries.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String that defines a user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        scaffolding.connectDataIds(self.registry, collections, userQuery)
        scaffolding.resolveDatasetRefs(self.registry, collections, run, skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()
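
# A hedged end-to-end sketch of how this module is typically driven (the
# butler path, run name, and query string below are hypothetical
# placeholders, not part of this module):
#
#     from lsst.daf.butler import Butler
#     butler = Butler("/path/to/repo")
#     builder = GraphBuilder(butler.registry, skipExisting=True)
#     graph = builder.makeGraph(
#         pipeline,                        # an expanded `Pipeline`
#         collections,                     # a CollectionSearch of input collections
#         run="u/someone/outputs",         # output RUN collection name
#         userQuery="instrument = 'HSC'",  # registry expression, may be None
#     )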