
1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ['GraphBuilder'] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32from collections import ChainMap 

33from dataclasses import dataclass 

34from typing import Set, List, Dict, Optional, Iterable 

35import logging 

36 

37# ----------------------------- 

38# Imports for other modules -- 

39# ----------------------------- 

40from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline 

41from .graph import QuantumGraph, QuantumGraphTaskNodes 

42from lsst.daf.butler import ( 

43 DatasetRef, 

44 DatasetType, 

45 DimensionGraph, 

46 DimensionUniverse, 

47 ExpandedDataCoordinate, 

48 Quantum, 

49) 

50from lsst.daf.butler.core.utils import NamedKeyDict 

51 

52# ---------------------------------- 

53# Local non-exported definitions -- 

54# ---------------------------------- 

55 
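# Strip the first component of the module path (everything before the first
# ".") so log messages are emitted under the shorter, package-relative name.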

56_LOG = logging.getLogger(__name__.partition(".")[2]) 

57 

58 

59@dataclass 

60class _DatasetScaffolding: 

61 """Helper class aggregating information about a `DatasetType`, used when 

62 constructing a `QuantumGraph`. 

63 

64 `_DatasetScaffolding` does not hold the `DatasetType` instance itself 

65 because it is usually used as the value type in `_DatasetScaffoldingDict`, 

66 which uses `DatasetType` instances as keys. 

67 

68 See `_PipelineScaffolding` for a top-down description of the full 

69 scaffolding data structure. 

70 

71 Parameters 

72 ---------- 

73 dimensions : `DimensionGraph` 

74 Dimensions of the `DatasetType`. 

75 """ 

76 def __init__(self, dimensions: DimensionGraph): 

77 self.dimensions = dimensions 

78 self.producer = None 

79 self.consumers = {} 

80 self.dataIds = set() 

81 self.refs = [] 

82 

83 __slots__ = ("dimensions", "producer", "consumers", "dataIds", "refs") 

84 

85 def __repr__(self): 

86 # Default dataclass-injected __repr__ gets caught in an infinite loop 

87 # because of back-references. 

88 return f"_DatasetScaffolding(dimensions={self.dimensions}, ...)" 

89 

90 dimensions: DimensionGraph 

91 """The dimensions of the dataset type (`DimensionGraph`). 

92 

93 Set during `_PipelineScaffolding` construction. 

94 """ 

95 

96 producer: Optional[_TaskScaffolding] 

97 """The scaffolding objects for the Task that produces this dataset. 

98 

99 Set during `_PipelineScaffolding` construction. 

100 """ 

101 

102 consumers: Dict[str, _TaskScaffolding] 

103 """The scaffolding objects for the Tasks that consume this dataset, 

104 keyed by their label in the `Pipeline`. 

105 

106 Set during `_PipelineScaffolding` construction. 

107 """ 

108 

109 dataIds: Set[ExpandedDataCoordinate] 

110 """Data IDs for all instances of this dataset type in the graph. 

111 

112 Populated after construction by `_PipelineScaffolding.fillDataIds`. 

113 """ 

114 

115 refs: List[DatasetRef] 

116 """References for all instances of this dataset type in the graph. 

117 

118 Populated after construction by `_PipelineScaffolding.fillDatasetRefs`. 

119 """ 

120 

121 

122class _DatasetScaffoldingDict(NamedKeyDict): 

123 """Custom dictionary that maps `DatasetType` to `_DatasetScaffolding`. 

124 

125 See `_PipelineScaffolding` for a top-down description of the full 

126 scaffolding data structure. 

127 

128 Parameters 

129 ---------- 

130 args 

131 Positional arguments are forwarded to the `dict` constructor. 

132 universe : `DimensionUniverse` 

133 Universe of all possible dimensions. 

134 """ 

135 def __init__(self, *args, universe: DimensionUniverse): 

136 super().__init__(*args) 

137 self.universe = universe 

138 

139 @classmethod 

140 def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *, 

141 universe: DimensionUniverse) -> _DatasetScaffoldingDict: 

142 """Construct a a dictionary from a flat iterable of `DatasetType` keys. 

143 

144 Parameters 

145 ---------- 

146 datasetTypes : `iterable` of `DatasetType` 

147 DatasetTypes to use as keys for the dict. Values will be 

148 constructed from the dimensions of the keys. 

149 universe : `DimensionUniverse` 

150 Universe of all possible dimensions. 

151 

152 Returns 

153 ------- 

154 dictionary : `_DatasetScaffoldingDict` 

155 A new dictionary instance. 

156 """ 

157 return cls(((datasetType, _DatasetScaffolding(datasetType.dimensions)) 

158 for datasetType in datasetTypes), 

159 universe=universe) 

160 

161 @classmethod 

162 def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetScaffoldingDict, 

163 *rest) -> _DatasetScaffoldingDict: 

164 """Return a new dictionary by extracting items corresponding to the 

165 given keys from one or more existing dictionaries. 

166 

167 Parameters 

168 ---------- 

169 datasetTypes : `iterable` of `DatasetType` 

170 DatasetTypes to use as keys for the dict. Values will be obtained 

171 by lookups against ``first`` and ``rest``. 

172 first : `_DatasetScaffoldingDict` 

173 Another dictionary from which to extract values. 

174 rest 

175 Additional dictionaries from which to extract values. 

176 

177 Returns 

178 ------- 

179 dictionary : `_DatasetScaffoldingDict` 

180 A new dictionary instance. 

181 """ 

182 combined = ChainMap(first, *rest) 

183 return cls(((datasetType, combined[datasetType]) for datasetType in datasetTypes), 

184 universe=first.universe) 

185 

186 @property 

187 def dimensions(self) -> DimensionGraph: 

188 """The union of all dimensions used by all dataset types in this 

189 dictionary, including implied dependencies (`DimensionGraph`). 

190 """ 

191 base = self.universe.empty 

192 if len(self) == 0: 

193 return base 

194 return base.union(*[scaffolding.dimensions for scaffolding in self.values()]) 

195 

196 def unpackRefs(self) -> NamedKeyDict: 

197 """Unpack nested single-element `DatasetRef` lists into a new 

198 dictionary. 

199 

200 This method assumes that each `_DatasetScaffolding.refs` list contains 

201 exactly one `DatasetRef`, as is the case for all "init" datasets. 

202 

203 Returns 

204 ------- 

205 dictionary : `NamedKeyDict` 

206 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

207 `DatasetType` instances and string names usable as keys. 

208 """ 

209 return NamedKeyDict((datasetType, scaffolding.refs[0]) for datasetType, scaffolding in self.items()) 

210 

211 

212@dataclass 

213class _TaskScaffolding: 

214 """Helper class aggregating information about a `PipelineTask`, used when 

215 constructing a `QuantumGraph`. 

216 

217 See `_PipelineScaffolding` for a top-down description of the full 

218 scaffolding data structure. 

219 

220 Parameters 

221 ---------- 

222 taskDef : `TaskDef` 

223 Data structure that identifies the task class and its config. 

224 parent : `_PipelineScaffolding` 

225 The parent data structure that will hold the instance being 

226 constructed. 

227 datasetTypes : `TaskDatasetTypes` 

228 Data structure that categorizes the dataset types used by this task. 

229 

230 Raises 

231 ------ 

232 GraphBuilderError 

233 Raised if the task's dimensions are not a subset of the union of the 

234 pipeline's dataset dimensions. 

235 """ 

236 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

237 universe = parent.dimensions.universe 

238 self.taskDef = taskDef 

239 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

240 if not self.dimensions.issubset(parent.dimensions): 

241 raise GraphBuilderError(f"Task with label '{taskDef.label}' has dimensions " 

242 f"{self.dimensions} that are not a subset of " 

243 f"the pipeline dimensions {parent.dimensions}.") 

244 

245 # Initialize _DatasetScaffoldingDicts as subsets of the one or two 

246 # corresponding dicts in the parent _PipelineScaffolding. 

247 self.initInputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initInputs, 

248 parent.initInputs, parent.initIntermediates) 

249 self.initOutputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.initOutputs, 

250 parent.initIntermediates, parent.initOutputs) 

251 self.inputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.inputs, 

252 parent.inputs, parent.intermediates) 

253 self.outputs = _DatasetScaffoldingDict.fromSubset(datasetTypes.outputs, 

254 parent.intermediates, parent.outputs) 

255 self.prerequisites = _DatasetScaffoldingDict.fromSubset(datasetTypes.prerequisites, 

256 parent.prerequisites) 

257 # Add backreferences to the _DatasetScaffolding objects that point to 

258 # this Task. 

259 for dataset in itertools.chain(self.initInputs.values(), self.inputs.values(), 

260 self.prerequisites.values()): 

261 dataset.consumers[self.taskDef.label] = self 

262 for dataset in itertools.chain(self.initOutputs.values(), self.outputs.values()): 

263 assert dataset.producer is None 

264 dataset.producer = self 

265 self.dataIds = set() 

266 self.quanta = [] 

267 

268 def __repr__(self): 

269 # Default dataclass-injected __repr__ gets caught in an infinite loop 

270 # because of back-references. 

271 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

272 

273 taskDef: TaskDef 

274 """Data structure that identifies the task class and its config 

275 (`TaskDef`). 

276 """ 

277 

278 dimensions: DimensionGraph 

279 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

280 """ 

281 

282 initInputs: _DatasetScaffoldingDict 

283 """Dictionary containing information about datasets used to construct this 

284 task (`_DatasetScaffoldingDict`). 

285 """ 

286 

287 initOutputs: _DatasetScaffoldingDict 

288 """Dictionary containing information about datasets produced as a 

289 side-effect of constructing this task (`_DatasetScaffoldingDict`). 

290 """ 

291 

292 inputs: _DatasetScaffoldingDict 

293 """Dictionary containing information about datasets used as regular, 

294 graph-constraining inputs to this task (`_DatasetScaffoldingDict`). 

295 """ 

296 

297 outputs: _DatasetScaffoldingDict 

298 """Dictionary containing information about datasets produced by this task 

299 (`_DatasetScaffoldingDict`). 

300 """ 

301 

302 prerequisites: _DatasetScaffoldingDict 

303 """Dictionary containing information about input datasets that must be 

304 present in the repository before any Pipeline containing this task is run 

305 (`_DatasetScaffoldingDict`). 

306 """ 

307 

308 dataIds: Set[ExpandedDataCoordinate] 

309 """Data IDs for all quanta for this task in the graph (`set` of 

310 `ExpandedDataCoordinate`). 

311 

312 Populated after construction by `_PipelineScaffolding.fillDataIds`. 

313 """ 

314 

315 quanta: List[Quantum] 

316 """All quanta for this task in the graph (`list` of `Quantum`). 

317 

318 Populated after construction by `_PipelineScaffolding.fillQuanta`. 

319 """ 

320 

321 def addQuantum(self, quantum: Quantum): 
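        """Add a quantum to this task's list of quanta.

        The task's connections class is given the opportunity to adjust the
        quantum's predicted inputs via ``adjustQuantum`` before the quantum is
        appended; that call raises if any of its checks fail.

        Parameters
        ----------
        quantum : `Quantum`
            Quantum to add.
        """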

322 config = self.taskDef.config 

323 connectionClass = config.connections.ConnectionsClass 

324 connectionInstance = connectionClass(config=config) 

325 # adjustQuantum will raise if one of its check conditions is not met, 

326 # which is the intended behavior. 

327 result = connectionInstance.adjustQuantum(quantum.predictedInputs) 

328 quantum._predictedInputs = NamedKeyDict(result) 

329 

330 # If we have gotten this far, the checks passed; add the quantum. 

331 self.quanta.append(quantum) 

332 

333 def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes: 

334 """Create a `QuantumGraphTaskNodes` instance from the information in 

335 ``self``. 

336 

337 Returns 

338 ------- 

339 nodes : `QuantumGraphTaskNodes` 

340 The `QuantumGraph` elements corresponding to this task. 

341 """ 

342 return QuantumGraphTaskNodes( 

343 taskDef=self.taskDef, 

344 quanta=self.quanta, 

345 initInputs=self.initInputs.unpackRefs(), 

346 initOutputs=self.initOutputs.unpackRefs(), 

347 ) 

348 

349 

350@dataclass 

351class _PipelineScaffolding: 

352 """A helper data structure that organizes the information involved in 

353 constructing a `QuantumGraph` for a `Pipeline`. 

354 

355 Parameters 

356 ---------- 

357 pipeline : `Pipeline` 

358 Sequence of tasks from which a graph is to be constructed. Must 

359 have nested task classes already imported. 

360 registry : `lsst.daf.butler.Registry` 

361 Registry for the data repository; used to look up dataset types and the dimension universe. 

362 

363 Raises 

364 ------ 

365 GraphBuilderError 

366 Raised if the task's dimensions are not a subset of the union of the 

367 pipeline's dataset dimensions. 

368 

369 Notes 

370 ----- 

371 The scaffolding data structure contains nested data structures for both 

372 tasks (`_TaskScaffolding`) and datasets (`_DatasetScaffolding`), with the 

373 latter held by `_DatasetScaffoldingDict`. The dataset data structures are 

374 shared between the pipeline-level structure (which aggregates all datasets 

375 and categorizes them from the perspective of the complete pipeline) and the 

376 individual tasks that use them as inputs and outputs. 

377 

378 `QuantumGraph` construction proceeds in five steps, with each corresponding 

379 to a different `_PipelineScaffolding` method: 

380 

381 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

382 the DatasetTypes used by the pipeline (delegating to 

383 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

384 nested `_TaskScaffolding` and `_DatasetScaffolding` objects. 

385 

386 2. In `fillDataIds`, we construct and run the "Big Join Query", which 

387 returns related tuples of all dimensions used to identify any regular 

388 input, output, and intermediate datasets (not prerequisites). We then 

389 iterate over these tuples of related dimensions, identifying the subsets 

390 that correspond to distinct data IDs for each task and dataset type. 

391 

392 3. In `fillDatasetRefs`, we run follow-up queries against all of the 

393 dataset data IDs previously identified, populating the 

394 `_DatasetScaffolding.refs` lists - except for those for prerequisite 

395 datasets, which cannot be resolved until distinct quanta are 

396 identified. 

397 

398 4. In `fillQuanta`, we extract subsets from the lists of `DatasetRef` into 

399 the inputs and outputs for each `Quantum` and search for prerequisite 

400 datasets, populating `_TaskScaffolding.quanta`. 

401 

402 5. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

403 per-task quanta identified in the previous step. 
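
    A condensed, illustrative driver for these five steps (this mirrors what
    `GraphBuilder.makeGraph` does at the bottom of this module; ``registry``,
    ``collections``, ``run``, and ``userQuery`` are assumed to be provided by
    the caller):

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)  # step 1
        scaffolding.fillDataIds(registry, collections, userQuery)        # step 2
        scaffolding.fillDatasetRefs(registry, collections, run)          # step 3
        scaffolding.fillQuanta(registry, collections)                    # step 4
        graph = scaffolding.makeQuantumGraph()                           # step 5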

404 """ 

405 def __init__(self, pipeline, *, registry): 

406 self.tasks = [] 

407 # Aggregate and categorize the DatasetTypes in the Pipeline. 

408 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

409 # Construct dictionaries that map those DatasetTypes to structures 

410 # that will (later) hold additional information about them. 

411 for attr in ("initInputs", "initIntermediates", "initOutputs", 

412 "inputs", "intermediates", "outputs", "prerequisites"): 

413 setattr(self, attr, _DatasetScaffoldingDict.fromDatasetTypes(getattr(datasetTypes, attr), 

414 universe=registry.dimensions)) 

415 # Aggregate all dimensions for all non-init, non-prerequisite 

416 # DatasetTypes. These are the ones we'll include in the big join query. 

417 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, 

418 self.outputs.dimensions) 

419 # Construct scaffolding nodes for each Task, and add backreferences 

420 # to the Task from each DatasetScaffolding node. 

421 # Note that there's only one scaffolding node for each DatasetType, shared by 

422 # _PipelineScaffolding and all _TaskScaffoldings that reference it. 

423 if isinstance(pipeline, Pipeline): 

424 pipeline = pipeline.toExpandedPipeline() 
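        # The expanded pipeline and datasetTypes.byTask are assumed to iterate
        # over tasks in the same (pipeline) order, so zipping them pairs each
        # TaskDef with its own TaskDatasetTypes.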

425 self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

426 for taskDef, taskDatasetTypes in zip(pipeline, 

427 datasetTypes.byTask.values())] 

428 

429 def __repr__(self): 

430 # Default dataclass-injected __repr__ gets caught in an infinite loop 

431 # because of back-references. 

432 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

433 

434 tasks: List[_TaskScaffolding] 

435 """Scaffolding data structures for each task in the pipeline 

436 (`list` of `_TaskScaffolding`). 

437 """ 

438 

439 initInputs: _DatasetScaffoldingDict 

440 """Datasets consumed but not produced when constructing the tasks in this 

441 pipeline (`_DatasetScaffoldingDict`). 

442 """ 

443 

444 initIntermediates: _DatasetScaffoldingDict 

445 """Datasets that are both consumed and produced when constructing the tasks 

446 in this pipeline (`_DatasetScaffoldingDict`). 

447 """ 

448 

449 initOutputs: _DatasetScaffoldingDict 

450 """Datasets produced but not consumed when constructing the tasks in this 

451 pipeline (`_DatasetScaffoldingDict`). 

452 """ 

453 

454 inputs: _DatasetScaffoldingDict 

455 """Datasets that are consumed but not produced when running this pipeline 

456 (`_DatasetScaffoldingDict`). 

457 """ 

458 

459 intermediates: _DatasetScaffoldingDict 

460 """Datasets that are both produced and consumed when running this pipeline 

461 (`_DatasetScaffoldingDict`). 

462 """ 

463 

464 outputs: _DatasetScaffoldingDict 

465 """Datasets produced but not consumed when when running this pipeline 

466 (`_DatasetScaffoldingDict`). 

467 """ 

468 

469 prerequisites: _DatasetScaffoldingDict 

470 """Datasets that are consumed when running this pipeline and looked up 

471 per-Quantum when generating the graph (`_DatasetScaffoldingDict`). 

472 """ 

473 

474 dimensions: DimensionGraph 

475 """All dimensions used by any regular input, intermediate, or output 

476 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

477 Query" (`DimensionGraph`). 

478 

479 This is required to be a superset of all task quantum dimensions. 

480 """ 

481 

482 def fillDataIds(self, registry, collections, userQuery): 

483 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

484 

485 This method populates `_TaskScaffolding.dataIds` and 

486 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

487 

488 Parameters 

489 ---------- 

490 registry : `lsst.daf.butler.Registry` 

491 Registry for the data repository; used for all data ID queries. 

492 collections : `lsst.daf.butler.CollectionSearch` 

493 Object representing the collections to search for input datasets. 

494 userQuery : `str`, optional 

495 User-provided expression to limit the data IDs processed. 

496 """ 

497 # Initialization datasets always have empty data IDs. 

498 emptyDataId = ExpandedDataCoordinate(registry.dimensions.empty, (), records={}) 

499 for scaffolding in itertools.chain(self.initInputs.values(), 

500 self.initIntermediates.values(), 

501 self.initOutputs.values()): 

502 scaffolding.dataIds.add(emptyDataId) 

503 # Run one big query for the data IDs for task dimensions and regular 

504 # inputs and outputs. We limit the query to only dimensions that are 

505 # associated with the input dataset types, but don't (yet) try to 

506 # obtain the dataset_ids for those inputs. 

507 resultIter = registry.queryDimensions( 

508 self.dimensions, 

509 datasets=list(self.inputs), 

510 collections=collections, 

511 where=userQuery, 

512 ) 

513 # Iterate over query results and populate the data IDs in 

514 # _TaskScaffolding.dataIds and _DatasetScaffolding.dataIds, extracting the 

515 # subset of the common data ID corresponding to the dimensions of each. By using 

516 # sets, we remove duplicates caused by query rows in which the 

517 # dimensions that change are not relevant for that task or dataset 

518 # type. For example, if the Big Join Query involves the dimensions 

519 # (instrument, visit, detector, skymap, tract, patch), we extract 

520 # "calexp" data IDs from the instrument, visit, and detector values 

521 # only, and rely on `set.add` to avoid duplications due to result rows 

522 # in which only skymap, tract, and patch are varying. The Big Join 

523 # Query is defined such that only visit+detector and tract+patch 

524 # combinations that represent spatial overlaps are included in the 

525 # results. 

526 for commonDataId in resultIter: 

527 for taskScaffolding in self.tasks: 

528 taskScaffolding.dataIds.add(commonDataId.subset(taskScaffolding.dimensions)) 

529 for datasetType, scaffolding in itertools.chain(self.inputs.items(), 

530 self.intermediates.items(), 

531 self.outputs.items()): 

532 scaffolding.dataIds.add(commonDataId.subset(scaffolding.dimensions)) 

533 

534 def fillDatasetRefs(self, registry, collections, run, *, skipExisting=True): 

535 """Perform follow up queries for each dataset data ID produced in 

536 `fillDataIds`. 

537 

538 This method populates `_DatasetScaffolding.refs` (except for those in 

539 `prerequisites`). 

540 

541 Parameters 

542 ---------- 

543 registry : `lsst.daf.butler.Registry` 

544 Registry for the data repository; used for all data ID queries. 

545 collections : `lsst.daf.butler.CollectionSearch` 

546 Object representing the collections to search for input datasets. 

547 run : `str`, optional 

548 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

549 output datasets, if it already exists. 

550 skipExisting : `bool`, optional 

551 If `True` (default), a Quantum is not created if all its outputs 

552 already exist in ``run``. Ignored if ``run`` is `None`. 

553 

554 Raises 

555 ------ 

556 OutputExistsError 

557 Raised if an output dataset already exists in the output run 

558 and ``skipExisting`` is `False`. The case where some but not all 

559 of a quantum's outputs are present and ``skipExisting`` is `True` 

560 cannot be identified at this stage, and is handled by `fillQuanta` 

561 instead. 

562 """ 

563 # Look up input and initInput datasets in the input collection(s). 

564 for datasetType, scaffolding in itertools.chain(self.initInputs.items(), self.inputs.items()): 

565 for dataId in scaffolding.dataIds: 

566 refs = list( 

567 registry.queryDatasets( 

568 datasetType, 

569 collections=collections, 

570 dataId=dataId, 

571 deduplicate=True, 

572 expand=True, 

573 ) 

574 ) 

575 if len(refs) != 1: 

576 raise RuntimeError(f"Expected exactly one instance of input {datasetType} " 

577 f"for data ID {dataId}; got {refs}.") 

578 scaffolding.refs.extend(refs) 

579 # Look up [init] intermediate and output datasets in the output run 

580 # collection, if there is one; an already-existing output is an error 

581 # unless skipExisting is True. 

582 for datasetType, scaffolding in itertools.chain(self.initIntermediates.items(), 

583 self.initOutputs.items(), 

584 self.intermediates.items(), 

585 self.outputs.items()): 

586 for dataId in scaffolding.dataIds: 

587 # TODO: we could easily support per-DatasetType skipExisting 

588 # (it might make sense to put them in originInfo), and I could 

589 # imagine that being useful - it's probably required in order 

590 # to support writing initOutputs before QuantumGraph 

591 # generation. 

592 if run is not None: 

593 ref = registry.findDataset(datasetType=datasetType, dataId=dataId, collections=run) 

594 else: 

595 ref = None 

596 if ref is None: 

597 ref = DatasetRef(datasetType, dataId) 

598 elif not skipExisting: 

599 raise OutputExistsError(f"Output dataset {datasetType.name} already exists in " 

600 f"output RUN collection '{run}' with data ID {dataId}.") 

601 scaffolding.refs.append(ref) 

602 # Prerequisite dataset lookups are deferred until fillQuanta. 

603 

604 def fillQuanta(self, registry, collections, *, skipExisting=True): 

605 """Define quanta for each task by splitting up the datasets associated 

606 with each task data ID. 

607 

608 This method populates `_TaskScaffolding.quanta`. 

609 

610 Parameters 

611 ---------- 

612 registry : `lsst.daf.butler.Registry` 

613 Registry for the data repository; used for all data ID queries. 

614 collections : `lsst.daf.butler.CollectionSearch` 

615 Object representing the collections to search for input datasets. 

616 skipExisting : `bool`, optional 

617 If `True` (default), a Quantum is not created if all its outputs 

618 already exist. 

619 """ 

620 for task in self.tasks: 

621 for quantumDataId in task.dataIds: 

622 # Identify the (regular) inputs that correspond to the Quantum 

623 # with this data ID. These are those whose data IDs have the 

624 # same values for all dimensions they have in common. 

625 # We do this with data IDs expanded to include implied dimensions, 

626 # which is why _DatasetScaffolding.dimensions is expanded 

627 # even though DatasetType.dimensions is not. 

628 inputs = NamedKeyDict() 

629 for datasetType, scaffolding in task.inputs.items(): 

630 inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds) 

631 if registry.relateDataIds(quantumDataId, dataId)] 

632 

633 _LOG.debug("%s dataId %s has inputs: %s", 

634 task.taskDef.taskName, quantumDataId, list(inputs.names)) 

635 

636 # Same for outputs. 

637 outputs = NamedKeyDict() 

638 allOutputsPresent = True 

639 for datasetType, scaffolding in task.outputs.items(): 

640 outputs[datasetType] = [] 

641 for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds): 

642 if registry.relateDataIds(quantumDataId, dataId): 

643 if ref.id is None: 

644 allOutputsPresent = False 

645 else: 

646 assert skipExisting, "Existing outputs should have already been identified." 

647 if not allOutputsPresent: 

648 raise OutputExistsError(f"Output {datasetType.name} with data ID " 

649 f"{dataId} already exists, but other outputs " 

650 f"for task with label {task.taskDef.label} " 

651 f"and data ID {quantumDataId} do not.") 

652 outputs[datasetType].append(ref) 

653 if allOutputsPresent and skipExisting: 

654 continue 

655 

656 _LOG.debug("%s dataID %s has outputs: %s", 

657 task.taskDef.taskName, quantumDataId, list(outputs.names)) 

658 

659 # Look up prerequisite datasets in the input collection(s). 

660 # These may have dimensions that extend beyond those we queried 

661 # for originally, because we want to permit those data ID 

662 # values to differ across quanta and dataset types. 

663 # For example, the same quantum may have a flat and bias with 

664 # a different calibration_label, or a refcat with a skypix 

665 # value that overlaps the quantum's data ID's region, but not 

666 # the user expression used for the initial query. 

667 connections = task.taskDef.connections 

668 for con_name in connections.prerequisiteInputs: 

669 con = getattr(connections, con_name) 
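                # Find the registered DatasetType in task.prerequisites whose
                # name matches this connection; the loop variable is reused
                # after the break below.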

670 for datasetType in task.prerequisites: 

671 if datasetType.name == con.name: 

672 break 

673 if con.lookupFunction is not None: 

674 refs = list(con.lookupFunction(datasetType, registry, 

675 quantumDataId, collections)) 

676 else: 

677 refs = list( 

678 registry.queryDatasets( 

679 datasetType, 

680 collections=collections, 

681 dataId=quantumDataId, 

682 deduplicate=True, 

683 expand=True, 

684 ) 

685 ) 

686 inputs[datasetType] = refs 

687 

688 _LOG.debug("%s dataID %s has inputs+prereqs: %s", 

689 task.taskDef.taskName, quantumDataId, list(inputs.names)) 

690 

691 task.addQuantum( 

692 Quantum( 

693 taskName=task.taskDef.taskName, 

694 taskClass=task.taskDef.taskClass, 

695 dataId=quantumDataId, 

696 initInputs=task.initInputs.unpackRefs(), 

697 predictedInputs=inputs, 

698 outputs=outputs, 

699 ) 

700 ) 

701 

702 def makeQuantumGraph(self): 

703 """Create a `QuantumGraph` from the quanta already present in 

704 the scaffolding data structure. 

705 """ 

706 graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks) 

707 graph.initInputs = self.initInputs.unpackRefs() 

708 graph.initOutputs = self.initOutputs.unpackRefs() 

709 graph.initIntermediates = self.initIntermediates.unpackRefs() 

710 return graph 

711 

712 

713# ------------------------ 

714# Exported definitions -- 

715# ------------------------ 

716 

717 

718class GraphBuilderError(Exception): 

719 """Base class for exceptions generated by graph builder. 

720 """ 

721 pass 

722 

723 

724class OutputExistsError(GraphBuilderError): 

725 """Exception generated when output datasets already exist. 

726 """ 

727 pass 

728 

729 

730class PrerequisiteMissingError(GraphBuilderError): 

731 """Exception generated when a prerequisite dataset does not exist. 

732 """ 

733 pass 

734 

735 

736class GraphBuilder(object): 

737 """GraphBuilder class is responsible for building task execution graph from 

738 a Pipeline. 

739 

740 Parameters 

741 ---------- 

742 registry : `~lsst.daf.butler.Registry` 

743 Registry for the data repository; used for all data ID and dataset queries. 

744 skipExisting : `bool`, optional 

745 If `True` (default), a Quantum is not created if all its outputs 

746 already exist. 

747 """ 

748 

749 def __init__(self, registry, skipExisting=True): 

750 self.registry = registry 

751 self.dimensions = registry.dimensions 

752 self.skipExisting = skipExisting 

753 

754 def makeGraph(self, pipeline, collections, run, userQuery): 

755 """Create execution graph for a pipeline. 

756 

757 Parameters 

758 ---------- 

759 pipeline : `Pipeline` 

760 Pipeline definition, task names/classes and their configs. 

761 collections : `lsst.daf.butler.CollectionSearch` 

762 Object representing the collections to search for input datasets. 

763 run : `str`, optional 

764 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

765 output datasets, if it already exists. 

766 userQuery : `str` 

767 String defining a user-provided selection for the registry; should be 

768 empty or `None` if there are no restrictions on data selection. 

769 

770 Returns 

771 ------- 

772 graph : `QuantumGraph` 

773 The constructed execution graph. 

774 Raises 

775 ------ 

776 UserExpressionError 

777 Raised when the user expression cannot be parsed. 

778 OutputExistsError 

779 Raised when output datasets already exist. 

780 Exception 

781 Other exception types may be raised by underlying registry 

782 classes. 
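
        Examples
        --------
        A minimal, illustrative call sequence (``registry``, ``pipeline``,
        ``collections``, and ``run`` are assumed to exist already; the query
        string is only an example of the butler expression syntax):

            builder = GraphBuilder(registry, skipExisting=True)
            graph = builder.makeGraph(pipeline, collections, run,
                                      userQuery="instrument = 'HSC' AND visit = 1228")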

783 """ 

784 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

785 scaffolding.fillDataIds(self.registry, collections, userQuery) 

786 scaffolding.fillDatasetRefs(self.registry, collections, run, skipExisting=self.skipExisting) 

787 scaffolding.fillQuanta(self.registry, collections, skipExisting=self.skipExisting) 

788 return scaffolding.makeQuantumGraph()