1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ['GraphBuilder'] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32from collections import ChainMap 

33from contextlib import contextmanager 

34from dataclasses import dataclass 

35from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping 

36import logging 

37 

38 

39# ----------------------------- 

40# Imports for other modules -- 

41# ----------------------------- 

42from .connections import iterConnections, AdjustQuantumHelper 

43from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline 

44from .graph import QuantumGraph 

45from lsst.daf.butler import ( 

46 DataCoordinate, 

47 DatasetRef, 

48 DatasetType, 

49 DimensionGraph, 

50 DimensionUniverse, 

51 NamedKeyDict, 

52 Quantum, 

53) 

54from lsst.utils import doImport 

55 

56# ---------------------------------- 

57# Local non-exported definitions -- 

58# ---------------------------------- 

59 

60_LOG = logging.getLogger(__name__.partition(".")[2]) 

61 

62 

63class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

64 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

65 the known `DatasetRef` instances of that type. 

66 

67 Parameters 

68 ---------- 

69 args 

70 Positional arguments are forwarded to the `dict` constructor. 

71 universe : `DimensionUniverse` 

72 Universe of all possible dimensions. 

73 """ 

74 def __init__(self, *args, universe: DimensionUniverse): 

75 super().__init__(*args) 

76 self.universe = universe 

77 

78 @classmethod 

79 def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *, 

80 universe: DimensionUniverse) -> _DatasetDict: 

81 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

82 

83 Parameters 

84 ---------- 

85 datasetTypes : `iterable` of `DatasetType` 

86 DatasetTypes to use as keys for the dict. Values will be empty 

87 dictionaries. 

88 universe : `DimensionUniverse` 

89 Universe of all possible dimensions. 

90 

91 Returns 

92 ------- 

93 dictionary : `_DatasetDict` 

94 A new `_DatasetDict` instance. 

95 """ 

96 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

97 

98 @classmethod 

99 def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

100 ) -> _DatasetDict: 

101 """Return a new dictionary by extracting items corresponding to the 

102 given keys from one or more existing dictionaries. 

103 

104 Parameters 

105 ---------- 

106 datasetTypes : `iterable` of `DatasetType` 

107 DatasetTypes to use as keys for the dict. Values will be obtained 

108 by lookups against ``first`` and ``rest``. 

109 first : `_DatasetDict` 

110 The first dictionary from which to extract values. 

111 rest 

112 Additional dictionaries from which to extract values. 

113 

114 Returns 

115 ------- 

116 dictionary : `_DatasetDict` 

117 A new dictionary instance. 

118 """ 

119 combined = ChainMap(first, *rest) 

120 return cls({datasetType: combined[datasetType] for datasetType in datasetTypes}, 

121 universe=first.universe) 
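# Editorial sketch (not part of the original source): how the two
# constructors above compose. ``calexpType`` and ``registry`` are
# hypothetical stand-ins for a real `DatasetType` and `Registry`.
# Because `fromSubset` looks up and reuses the *same* nested dicts held
# by its arguments, refs added through either view are visible in both:
#
#     parent = _DatasetDict.fromDatasetTypes([calexpType], universe=registry.dimensions)
#     child = _DatasetDict.fromSubset([calexpType], parent)
#     child[calexpType][dataId] = ref     # also visible as parent[calexpType][dataId]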

122 

123 @property 

124 def dimensions(self) -> DimensionGraph: 

125 """The union of all dimensions used by all dataset types in this 

126 dictionary, including implied dependencies (`DimensionGraph`). 

127 """ 

128 base = self.universe.empty 

129 if len(self) == 0: 

130 return base 

131 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

132 

133 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

134 """Unpack nested single-element `DatasetRef` dicts into a new 

135 mapping with `DatasetType` keys and `DatasetRef` values. 

136 

137 This method assumes that each nested dictionary contains exactly one 

138 item, as is the case for all "init" datasets. 

139 

140 Returns 

141 ------- 

142 dictionary : `NamedKeyDict` 

143 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

144 `DatasetType` instances and string names usable as keys. 

145 """ 

146 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

147 ref, = refs.values() 

148 return ref 

149 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

150 

151 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

152 """Unpack nested multi-element `DatasetRef` dicts into a new 

153 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

154 

155 Returns 

156 ------- 

157 dictionary : `NamedKeyDict` 

158 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

159 both `DatasetType` instances and string names usable as keys. 

160 """ 

161 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 
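# Editorial sketch: the two unpack methods above differ only in
# cardinality. Assuming ``initInputs`` holds exactly one ref per dataset
# type (as all "init" datasets do) and ``inputs`` holds many:
#
#     single = initInputs.unpackSingleRefs()   # DatasetType -> DatasetRef
#     multi = inputs.unpackMultiRefs()         # DatasetType -> [DatasetRef, ...]
#
# `unpackSingleRefs` raises `ValueError` if any nested dict does not hold
# exactly one ref, which is why it is reserved for init datasets.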

162 

163 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

164 ) -> Iterator[DatasetRef]: 

165 """Iterate over the contained `DatasetRef` instances that match the 

166 given `DatasetType` and data IDs. 

167 

168 Parameters 

169 ---------- 

170 datasetType : `DatasetType` 

171 Dataset type to match. 

172 dataIds : `Iterable` [ `DataCoordinate` ] 

173 Data IDs to match. 

174 

175 Returns 

176 ------- 

177 refs : `Iterator` [ `DatasetRef` ] 

178 DatasetRef instances for which ``ref.datasetType == datasetType`` 

179 and ``ref.dataId`` is in ``dataIds``. 

180 """ 

181 refs = self[datasetType] 

182 return (refs[dataId] for dataId in dataIds) 
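# Editorial note: `extract` returns a lazy generator, so a data ID that is
# not present only raises `KeyError` once the result is iterated. Typical
# use mirrors `resolveDatasetRefs` below (``calexpType`` is hypothetical):
#
#     refs = task.outputs.extract(calexpType, quantum.outputs[calexpType].keys())
#     resolved = [ref for ref in refs if ref.id is not None]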

183 

184 

185class _QuantumScaffolding: 

186 """Helper class aggregating information about a `Quantum`, used when 

187 constructing a `QuantumGraph`. 

188 

189 See `_PipelineScaffolding` for a top-down description of the full 

190 scaffolding data structure. 

191 

192 Parameters 

193 ---------- 

194 task : `_TaskScaffolding` 

195 Back-reference to the helper object for the `PipelineTask` this quantum 

196 represents an execution of. 

197 dataId : `DataCoordinate` 

198 Data ID for this quantum. 

199 """ 

200 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

201 self.task = task 

202 self.dataId = dataId 

203 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

204 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

205 self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(), 

206 universe=dataId.universe) 

207 

208 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

209 

210 def __repr__(self): 

211 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

212 

213 task: _TaskScaffolding 

214 """Back-reference to the helper object for the `PipelineTask` this quantum 

215 represents an execution of. 

216 """ 

217 

218 dataId: DataCoordinate 

219 """Data ID for this quantum. 

220 """ 

221 

222 inputs: _DatasetDict 

223 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

224 

225 This is initialized to map each `DatasetType` to an empty dictionary at 

226 construction. Those nested dictionaries are populated (with data IDs as 

227 keys) with unresolved `DatasetRef` instances in 

228 `_PipelineScaffolding.connectDataIds`. 

229 """ 

230 

231 outputs: _DatasetDict 

232 """Nested dictionary containing `DatasetRef` outputs this quantum. 

233 """ 

234 

235 prerequisites: _DatasetDict 

236 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

237 quantum. 

238 """ 

239 

240 def makeQuantum(self) -> Quantum: 

241 """Transform the scaffolding object into a true `Quantum` instance. 

242 

243 Returns 

244 ------- 

245 quantum : `Quantum` 

246 An actual `Quantum` instance. 

247 """ 

248 allInputs = self.inputs.unpackMultiRefs() 

249 allInputs.update(self.prerequisites.unpackMultiRefs()) 

250 # Give the task's Connections class an opportunity to remove some 

251 # inputs, or complain if they are unacceptable. 

252 # This will raise if one of the check conditions is not met, which is 

253 # the intended behavior. 

254 # If it raises NoWorkFound, there is a bug in the QG algorithm 

255 # or the adjustQuantum is incorrectly trying to make a prerequisite 

256 # input behave like a regular input; adjustQuantum should only raise 

257 # NoWorkFound if a regular input is missing, and it shouldn't be 

258 # possible for us to have generated ``self`` if that's true. 

259 helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs()) 

260 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

261 return Quantum( 

262 taskName=self.task.taskDef.taskName, 

263 taskClass=self.task.taskDef.taskClass, 

264 dataId=self.dataId, 

265 initInputs=self.task.initInputs.unpackSingleRefs(), 

266 inputs=helper.inputs, 

267 outputs=helper.outputs, 

268 ) 

269 

270 

271@dataclass 

272class _TaskScaffolding: 

273 """Helper class aggregating information about a `PipelineTask`, used when 

274 constructing a `QuantumGraph`. 

275 

276 See `_PipelineScaffolding` for a top-down description of the full 

277 scaffolding data structure. 

278 

279 Parameters 

280 ---------- 

281 taskDef : `TaskDef` 

282 Data structure that identifies the task class and its config. 

283 parent : `_PipelineScaffolding` 

284 The parent data structure that will hold the instance being 

285 constructed. 

286 datasetTypes : `TaskDatasetTypes` 

287 Data structure that categorizes the dataset types used by this task. 

288 """ 

289 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

290 universe = parent.dimensions.universe 

291 self.taskDef = taskDef 

292 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

293 assert self.dimensions.issubset(parent.dimensions) 

294 # Initialize _DatasetDicts as subsets of the one or two 

295 # corresponding dicts in the parent _PipelineScaffolding. 

296 self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs, 

297 parent.initIntermediates) 

298 self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates, 

299 parent.initOutputs) 

300 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

301 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

302 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

303 self.dataIds = set() 

304 self.quanta = {} 

305 

306 def __repr__(self): 

307 # Default dataclass-injected __repr__ gets caught in an infinite loop 

308 # because of back-references. 

309 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

310 

311 taskDef: TaskDef 

312 """Data structure that identifies the task class and its config 

313 (`TaskDef`). 

314 """ 

315 

316 dimensions: DimensionGraph 

317 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

318 """ 

319 

320 initInputs: _DatasetDict 

321 """Dictionary containing information about datasets used to construct this 

322 task (`_DatasetDict`). 

323 """ 

324 

325 initOutputs: _DatasetDict 

326 """Dictionary containing information about datasets produced as a 

327 side-effect of constructing this task (`_DatasetDict`). 

328 """ 

329 

330 inputs: _DatasetDict 

331 """Dictionary containing information about datasets used as regular, 

332 graph-constraining inputs to this task (`_DatasetDict`). 

333 """ 

334 

335 outputs: _DatasetDict 

336 """Dictionary containing information about datasets produced by this task 

337 (`_DatasetDict`). 

338 """ 

339 

340 prerequisites: _DatasetDict 

341 """Dictionary containing information about input datasets that must be 

342 present in the repository before any Pipeline containing this task is run 

343 (`_DatasetDict`). 

344 """ 

345 

346 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

347 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

348 this task with that data ID. 

349 """ 

350 

351 def makeQuantumSet(self) -> Set[Quantum]: 

352 """Create a `set` of `Quantum` from the information in ``self``. 

353 

354 Returns 

355 ------- 

356 nodes : `set` of `Quantum` 

357 The `Quantum` elements corresponding to this task. 

358 """ 

359 return set(q.makeQuantum() for q in self.quanta.values()) 

360 

361 

362@dataclass 

363class _PipelineScaffolding: 

364 """A helper data structure that organizes the information involved in 

365 constructing a `QuantumGraph` for a `Pipeline`. 

366 

367 Parameters 

368 ---------- 

369 pipeline : `Pipeline` 

370 Sequence of tasks from which a graph is to be constructed. Must 

371 have nested task classes already imported. 

372 universe : `DimensionUniverse` 

373 Universe of all possible dimensions. 

374 

375 Notes 

376 ----- 

377 The scaffolding data structure contains nested data structures for both 

378 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

379 data structures are shared between the pipeline-level structure (which 

380 aggregates all datasets and categorizes them from the perspective of the 

381 complete pipeline) and the individual tasks that use them as inputs and 

382 outputs. 

383 

384 `QuantumGraph` construction proceeds in four steps, with each corresponding 

385 to a different `_PipelineScaffolding` method: 

386 

387 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

388 the DatasetTypes used by the pipeline (delegating to 

389 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

390 nested `_TaskScaffolding` and `_DatasetDict` objects. 

391 

392 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

393 returns related tuples of all dimensions used to identify any regular 

394 input, output, and intermediate datasets (not prerequisites). We then 

395 iterate over these tuples of related dimensions, identifying the subsets 

396 that correspond to distinct data IDs for each task and dataset type, 

397 and then create `_QuantumScaffolding` objects. 

398 

399 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

400 dataset data IDs previously identified, transforming unresolved 

401 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

402 up prerequisite datasets for all quanta. 

403 

404 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

405 per-task `_QuantumScaffolding` objects. 

406 """ 

407 def __init__(self, pipeline, *, registry): 

408 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

409 self.tasks = [] 

410 # Aggregate and categorize the DatasetTypes in the Pipeline. 

411 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

412 # Construct dictionaries that map those DatasetTypes to structures 

413 # that will (later) hold additional information about them. 

414 for attr in ("initInputs", "initIntermediates", "initOutputs", 

415 "inputs", "intermediates", "outputs", "prerequisites"): 

416 setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), 

417 universe=registry.dimensions)) 

418 # Aggregate all dimensions for all non-init, non-prerequisite 

419 # DatasetTypes. These are the ones we'll include in the big join 

420 # query. 

421 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, 

422 self.outputs.dimensions) 

423 # Construct scaffolding nodes for each Task; these share the nested 

424 # dataset dictionaries constructed above. 

425 # Note that there's only one scaffolding node for each DatasetType, 

426 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

427 # reference it. 

428 if isinstance(pipeline, Pipeline): 

429 pipeline = pipeline.toExpandedPipeline() 

430 self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

431 for taskDef, taskDatasetTypes in zip(pipeline, 

432 datasetTypes.byTask.values())] 

433 

434 def __repr__(self): 

435 # Default dataclass-injected __repr__ gets caught in an infinite loop 

436 # because of back-references. 

437 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

438 

439 tasks: List[_TaskScaffolding] 

440 """Scaffolding data structures for each task in the pipeline 

441 (`list` of `_TaskScaffolding`). 

442 """ 

443 

444 initInputs: _DatasetDict 

445 """Datasets consumed but not produced when constructing the tasks in this 

446 pipeline (`_DatasetDict`). 

447 """ 

448 

449 initIntermediates: _DatasetDict 

450 """Datasets that are both consumed and produced when constructing the tasks 

451 in this pipeline (`_DatasetDict`). 

452 """ 

453 

454 initOutputs: _DatasetDict 

455 """Datasets produced but not consumed when constructing the tasks in this 

456 pipeline (`_DatasetDict`). 

457 """ 

458 

459 inputs: _DatasetDict 

460 """Datasets that are consumed but not produced when running this pipeline 

461 (`_DatasetDict`). 

462 """ 

463 

464 intermediates: _DatasetDict 

465 """Datasets that are both produced and consumed when running this pipeline 

466 (`_DatasetDict`). 

467 """ 

468 

469 outputs: _DatasetDict 

470 """Datasets produced but not consumed when when running this pipeline 

471 (`_DatasetDict`). 

472 """ 

473 

474 prerequisites: _DatasetDict 

475 """Datasets that are consumed when running this pipeline and looked up 

476 per-Quantum when generating the graph (`_DatasetDict`). 

477 """ 

478 

479 dimensions: DimensionGraph 

480 """All dimensions used by any regular input, intermediate, or output 

481 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

482 Query" (`DimensionGraph`). 

483 

484 This is required to be a superset of all task quantum dimensions. 

485 """ 

486 

487 @contextmanager 

488 def connectDataIds(self, registry, collections, userQuery, externalDataId): 

489 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

490 

491 This method populates `_TaskScaffolding.quanta` and the unresolved 

492 `DatasetRef` entries in each `_DatasetDict` (except for `prerequisites`). 

493 

494 Parameters 

495 ---------- 

496 registry : `lsst.daf.butler.Registry` 

497 Registry for the data repository; used for all data ID queries. 

498 collections 

499 Expressions representing the collections to search for input 

500 datasets. May be any of the types accepted by 

501 `lsst.daf.butler.CollectionSearch.fromExpression`. 

502 userQuery : `str` or `None` 

503 User-provided expression to limit the data IDs processed. 

504 externalDataId : `DataCoordinate` 

505 Externally-provided data ID that should be used to restrict the 

506 results, just as if these constraints had been included via ``AND`` 

507 in ``userQuery``. This includes (at least) any instrument named 

508 in the pipeline definition. 

509 

510 Returns 

511 ------- 

512 commonDataIds : \ 

513 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

514 An interface to a database temporary table containing all data IDs 

515 that will appear in this `QuantumGraph`. Returned inside a 

516 context manager, which will drop the temporary table at the end of 

517 the `with` block in which this method is called. 

518 """ 

519 _LOG.debug("Building query for data IDs.") 

520 # Initialization datasets always have empty data IDs. 

521 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

522 for datasetType, refs in itertools.chain(self.initInputs.items(), 

523 self.initIntermediates.items(), 

524 self.initOutputs.items()): 

525 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

526 # Run one big query for the data IDs for task dimensions and regular 

527 # inputs and outputs. We limit the query to only dimensions that are 

528 # associated with the input dataset types, but don't (yet) try to 

529 # obtain the dataset_ids for those inputs. 

530 _LOG.debug("Submitting data ID query and materializing results.") 

531 with registry.queryDataIds(self.dimensions, 

532 datasets=list(self.inputs), 

533 collections=collections, 

534 where=userQuery, 

535 dataId=externalDataId, 

536 ).materialize() as commonDataIds: 

537 _LOG.debug("Expanding data IDs.") 

538 commonDataIds = commonDataIds.expanded() 

539 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

540 # Iterate over query results, populating data IDs for datasets and 

541 # quanta and then connecting them to each other. 

542 n = 0 

543 for n, commonDataId in enumerate(commonDataIds, start=1): 

544 # Create DatasetRefs for all DatasetTypes from this result row, 

545 # noting that we might have created some already. 

546 # We remember both those that already existed and those that we 

547 # create now. 

548 refsForRow = {} 

549 for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(), 

550 self.outputs.items()): 

551 datasetDataId = commonDataId.subset(datasetType.dimensions) 

552 ref = refs.get(datasetDataId) 

553 if ref is None: 

554 ref = DatasetRef(datasetType, datasetDataId) 

555 refs[datasetDataId] = ref 

556 refsForRow[datasetType.name] = ref 

557 # Create _QuantumScaffolding objects for all tasks from this 

558 # result row, noting that we might have created some already. 

559 for task in self.tasks: 

560 quantumDataId = commonDataId.subset(task.dimensions) 

561 quantum = task.quanta.get(quantumDataId) 

562 if quantum is None: 

563 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

564 task.quanta[quantumDataId] = quantum 

565 # Whether this is a new quantum or an existing one, we can 

566 # now associate the DatasetRefs for this row with it. The 

567 # fact that a Quantum data ID and a dataset data ID both 

568 # came from the same result row is what tells us they 

569 # should be associated. 

570 # Many of these associations will be duplicates (because 

571 # another query row that differed from this one only in 

572 # irrelevant dimensions already added them); assigning into 

573 # the nested dicts simply overwrites them. 

574 for datasetType in task.inputs: 

575 ref = refsForRow[datasetType.name] 

576 quantum.inputs[datasetType.name][ref.dataId] = ref 

577 for datasetType in task.outputs: 

578 ref = refsForRow[datasetType.name] 

579 quantum.outputs[datasetType.name][ref.dataId] = ref 

580 _LOG.debug("Finished processing %d rows from data ID query.", n) 

581 yield commonDataIds 

582 

583 def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True, 

584 clobberOutputs=True): 

585 """Perform follow up queries for each dataset data ID produced in 

586 `fillDataIds`. 

587 

588 This method resolves the `DatasetRef` entries in each `_DatasetDict` and 

589 looks up prerequisite datasets for each quantum. 

590 

591 Parameters 

592 ---------- 

593 registry : `lsst.daf.butler.Registry` 

594 Registry for the data repository; used for all data ID queries. 

595 collections 

596 Expressions representing the collections to search for input 

597 datasets. May be any of the types accepted by 

598 `lsst.daf.butler.CollectionSearch.fromExpression`. 

599 run : `str`, optional 

600 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

601 output datasets, if it already exists. 

602 commonDataIds : \ 

603 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

604 Result of a previous call to `connectDataIds`. 

605 skipExisting : `bool`, optional 

606 If `True` (default), a Quantum is not created if all its outputs 

607 already exist in ``run``. Ignored if ``run`` is `None`. 

608 clobberOutputs : `bool`, optional 

609 If `True` (default), allow quanta to be created even if outputs exist; 

610 this requires the same behavior to be enabled when 

611 executing. If ``skipExisting`` is also `True`, completed quanta 

612 (those with metadata, or all outputs if there is no metadata 

613 dataset configured) will be skipped rather than clobbered. 

614 

615 Raises 

616 ------ 

617 OutputExistsError 

618 Raised if an output dataset already exists in the output run 

619 and ``skipExisting`` is `False`, or if only some outputs are 

620 present and ``clobberOutputs`` is `False`. 

621 """ 

622 # Look up [init] intermediate and output datasets in the output 

623 # collection, if there is an output collection. 

624 if run is not None: 

625 for datasetType, refs in itertools.chain(self.initIntermediates.items(), 

626 self.initOutputs.items(), 

627 self.intermediates.items(), 

628 self.outputs.items()): 

629 _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.", 

630 len(refs), datasetType.name) 

631 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

632 resolvedRefQueryResults = commonDataIds.subset( 

633 datasetType.dimensions, 

634 unique=True 

635 ).findDatasets( 

636 datasetType, 

637 collections=run, 

638 findFirst=True 

639 ) 

640 for resolvedRef in resolvedRefQueryResults: 

641 # TODO: we could easily support per-DatasetType 

642 # skipExisting and I could imagine that being useful - it's 

643 # probably required in order to support writing initOutputs 

644 # before QuantumGraph generation. 

645 assert resolvedRef.dataId in refs 

646 if skipExisting or isInit or clobberOutputs: 

647 refs[resolvedRef.dataId] = resolvedRef 

648 else: 

649 raise OutputExistsError(f"Output dataset {datasetType.name} already exists in " 

650 f"output RUN collection '{run}' with data ID" 

651 f" {resolvedRef.dataId}.") 

652 # Look up input and initInput datasets in the input collection(s). 

653 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

654 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

655 resolvedRefQueryResults = commonDataIds.subset( 

656 datasetType.dimensions, 

657 unique=True 

658 ).findDatasets( 

659 datasetType, 

660 collections=collections, 

661 findFirst=True 

662 ) 

663 dataIdsNotFoundYet = set(refs.keys()) 

664 for resolvedRef in resolvedRefQueryResults: 

665 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

666 refs[resolvedRef.dataId] = resolvedRef 

667 if dataIdsNotFoundYet: 

668 raise RuntimeError( 

669 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

670 f"'{datasetType.name}' was/were present in a previous " 

671 f"query, but could not be found now." 

672 f"This is either a logic bug in QuantumGraph generation " 

673 f"or the input collections have been modified since " 

674 f"QuantumGraph generation began." 

675 ) 

676 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

677 # replacing the unresolved refs there, and then look up prerequisites. 

678 for task in self.tasks: 

679 _LOG.debug( 

680 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

681 len(task.quanta), 

682 task.taskDef.label 

683 ) 

684 lookupFunctions = { 

685 c.name: c.lookupFunction 

686 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

687 if c.lookupFunction is not None 

688 } 

689 dataIdsFailed = [] 

690 dataIdsSucceeded = [] 

691 for quantum in task.quanta.values(): 

692 # Process output datasets only if there is a run to look for 

693 # outputs in and skipExisting and/or clobberOutputs is True. 

694 # Note that if skipExisting is False, any output datasets that 

695 # already exist would have already caused an exception to be 

696 # raised. We never update the DatasetRefs in the quantum 

697 # because those should never be resolved. 

698 if run is not None and (skipExisting or clobberOutputs): 

699 resolvedRefs = [] 

700 unresolvedRefs = [] 

701 haveMetadata = False 

702 for datasetType, originalRefs in quantum.outputs.items(): 

703 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

704 if ref.id is not None: 

705 resolvedRefs.append(ref) 

706 if datasetType.name == task.taskDef.metadataDatasetName: 

707 haveMetadata = True 

708 else: 

709 unresolvedRefs.append(ref) 

710 if resolvedRefs: 

711 if haveMetadata or not unresolvedRefs: 

712 dataIdsSucceeded.append(quantum.dataId) 

713 if skipExisting: 

714 continue 

715 else: 

716 dataIdsFailed.append(quantum.dataId) 

717 if not clobberOutputs: 

718 raise OutputExistsError( 

719 f"Quantum {quantum.dataId} of task with label " 

720 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

721 f"({resolvedRefs}) " 

722 f"and others that don't ({unresolvedRefs}), with no metadata output, " 

723 "and clobbering outputs was not enabled." 

724 ) 

725 # Update the input DatasetRefs to the resolved ones we already 

726 # searched for. 

727 for datasetType, refs in quantum.inputs.items(): 

728 for ref in task.inputs.extract(datasetType, refs.keys()): 

729 refs[ref.dataId] = ref 

730 # Look up prerequisite datasets in the input collection(s). 

731 # These may have dimensions that extend beyond those we queried 

732 # for originally, because we want to permit those data ID 

733 # values to differ across quanta and dataset types. 

734 for datasetType in task.prerequisites: 

735 lookupFunction = lookupFunctions.get(datasetType.name) 

736 if lookupFunction is not None: 

737 # PipelineTask has provided its own function to do the 

738 # lookup. This always takes precedence. 

739 refs = list( 

740 lookupFunction(datasetType, registry, quantum.dataId, collections) 

741 ) 

742 elif (datasetType.isCalibration() 

743 and datasetType.dimensions <= quantum.dataId.graph 

744 and quantum.dataId.graph.temporal): 

745 # This is a master calibration lookup, which we have to 

746 # handle specially because the query system can't do a 

747 # temporal join on a non-dimension-based timespan yet. 

748 timespan = quantum.dataId.timespan 

749 try: 

750 refs = [registry.findDataset(datasetType, quantum.dataId, 

751 collections=collections, 

752 timespan=timespan)] 

753 except KeyError: 

754 # This dataset type is not present in the registry, 

755 # which just means there are no datasets here. 

756 refs = [] 

757 else: 

758 # Most general case. 

759 refs = list(registry.queryDatasets(datasetType, 

760 collections=collections, 

761 dataId=quantum.dataId, 

762 findFirst=True).expanded()) 

763 quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs 

764 if ref is not None}) 

765 # Actually remove any quanta that we decided to skip above. 

766 if dataIdsSucceeded: 

767 if skipExisting: 

768 _LOG.debug("Pruning successful %d quanta for task with label '%s' because all of their " 

769 "outputs exist or metadata was written successfully.", 

770 len(dataIdsSucceeded), task.taskDef.label) 

771 for dataId in dataIdsSucceeded: 

772 del task.quanta[dataId] 

773 elif clobberOutputs: 

774 _LOG.info("Found %d successful quanta for task with label '%s' " 

775 "that will need to be clobbered during execution.", 

776 len(dataIdsSucceeded), 

777 task.taskDef.label) 

778 else: 

779 raise AssertionError("OutputExistsError should have already been raised.") 

780 if dataIdsFailed: 

781 if clobberOutputs: 

782 _LOG.info("Found %d failed/incomplete quanta for task with label '%s' " 

783 "that will need to be clobbered during execution.", 

784 len(dataIdsFailed), 

785 task.taskDef.label) 

786 else: 

787 raise AssertionError("OutputExistsError should have already been raised.") 

788 

789 def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None): 

790 """Create a `QuantumGraph` from the quanta already present in 

791 the scaffolding data structure. 

792 

793 Parameters 

794 ---------- 

795 metadata : Optional Mapping of `str` to primitives 

796 This is an optional parameter of extra data to carry with the 

797 graph. Entries in this mapping should be serializable as 

798 JSON. 

799 

800 Returns 

801 ------- 

802 graph : `QuantumGraph` 

803 The full `QuantumGraph`. 

804 """ 

805 graph = QuantumGraph({task.taskDef: task.makeQuantumSet() for task in self.tasks}, metadata=metadata) 

806 return graph 

807 

808 

809# ------------------------ 

810# Exported definitions -- 

811# ------------------------ 

812 

813 

814class GraphBuilderError(Exception): 

815 """Base class for exceptions generated by graph builder. 

816 """ 

817 pass 

818 

819 

820class OutputExistsError(GraphBuilderError): 

821 """Exception generated when output datasets already exist. 

822 """ 

823 pass 

824 

825 

826class PrerequisiteMissingError(GraphBuilderError): 

827 """Exception generated when a prerequisite dataset does not exist. 

828 """ 

829 pass 

830 

831 

832class GraphBuilder(object): 

833 """GraphBuilder class is responsible for building task execution graph from 

834 a Pipeline. 

835 

836 Parameters 

837 ---------- 

838 registry : `~lsst.daf.butler.Registry` 

839 Data butler instance. 

840 skipExisting : `bool`, optional 

841 If `True` (default), a Quantum is not created if all its outputs 

842 already exist. 

843 clobberOutputs : `bool`, optional 

844 If `True` (default), allow quanta to be created even if partial outputs 

845 exist; this requires the same behavior to be enabled when 

846 executing. 

847 """ 

848 

849 def __init__(self, registry, skipExisting=True, clobberOutputs=True): 

850 self.registry = registry 

851 self.dimensions = registry.dimensions 

852 self.skipExisting = skipExisting 

853 self.clobberOutputs = clobberOutputs 

854 

855 def makeGraph(self, pipeline, collections, run, userQuery, 

856 metadata: Optional[Mapping[str, Any]] = None): 

857 """Create execution graph for a pipeline. 

858 

859 Parameters 

860 ---------- 

861 pipeline : `Pipeline` 

862 Pipeline definition, task names/classes and their configs. 

863 collections 

864 Expressions representing the collections to search for input 

865 datasets. May be any of the types accepted by 

866 `lsst.daf.butler.CollectionSearch.fromExpression`. 

867 run : `str`, optional 

868 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

869 output datasets, if it already exists. 

870 userQuery : `str` 

871 String that defines a user-provided data selection for the registry; 

872 should be empty or `None` if there are no restrictions on data selection. 

873 metadata : Optional Mapping of `str` to primitives 

874 This is an optional parameter of extra data to carry with the 

875 graph. Entries in this mapping should be serializable as 

876 JSON. 

877 

878 Returns 

879 ------- 

880 graph : `QuantumGraph` 

881 The constructed execution graph. 

882 Raises 

883 ------ 

884 UserExpressionError 

885 Raised when user expression cannot be parsed. 

886 OutputExistsError 

887 Raised when output datasets already exist. 

888 Exception 

889 Other exceptions types may be raised by underlying registry 

890 classes. 

891 """ 

892 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

893 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

894 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

895 instrument = pipeline.getInstrument() 

896 if isinstance(instrument, str): 

897 instrument = doImport(instrument) 

898 if instrument is not None: 

899 dataId = DataCoordinate.standardize(instrument=instrument.getName(), 

900 universe=self.registry.dimensions) 

901 else: 

902 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

903 with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId) as commonDataIds: 

904 scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds, 

905 skipExisting=self.skipExisting, 

906 clobberOutputs=self.clobberOutputs) 

907 return scaffolding.makeQuantumGraph(metadata=metadata)
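# Editorial example (hedged): a minimal sketch of how this builder is
# typically driven by higher-level tooling. The repository path, collection
# names, run name and query string below are all hypothetical.
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import Pipeline
#
#     butler = Butler("/path/to/repo")
#     pipeline = Pipeline.fromFile("my_pipeline.yaml")
#     builder = GraphBuilder(butler.registry, skipExisting=True, clobberOutputs=True)
#     qgraph = builder.makeGraph(pipeline,
#                                collections=["HSC/defaults"],
#                                run="u/someone/test-run",
#                                userQuery="instrument = 'HSC' AND visit = 903334")
#     # qgraph is a QuantumGraph that can be persisted or executed.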