# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping
import logging


# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections, AdjustQuantumHelper
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph
from lsst.daf.butler import (
    CollectionSearch,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.utils import doImport

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """

    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
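
        Examples
        --------
        An illustrative sketch, assuming this module's namespace and the
        default dimension universe; the dataset type name and storage class
        below are arbitrary choices, not names used elsewhere in this module.

        >>> from lsst.daf.butler import DatasetType, DimensionUniverse
        >>> universe = DimensionUniverse()
        >>> dsType = DatasetType("sourceTable", universe.extract(["instrument", "visit"]),
        ...                      "StructuredDataDict")
        >>> dd = _DatasetDict.fromDatasetTypes([dsType], universe=universe)
        >>> dd[dsType]
        {}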

        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict.  Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
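
        Examples
        --------
        An illustrative sketch, assuming this module's namespace and the
        default dimension universe; note that the nested dictionaries in the
        result are the same objects held by the source dictionaries.

        >>> from lsst.daf.butler import DatasetType, DimensionUniverse
        >>> universe = DimensionUniverse()
        >>> dsType = DatasetType("sourceTable", universe.extract(["instrument", "visit"]),
        ...                      "StructuredDataDict")
        >>> inputs = _DatasetDict.fromDatasetTypes([dsType], universe=universe)
        >>> intermediates = _DatasetDict.fromDatasetTypes([], universe=universe)
        >>> subset = _DatasetDict.fromSubset([dsType], inputs, intermediates)
        >>> subset[dsType] is inputs[dsType]
        True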

        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
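
        Examples
        --------
        An illustrative sketch, assuming this module's namespace and the
        default dimension universe; the dataset type name and storage class
        are arbitrary choices.

        >>> from lsst.daf.butler import (DataCoordinate, DatasetRef, DatasetType,
        ...                              DimensionUniverse)
        >>> universe = DimensionUniverse()
        >>> configType = DatasetType("someTask_config", universe.empty, "Config")
        >>> emptyDataId = DataCoordinate.makeEmpty(universe)
        >>> ref = DatasetRef(configType, emptyDataId)
        >>> dd = _DatasetDict.fromDatasetTypes([configType], universe=universe)
        >>> dd[configType][emptyDataId] = ref
        >>> dd.unpackSingleRefs()[configType] is ref
        True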

        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
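
        Examples
        --------
        An illustrative sketch, assuming this module's namespace and the same
        setup as in `unpackSingleRefs`; note that the result is a lazy
        generator, so missing data IDs only raise `KeyError` when iterated.

        >>> from lsst.daf.butler import (DataCoordinate, DatasetRef, DatasetType,
        ...                              DimensionUniverse)
        >>> universe = DimensionUniverse()
        >>> configType = DatasetType("someTask_config", universe.empty, "Config")
        >>> emptyDataId = DataCoordinate.makeEmpty(universe)
        >>> ref = DatasetRef(configType, emptyDataId)
        >>> dd = _DatasetDict.fromDatasetTypes([configType], universe=universe)
        >>> dd[configType][emptyDataId] = ref
        >>> list(dd.extract(configType, [emptyDataId])) == [ref]
        True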

        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this quantum
        represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction.  Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        # If it raises NoWorkFound, there is a bug in the QG algorithm
        # or adjustQuantum is incorrectly trying to make a prerequisite
        # input behave like a regular input; adjustQuantum should only raise
        # NoWorkFound if a regular input is missing, and it shouldn't be
        # possible for us to have generated ``self`` if that's true.
        helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
        helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            inputs=helper.inputs,
            outputs=helper.outputs,
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumSet(self) -> Set[Quantum]:
        """Create a `set` of `Quantum` from the information in ``self``.

        Returns
        -------
        nodes : `set` of `Quantum`
            The `Quantum` elements corresponding to this task.
        """
        return set(q.makeQuantum() for q in self.quanta.values())


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed.  Must
        have nested task classes already imported.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`).  The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites).  We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate.  We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
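
    Examples
    --------
    An illustrative sketch of the four-step sequence above, as it is driven
    by `GraphBuilder.makeGraph`; ``pipeline``, ``registry``, ``collections``,
    ``run``, and ``userQuery`` are assumed to be supplied by the caller and
    are not defined here::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        with scaffolding.connectDataIds(registry, collections, userQuery,
                                        emptyDataId) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run,
                                           commonDataIds)
        qgraph = scaffolding.makeQuantumGraph()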

    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes.  These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery, externalDataId):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        userQuery : `str` or `None`
            User-provided expression to limit the data IDs processed.
        externalDataId : `DataCoordinate`
            Externally-provided data ID that should be used to restrict the
            results, just as if these constraints had been included via
            ``AND`` in ``userQuery``.  This includes (at least) any instrument
            named in the pipeline definition.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.

        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   dataId=externalDataId,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result row,
                # noting that we might have created some already.
                # We remember both those that already existed and those that we
                # create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them), and we use
                    # sets to skip them.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            if n == 0:
                for message in commonDataIds.explain_no_results():
                    _LOG.warning(message)
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExistingIn=None,
                           clobberOutputs=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method populates `_DatasetScaffolding.refs` (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExistingIn
            Expressions representing the collections to search for existing
            output datasets that should be skipped.  May be any of the types
            accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
            `None` or an empty string/sequence disables skipping.
        clobberOutputs : `bool`, optional
            If `True` (default), allow quanta to be created even if outputs
            exist; this requires the same behavior to be enabled when
            executing.  If ``skipExistingIn`` is not `None`, completed quanta
            (those with metadata, or all outputs if there is no metadata
            dataset configured) will be skipped rather than clobbered.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExistingIn`` does not include the output run, or if only
            some outputs are present and ``clobberOutputs`` is `False`.

        """
        skipCollections: Optional[CollectionSearch] = None
        skipExistingInRun = False
        if skipExistingIn:
            skipCollections = CollectionSearch.fromExpression(skipExistingIn)
            if run:
                # as optimization check in the explicit list of names first
                skipExistingInRun = run in skipCollections.explicitNames()
                if not skipExistingInRun:
                    # need to flatten it and check again
                    skipExistingInRun = run in registry.queryCollections(
                        skipExistingIn,
                        collectionTypes=CollectionType.RUN,
                    )

        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None or skipCollections is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                subset = commonDataIds.subset(datasetType.dimensions, unique=True)

                # look at RUN collection first
                if run is not None:
                    resolvedRefQueryResults = subset.findDatasets(
                        datasetType,
                        collections=run,
                        findFirst=True
                    )
                    for resolvedRef in resolvedRefQueryResults:
                        # TODO: we could easily support per-DatasetType
                        # skipExisting and I could imagine that being useful -
                        # it's probably required in order to support writing
                        # initOutputs before QuantumGraph generation.
                        assert resolvedRef.dataId in refs
                        if not (skipExistingInRun or isInit or clobberOutputs):
                            raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                    f"output RUN collection '{run}' with data ID"
                                                    f" {resolvedRef.dataId}.")

                # And check skipExistingIn too; the case where the RUN
                # collection is in it is handled above.
                if skipCollections is not None:
                    resolvedRefQueryResults = subset.findDatasets(
                        datasetType,
                        collections=skipCollections,
                        findFirst=True
                    )
                    for resolvedRef in resolvedRefQueryResults:
                        assert resolvedRef.dataId in refs
                        refs[resolvedRef.dataId] = resolvedRef

        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                findFirst=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )

        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsFailed = []
            dataIdsSucceeded = []
            for quantum in task.quanta.values():
                # Process output datasets only if skipExistingIn is not None
                # or there is a run to look for outputs in and clobberOutputs
                # is True.  Note that if skipExistingIn is None, any output
                # datasets that already exist would have already caused an
                # exception to be raised.  We never update the DatasetRefs in
                # the quantum because those should never be resolved.
                if skipCollections is not None or (run is not None and clobberOutputs):
                    resolvedRefs = []
                    unresolvedRefs = []
                    haveMetadata = False
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                                if datasetType.name == task.taskDef.metadataDatasetName:
                                    haveMetadata = True
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if haveMetadata or not unresolvedRefs:
                            dataIdsSucceeded.append(quantum.dataId)
                            if skipCollections is not None:
                                continue
                        else:
                            dataIdsFailed.append(quantum.dataId)
                            if not clobberOutputs:
                                raise OutputExistsError(
                                    f"Quantum {quantum.dataId} of task with label "
                                    f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                    f"({resolvedRefs}) "
                                    f"and others that don't ({unresolvedRefs}), with no metadata output, "
                                    "and clobbering outputs was not enabled."
                                )
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we queried
                # for originally, because we want to permit those data ID
                # values to differ across quanta and dataset types.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # PipelineTask has provided its own function to do the
                        # lookup.  This always takes precedence.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    elif (datasetType.isCalibration()
                            and datasetType.dimensions <= quantum.dataId.graph
                            and quantum.dataId.graph.temporal):
                        # This is a master calibration lookup, which we have to
                        # handle specially because the query system can't do a
                        # temporal join on a non-dimension-based timespan yet.
                        timespan = quantum.dataId.timespan
                        try:
                            refs = [registry.findDataset(datasetType, quantum.dataId,
                                                         collections=collections,
                                                         timespan=timespan)]
                        except KeyError:
                            # This dataset type is not present in the registry,
                            # which just means there are no datasets here.
                            refs = []
                    else:
                        # Most general case.
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           findFirst=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
                                                               if ref is not None})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsSucceeded:
                if skipCollections is not None:
                    _LOG.debug("Pruning %d successful quanta for task with label '%s' because all of "
                               "their outputs exist or metadata was written successfully.",
                               len(dataIdsSucceeded), task.taskDef.label)
                    for dataId in dataIdsSucceeded:
                        del task.quanta[dataId]
                elif clobberOutputs:
                    _LOG.info("Found %d successful quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsSucceeded),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")
            if dataIdsFailed:
                if clobberOutputs:
                    _LOG.info("Found %d failed/incomplete quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsFailed),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")

    def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Parameters
        ----------
        metadata : Optional Mapping of `str` to primitives
            This is an optional parameter of extra data to carry with the
            graph.  Entries in this mapping should be able to be serialized in
            JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
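
        Examples
        --------
        A sketch of attaching free-form, JSON-serializable metadata to the
        generated graph; the keys and values shown are arbitrary examples,
        not names used elsewhere in this module::

            qgraph = scaffolding.makeQuantumGraph(
                metadata={"input_collections": ["HSC/raw/all"],
                          "user_query": "visit = 12345"}
            )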

        """
        graph = QuantumGraph({task.taskDef: task.makeQuantumSet() for task in self.tasks}, metadata=metadata)
        return graph


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets that should be skipped.  May be any of the types
        accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
    clobberOutputs : `bool`, optional
        If `True` (default), allow quanta to be created even if partial
        outputs exist; this requires the same behavior to be enabled when
        executing.
    """

    def __init__(self, registry, skipExistingIn=None, clobberOutputs=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExistingIn = skipExistingIn
        self.clobberOutputs = clobberOutputs

    def makeGraph(self, pipeline, collections, run, userQuery,
                  metadata: Optional[Mapping[str, Any]] = None):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines user-defined selection for registry; should
            be empty or `None` if there are no restrictions on data selection.
        metadata : Optional Mapping of `str` to primitives
            This is an optional parameter of extra data to carry with the
            graph.  Entries in this mapping should be able to be serialized in
            JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
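
        Examples
        --------
        A sketch of a typical call, assuming ``butler`` is an existing
        `lsst.daf.butler.Butler` and ``pipeline`` has already been loaded;
        the collection names and query string are illustrative only::

            builder = GraphBuilder(butler.registry)
            qgraph = builder.makeGraph(
                pipeline,
                collections=["HSC/defaults"],
                run="u/someone/processing-run",
                userQuery="instrument = 'HSC' AND visit = 12345",
            )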

        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
            raise ValueError("Pipeline requires input datasets but no input collections provided.")
        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        if instrument is not None:
            dataId = DataCoordinate.standardize(instrument=instrument.getName(),
                                                universe=self.registry.dimensions)
        else:
            dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
        with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExistingIn=self.skipExistingIn,
                                           clobberOutputs=self.clobberOutputs)
        return scaffolding.makeQuantumGraph(metadata=metadata)