# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Mapping
import logging


# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections, AdjustQuantumHelper
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph
from lsst.daf.butler import (
    CollectionSearch,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.utils import doImport

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])

class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionUniverse):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
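
        Examples
        --------
        Illustrative sketch only; ``someTaskScaffolding`` and the dataset
        type name below are placeholders, not objects defined by this
        module::

            single = someTaskScaffolding.initInputs.unpackSingleRefs()
            ref = single["placeholder_init_input"]  # string-name lookup works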

        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
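
        Examples
        --------
        Mirrors how this method is used later in this module; the ``task``
        and ``dataIds`` names are assumed to exist in the calling code::

            refs = list(task.inputs.extract(datasetType, dataIds))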

        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : `_TaskScaffolding`
        Back-reference to the helper object for the `PipelineTask` of which
        this quantum represents an execution.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` of which
    this quantum represents an execution.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction. Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        # If it raises NoWorkFound, there is a bug in the QG algorithm
        # or adjustQuantum is incorrectly trying to make a prerequisite
        # input behave like a regular input; adjustQuantum should only raise
        # NoWorkFound if a regular input is missing, and it shouldn't be
        # possible for us to have generated ``self`` if that's true.
        helper = AdjustQuantumHelper(inputs=allInputs, outputs=self.outputs.unpackMultiRefs())
        helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            inputs=helper.inputs,
            outputs=helper.outputs,
        )


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumSet(self) -> Set[Quantum]:
        """Create a `set` of `Quantum` from the information in ``self``.

        Returns
        -------
        nodes : `set` of `Quantum`
            The `Quantum` elements corresponding to this task.
        """
        return set(q.makeQuantum() for q in self.quanta.values())


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed. Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to categorize the pipeline's
        dataset types and to obtain the dimension universe.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, each corresponding
    to a different `_PipelineScaffolding` method:

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites). We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate. We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects (see the sketch below).
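
    The intended calling sequence, paraphrasing `GraphBuilder.makeGraph`
    below; the ``registry``, ``collections``, ``run``, ``userQuery``, and
    ``dataId`` values are assumed to be supplied by the caller::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        with scaffolding.connectDataIds(registry, collections, userQuery,
                                        dataId) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run,
                                           commonDataIds)
        graph = scaffolding.makeQuantumGraph()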

    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes. These are the ones we'll include in the big join
        # query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task. Note that there's only
        # one nested dictionary of refs for each DatasetType, shared by
        # _PipelineScaffolding and all _TaskScaffoldings that reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the tasks
    in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery, externalDataId):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the nested
        dictionaries of the `_DatasetDict` attributes (except for those in
        `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets. May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        userQuery : `str` or `None`
            User-provided expression to limit the data IDs processed.
        externalDataId : `DataCoordinate`
            Externally-provided data ID that should be used to restrict the
            results, just as if these constraints had been included via
            ``AND`` in ``userQuery``. This includes (at least) any instrument
            named in the pipeline definition.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`. Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs. We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   dataId=externalDataId,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result row,
                # noting that we might have created some already.
                # We remember both those that already existed and those that we
                # create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it. The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them); assigning by
                    # data ID makes those duplicates harmless.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExistingIn=None,
                           clobberOutputs=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method resolves the `DatasetRef` instances held in the nested
        `_DatasetDict` dictionaries (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets. May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExistingIn
            Expressions representing the collections to search for existing
            output datasets that should be skipped. May be any of the types
            accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
            `None` or an empty string/sequence disables skipping.
        clobberOutputs : `bool`, optional
            If `True` (default), allow quanta to be created even if outputs
            exist; this requires the same behavior to be enabled when
            executing. If ``skipExistingIn`` is not `None`, completed quanta
            (those with metadata, or all outputs if there is no metadata
            dataset configured) will be skipped rather than clobbered.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExistingIn`` does not include the output run, or if only
            some outputs are present and ``clobberOutputs`` is `False`.
        """
        skipCollections: Optional[CollectionSearch] = None
        skipExistingInRun = False
        if skipExistingIn:
            skipCollections = CollectionSearch.fromExpression(skipExistingIn)
            if run:
                # As an optimization, check the explicit list of names first.
                skipExistingInRun = run in skipCollections.explicitNames()
                if not skipExistingInRun:
                    # Need to flatten the expression and check again.
                    skipExistingInRun = run in registry.queryCollections(
                        skipExistingIn,
                        collectionTypes=CollectionType.RUN,
                    )

        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None or skipCollections is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                subset = commonDataIds.subset(datasetType.dimensions, unique=True)

                # Look at the RUN collection first.
                if run is not None:
                    resolvedRefQueryResults = subset.findDatasets(
                        datasetType,
                        collections=run,
                        findFirst=True
                    )
                    for resolvedRef in resolvedRefQueryResults:
                        # TODO: we could easily support per-DatasetType
                        # skipExisting and I could imagine that being useful -
                        # it's probably required in order to support writing
                        # initOutputs before QuantumGraph generation.
                        assert resolvedRef.dataId in refs
                        if not (skipExistingInRun or isInit or clobberOutputs):
                            raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                    f"output RUN collection '{run}' with data ID"
                                                    f" {resolvedRef.dataId}.")

                # Then check skipExistingIn too; the case where the RUN
                # collection is part of it is handled above.
                if skipCollections is not None:
                    resolvedRefQueryResults = subset.findDatasets(
                        datasetType,
                        collections=skipCollections,
                        findFirst=True
                    )
                    for resolvedRef in resolvedRefQueryResults:
                        assert resolvedRef.dataId in refs
                        refs[resolvedRef.dataId] = resolvedRef

        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                findFirst=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsFailed = []
            dataIdsSucceeded = []
            for quantum in task.quanta.values():
                # Process output datasets only if skipExistingIn is not None
                # or there is a run to look for outputs in and clobberOutputs
                # is True. Note that if skipExistingIn is None, any output
                # datasets that already exist would have already caused an
                # exception to be raised. We never update the DatasetRefs in
                # the quantum because those should never be resolved.
                if skipCollections is not None or (run is not None and clobberOutputs):
                    resolvedRefs = []
                    unresolvedRefs = []
                    haveMetadata = False
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                                if datasetType.name == task.taskDef.metadataDatasetName:
                                    haveMetadata = True
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if haveMetadata or not unresolvedRefs:
                            dataIdsSucceeded.append(quantum.dataId)
                            if skipCollections is not None:
                                continue
                        else:
                            dataIdsFailed.append(quantum.dataId)
                            if not clobberOutputs:
                                raise OutputExistsError(
                                    f"Quantum {quantum.dataId} of task with label "
                                    f"'{quantum.task.taskDef.label}' has some outputs that exist "
                                    f"({resolvedRefs}) "
                                    f"and others that don't ({unresolvedRefs}), with no metadata output, "
                                    "and clobbering outputs was not enabled."
                                )
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we queried
                # for originally, because we want to permit those data ID
                # values to differ across quanta and dataset types.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        # PipelineTask has provided its own function to do the
                        # lookup. This always takes precedence.
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    elif (datasetType.isCalibration()
                            and datasetType.dimensions <= quantum.dataId.graph
                            and quantum.dataId.graph.temporal):
                        # This is a master calibration lookup, which we have to
                        # handle specially because the query system can't do a
                        # temporal join on a non-dimension-based timespan yet.
                        timespan = quantum.dataId.timespan
                        try:
                            refs = [registry.findDataset(datasetType, quantum.dataId,
                                                         collections=collections,
                                                         timespan=timespan)]
                        except KeyError:
                            # This dataset type is not present in the registry,
                            # which just means there are no datasets here.
                            refs = []
                    else:
                        # Most general case.
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           findFirst=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs
                                                               if ref is not None})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsSucceeded:
                if skipCollections is not None:
                    _LOG.debug("Pruning %d successful quanta for task with label '%s' because all of their "
                               "outputs exist or metadata was written successfully.",
                               len(dataIdsSucceeded), task.taskDef.label)
                    for dataId in dataIdsSucceeded:
                        del task.quanta[dataId]
                elif clobberOutputs:
                    _LOG.info("Found %d successful quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsSucceeded),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")
            if dataIdsFailed:
                if clobberOutputs:
                    _LOG.info("Found %d failed/incomplete quanta for task with label '%s' "
                              "that will need to be clobbered during execution.",
                              len(dataIdsFailed),
                              task.taskDef.label)
                else:
                    raise AssertionError("OutputExistsError should have already been raised.")

    def makeQuantumGraph(self, metadata: Optional[Mapping[str, Any]] = None):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Parameters
        ----------
        metadata : Optional Mapping of `str` to primitives
            This is an optional parameter of extra data to carry with the
            graph. Entries in this mapping should be serializable to JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph({task.taskDef: task.makeQuantumSet() for task in self.tasks}, metadata=metadata)
        return graph



# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by the graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID and dataset
        queries.
    skipExistingIn
        Expressions representing the collections to search for existing
        output datasets that should be skipped. May be any of the types
        accepted by `lsst.daf.butler.CollectionSearch.fromExpression`.
    clobberOutputs : `bool`, optional
        If `True` (default), allow quanta to be created even if partial
        outputs exist; this requires the same behavior to be enabled when
        executing.
    """

    def __init__(self, registry, skipExistingIn=None, clobberOutputs=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExistingIn = skipExistingIn
        self.clobberOutputs = clobberOutputs

    def makeGraph(self, pipeline, collections, run, userQuery,
                  metadata: Optional[Mapping[str, Any]] = None):
        """Create an execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections
            Expressions representing the collections to search for input
            datasets. May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines a user-provided selection for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.
        metadata : Optional Mapping of `str` to primitives
            This is an optional parameter of extra data to carry with the
            graph. Entries in this mapping should be serializable to JSON.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed `QuantumGraph`.

        Raises
        ------
        UserExpressionError
            Raised when the user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
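
        Examples
        --------
        A minimal, illustrative sketch; the repository path, pipeline file,
        collection names, and query below are placeholders, not values
        defined by this module::

            from lsst.daf.butler import Butler
            from lsst.pipe.base import GraphBuilder, Pipeline

            butler = Butler("/path/to/repo")               # assumed repository
            pipeline = Pipeline.fromFile("pipeline.yaml")  # assumed pipeline file
            builder = GraphBuilder(butler.registry)
            qgraph = builder.makeGraph(
                pipeline,
                collections=["HSC/defaults"],              # assumed input collections
                run="u/someone/test-run",                  # assumed output RUN name
                userQuery="instrument = 'HSC' AND visit = 12345",
            )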

        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)
        if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites):
            raise ValueError("Pipeline requires input datasets but no input collections provided.")
        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        if instrument is not None:
            dataId = DataCoordinate.standardize(instrument=instrument.getName(),
                                                universe=self.registry.dimensions)
        else:
            dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
        with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExistingIn=self.skipExistingIn,
                                           clobberOutputs=self.clobberOutputs)
        return scaffolding.makeQuantumGraph(metadata=metadata)