# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import itertools
from collections import ChainMap
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, Iterable, Iterator, List
import logging


# -----------------------------
# Imports for other modules --
# -----------------------------
from .connections import iterConnections
from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline
from .graph import QuantumGraph, QuantumGraphTaskNodes
from lsst.daf.butler import (
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
    NamedKeyDict,
    Quantum,
)
from lsst.daf.butler.registry.queries.exprParser import ParseError, ParserYacc, TreeVisitor
from lsst.utils import doImport

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])


class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]):
    """A custom dictionary that maps `DatasetType` to a nested dictionary of
    the known `DatasetRef` instances of that type.

    Parameters
    ----------
    args
        Positional arguments are forwarded to the `dict` constructor.
    universe : `DimensionUniverse`
        Universe of all possible dimensions.
    """
    def __init__(self, *args, universe: DimensionGraph):
        super().__init__(*args)
        self.universe = universe

    @classmethod
    def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *,
                         universe: DimensionUniverse) -> _DatasetDict:
        """Construct a dictionary from a flat iterable of `DatasetType` keys.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be empty
            dictionaries.
        universe : `DimensionUniverse`
            Universe of all possible dimensions.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new `_DatasetDict` instance.
        """
        return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe)

    @classmethod
    def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict
                   ) -> _DatasetDict:
        """Return a new dictionary by extracting items corresponding to the
        given keys from one or more existing dictionaries.

        Parameters
        ----------
        datasetTypes : `iterable` of `DatasetType`
            DatasetTypes to use as keys for the dict. Values will be obtained
            by lookups against ``first`` and ``rest``.
        first : `_DatasetDict`
            Another dictionary from which to extract values.
        rest
            Additional dictionaries from which to extract values.

        Returns
        -------
        dictionary : `_DatasetDict`
            A new dictionary instance.
        """
        combined = ChainMap(first, *rest)
        return cls({datasetType: combined[datasetType] for datasetType in datasetTypes},
                   universe=first.universe)

    @property
    def dimensions(self) -> DimensionGraph:
        """The union of all dimensions used by all dataset types in this
        dictionary, including implied dependencies (`DimensionGraph`).
        """
        base = self.universe.empty
        if len(self) == 0:
            return base
        return base.union(*[datasetType.dimensions for datasetType in self.keys()])

    def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]:
        """Unpack nested single-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `DatasetRef` values.

        This method assumes that each nested dictionary contains exactly one
        item, as is the case for all "init" datasets.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to `DatasetRef`, with both
            `DatasetType` instances and string names usable as keys.
        """
        def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef:
            ref, = refs.values()
            return ref
        return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()})

    def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Unpack nested multi-element `DatasetRef` dicts into a new
        mapping with `DatasetType` keys and `list` of `DatasetRef` values.

        Returns
        -------
        dictionary : `NamedKeyDict`
            Dictionary mapping `DatasetType` to a `list` of `DatasetRef`, with
            both `DatasetType` instances and string names usable as keys.
        """
        return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()})

    def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate]
                ) -> Iterator[DatasetRef]:
        """Iterate over the contained `DatasetRef` instances that match the
        given `DatasetType` and data IDs.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type to match.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Data IDs to match.

        Returns
        -------
        refs : `Iterator` [ `DatasetRef` ]
            DatasetRef instances for which ``ref.datasetType == datasetType``
            and ``ref.dataId`` is in ``dataIds``.
        """
        refs = self[datasetType]
        return (refs[dataId] for dataId in dataIds)
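

# A minimal illustrative sketch of the nested structure above (not executed
# here; the ``registry`` and ``datasetTypes`` objects are assumed to come from
# an existing data repository):
#
#     dd = _DatasetDict.fromDatasetTypes(datasetTypes, universe=registry.dimensions)
#     dd[someDatasetType][someDataId] = DatasetRef(someDatasetType, someDataId)
#
# i.e. {DatasetType: {DataCoordinate: DatasetRef}}. Once every nested dict
# holds exactly one entry (as for "init" datasets), unpackSingleRefs() flattens
# it to {DatasetType: DatasetRef}; unpackMultiRefs() flattens it to
# {DatasetType: [DatasetRef, ...]} without that restriction.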


class _QuantumScaffolding:
    """Helper class aggregating information about a `Quantum`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    task : _TaskScaffolding
        Back-reference to the helper object for the `PipelineTask` this
        quantum represents an execution of.
    dataId : `DataCoordinate`
        Data ID for this quantum.
    """
    def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate):
        self.task = task
        self.dataId = dataId
        self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe)
        self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe)
        self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(),
                                                           universe=dataId.universe)

    __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites")

    def __repr__(self):
        return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)"

    task: _TaskScaffolding
    """Back-reference to the helper object for the `PipelineTask` this quantum
    represents an execution of.
    """

    dataId: DataCoordinate
    """Data ID for this quantum.
    """

    inputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` inputs to this quantum.

    This is initialized to map each `DatasetType` to an empty dictionary at
    construction. Those nested dictionaries are populated (with data IDs as
    keys) with unresolved `DatasetRef` instances in
    `_PipelineScaffolding.connectDataIds`.
    """

    outputs: _DatasetDict
    """Nested dictionary containing `DatasetRef` outputs of this quantum.
    """

    prerequisites: _DatasetDict
    """Nested dictionary containing `DatasetRef` prerequisite inputs to this
    quantum.
    """

    def makeQuantum(self) -> Quantum:
        """Transform the scaffolding object into a true `Quantum` instance.

        Returns
        -------
        quantum : `Quantum`
            An actual `Quantum` instance.
        """
        allInputs = self.inputs.unpackMultiRefs()
        allInputs.update(self.prerequisites.unpackMultiRefs())
        # Give the task's Connections class an opportunity to remove some
        # inputs, or complain if they are unacceptable.
        config = self.task.taskDef.config
        connections = config.connections.ConnectionsClass(config=config)
        # This will raise if one of the check conditions is not met, which is
        # the intended behavior.
        allInputs = connections.adjustQuantum(allInputs)
        return Quantum(
            taskName=self.task.taskDef.taskName,
            taskClass=self.task.taskDef.taskClass,
            dataId=self.dataId,
            initInputs=self.task.initInputs.unpackSingleRefs(),
            predictedInputs=allInputs,
            outputs=self.outputs.unpackMultiRefs(),
        )
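

# Illustrative sketch only: a task's connections class can trim or veto the
# inputs passed to ``adjustQuantum`` above. The override below is a
# hypothetical example (the parameter name and exact base-class contract are
# assumptions, not defined in this module); it receives the mapping of
# DatasetType -> list of DatasetRef built in makeQuantum and must return it,
# possibly modified, or raise if the inputs are unacceptable.
#
#     class MyTaskConnections(pipeBase.PipelineTaskConnections,
#                             dimensions=("instrument", "visit")):
#         def adjustQuantum(self, datasetRefMap):
#             datasetRefMap = super().adjustQuantum(datasetRefMap)
#             if not datasetRefMap["my_required_input"]:
#                 raise ValueError("No inputs found for this quantum.")
#             return datasetRefMap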


@dataclass
class _TaskScaffolding:
    """Helper class aggregating information about a `PipelineTask`, used when
    constructing a `QuantumGraph`.

    See `_PipelineScaffolding` for a top-down description of the full
    scaffolding data structure.

    Parameters
    ----------
    taskDef : `TaskDef`
        Data structure that identifies the task class and its config.
    parent : `_PipelineScaffolding`
        The parent data structure that will hold the instance being
        constructed.
    datasetTypes : `TaskDatasetTypes`
        Data structure that categorizes the dataset types used by this task.
    """
    def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes):
        universe = parent.dimensions.universe
        self.taskDef = taskDef
        self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions)
        assert self.dimensions.issubset(parent.dimensions)
        # Initialize _DatasetDicts as subsets of the one or two
        # corresponding dicts in the parent _PipelineScaffolding.
        self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs,
                                                  parent.initIntermediates)
        self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates,
                                                   parent.initOutputs)
        self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates)
        self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs)
        self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites)
        self.dataIds = set()
        self.quanta = {}

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_TaskScaffolding(taskDef={self.taskDef}, ...)"

    taskDef: TaskDef
    """Data structure that identifies the task class and its config
    (`TaskDef`).
    """

    dimensions: DimensionGraph
    """The dimensions of a single `Quantum` of this task (`DimensionGraph`).
    """

    initInputs: _DatasetDict
    """Dictionary containing information about datasets used to construct this
    task (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Dictionary containing information about datasets produced as a
    side-effect of constructing this task (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Dictionary containing information about datasets used as regular,
    graph-constraining inputs to this task (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Dictionary containing information about datasets produced by this task
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Dictionary containing information about input datasets that must be
    present in the repository before any Pipeline containing this task is run
    (`_DatasetDict`).
    """

    quanta: Dict[DataCoordinate, _QuantumScaffolding]
    """Dictionary mapping data ID to a scaffolding object for the Quantum of
    this task with that data ID.
    """

    def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes:
        """Create a `QuantumGraphTaskNodes` instance from the information in
        ``self``.

        Returns
        -------
        nodes : `QuantumGraphTaskNodes`
            The `QuantumGraph` elements corresponding to this task.
        """
        return QuantumGraphTaskNodes(
            taskDef=self.taskDef,
            quanta=[q.makeQuantum() for q in self.quanta.values()],
            initInputs=self.initInputs.unpackSingleRefs(),
            initOutputs=self.initOutputs.unpackSingleRefs(),
        )


@dataclass
class _PipelineScaffolding:
    """A helper data structure that organizes the information involved in
    constructing a `QuantumGraph` for a `Pipeline`.

    Parameters
    ----------
    pipeline : `Pipeline`
        Sequence of tasks from which a graph is to be constructed. Must
        have nested task classes already imported.
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used to categorize the pipeline's
        dataset types and to define the dimension universe.

    Notes
    -----
    The scaffolding data structure contains nested data structures for both
    tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset
    data structures are shared between the pipeline-level structure (which
    aggregates all datasets and categorizes them from the perspective of the
    complete pipeline) and the individual tasks that use them as inputs and
    outputs.

    `QuantumGraph` construction proceeds in four steps, with each
    corresponding to a different `_PipelineScaffolding` method (a sketch of
    the full sequence appears after this class):

    1. When `_PipelineScaffolding` is constructed, we extract and categorize
       the DatasetTypes used by the pipeline (delegating to
       `PipelineDatasetTypes.fromPipeline`), then use these to construct the
       nested `_TaskScaffolding` and `_DatasetDict` objects.

    2. In `connectDataIds`, we construct and run the "Big Join Query", which
       returns related tuples of all dimensions used to identify any regular
       input, output, and intermediate datasets (not prerequisites). We then
       iterate over these tuples of related dimensions, identifying the
       subsets that correspond to distinct data IDs for each task and dataset
       type, and then create `_QuantumScaffolding` objects.

    3. In `resolveDatasetRefs`, we run follow-up queries against all of the
       dataset data IDs previously identified, transforming unresolved
       DatasetRefs into resolved DatasetRefs where appropriate. We then look
       up prerequisite datasets for all quanta.

    4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of
       per-task `_QuantumScaffolding` objects.
    """
    def __init__(self, pipeline, *, registry):
        _LOG.debug("Initializing data structures for QuantumGraph generation.")
        self.tasks = []
        # Aggregate and categorize the DatasetTypes in the Pipeline.
        datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry)
        # Construct dictionaries that map those DatasetTypes to structures
        # that will (later) hold additional information about them.
        for attr in ("initInputs", "initIntermediates", "initOutputs",
                     "inputs", "intermediates", "outputs", "prerequisites"):
            setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr),
                                                              universe=registry.dimensions))
        # Aggregate all dimensions for all non-init, non-prerequisite
        # DatasetTypes. These are the ones we'll include in the big join query.
        self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions,
                                                       self.outputs.dimensions)
        # Construct scaffolding nodes for each Task, and add backreferences
        # to the Task from each DatasetScaffolding node.
        # Note that there's only one scaffolding node for each DatasetType,
        # shared by _PipelineScaffolding and all _TaskScaffoldings that
        # reference it.
        if isinstance(pipeline, Pipeline):
            pipeline = pipeline.toExpandedPipeline()
        self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes)
                      for taskDef, taskDatasetTypes in zip(pipeline,
                                                           datasetTypes.byTask.values())]

    def __repr__(self):
        # Default dataclass-injected __repr__ gets caught in an infinite loop
        # because of back-references.
        return f"_PipelineScaffolding(tasks={self.tasks}, ...)"

    tasks: List[_TaskScaffolding]
    """Scaffolding data structures for each task in the pipeline
    (`list` of `_TaskScaffolding`).
    """

    initInputs: _DatasetDict
    """Datasets consumed but not produced when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    initIntermediates: _DatasetDict
    """Datasets that are both consumed and produced when constructing the
    tasks in this pipeline (`_DatasetDict`).
    """

    initOutputs: _DatasetDict
    """Datasets produced but not consumed when constructing the tasks in this
    pipeline (`_DatasetDict`).
    """

    inputs: _DatasetDict
    """Datasets that are consumed but not produced when running this pipeline
    (`_DatasetDict`).
    """

    intermediates: _DatasetDict
    """Datasets that are both produced and consumed when running this pipeline
    (`_DatasetDict`).
    """

    outputs: _DatasetDict
    """Datasets produced but not consumed when running this pipeline
    (`_DatasetDict`).
    """

    prerequisites: _DatasetDict
    """Datasets that are consumed when running this pipeline and looked up
    per-Quantum when generating the graph (`_DatasetDict`).
    """

    dimensions: DimensionGraph
    """All dimensions used by any regular input, intermediate, or output
    (not prerequisite) dataset; the set of dimensions used in the "Big Join
    Query" (`DimensionGraph`).

    This is required to be a superset of all task quantum dimensions.
    """

    @contextmanager
    def connectDataIds(self, registry, collections, userQuery):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.quanta` and the nested
        `DatasetRef` dictionaries in the `_DatasetDict` attributes (except
        for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        userQuery : `str`, optional
            User-provided expression to limit the data IDs processed.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`. Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(self.initInputs.items(),
                                                 self.initIntermediates.items(),
                                                 self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs. We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(self.dimensions,
                                   datasets=list(self.inputs),
                                   collections=collections,
                                   where=userQuery,
                                   ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug("Iterating over query results to associate quanta with datasets.")
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            n = 0
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result
                # row, noting that we might have created some already.
                # We remember both those that already existed and those that
                # we create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(),
                                                         self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task, dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it. The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them); since the
                    # nested containers are dicts keyed on data ID, the
                    # duplicates are simply overwritten.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds

    def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True):
        """Perform follow-up queries for each dataset data ID produced in
        `connectDataIds`.

        This method resolves the `DatasetRef` entries previously created in
        `connectDataIds` and looks up prerequisite inputs for each quantum.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            Result of a previous call to `connectDataIds`.
        skipExisting : `bool`, optional
            If `True` (default), a Quantum is not created if all its outputs
            already exist in ``run``. Ignored if ``run`` is `None`.

        Raises
        ------
        OutputExistsError
            Raised if an output dataset already exists in the output run
            and ``skipExisting`` is `False`. The case where some but not all
            of a quantum's outputs are present and ``skipExisting`` is `True`
            cannot be identified at this stage, and is handled later in this
            method instead.
        """
        # Look up [init] intermediate and output datasets in the output
        # collection, if there is an output collection.
        if run is not None:
            for datasetType, refs in itertools.chain(self.initIntermediates.items(),
                                                     self.initOutputs.items(),
                                                     self.intermediates.items(),
                                                     self.outputs.items()):
                _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.",
                           len(refs), datasetType.name)
                isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs
                resolvedRefQueryResults = commonDataIds.subset(
                    datasetType.dimensions,
                    unique=True
                ).findDatasets(
                    datasetType,
                    collections=run,
                    deduplicate=True
                )
                for resolvedRef in resolvedRefQueryResults:
                    # TODO: we could easily support per-DatasetType
                    # skipExisting and I could imagine that being useful - it's
                    # probably required in order to support writing initOutputs
                    # before QuantumGraph generation.
                    assert resolvedRef.dataId in refs
                    if skipExisting or isInit:
                        refs[resolvedRef.dataId] = resolvedRef
                    else:
                        raise OutputExistsError(f"Output dataset {datasetType.name} already exists in "
                                                f"output RUN collection '{run}' with data ID"
                                                f" {resolvedRef.dataId}.")
        # Look up input and initInput datasets in the input collection(s).
        for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()):
            _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name)
            resolvedRefQueryResults = commonDataIds.subset(
                datasetType.dimensions,
                unique=True
            ).findDatasets(
                datasetType,
                collections=collections,
                deduplicate=True
            )
            dataIdsNotFoundYet = set(refs.keys())
            for resolvedRef in resolvedRefQueryResults:
                dataIdsNotFoundYet.discard(resolvedRef.dataId)
                refs[resolvedRef.dataId] = resolvedRef
            if dataIdsNotFoundYet:
                raise RuntimeError(
                    f"{len(dataIdsNotFoundYet)} dataset(s) of type "
                    f"'{datasetType.name}' was/were present in a previous "
                    f"query, but could not be found now. "
                    f"This is either a logic bug in QuantumGraph generation "
                    f"or the input collections have been modified since "
                    f"QuantumGraph generation began."
                )
        # Copy the resolved DatasetRefs to the _QuantumScaffolding objects,
        # replacing the unresolved refs there, and then look up prerequisites.
        for task in self.tasks:
            _LOG.debug(
                "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.",
                len(task.quanta),
                task.taskDef.label
            )
            lookupFunctions = {
                c.name: c.lookupFunction
                for c in iterConnections(task.taskDef.connections, "prerequisiteInputs")
                if c.lookupFunction is not None
            }
            dataIdsToSkip = []
            for quantum in task.quanta.values():
                # Process output datasets only if there is a run to look for
                # outputs in and skipExisting is True. Note that if
                # skipExisting is False, any output datasets that already exist
                # would have already caused an exception to be raised.
                # We never update the DatasetRefs in the quantum because those
                # should never be resolved.
                if run is not None and skipExisting:
                    resolvedRefs = []
                    unresolvedRefs = []
                    for datasetType, originalRefs in quantum.outputs.items():
                        for ref in task.outputs.extract(datasetType, originalRefs.keys()):
                            if ref.id is not None:
                                resolvedRefs.append(ref)
                            else:
                                unresolvedRefs.append(ref)
                    if resolvedRefs:
                        if unresolvedRefs:
                            raise OutputExistsError(
                                f"Quantum {quantum.dataId} of task with label "
                                f"'{task.taskDef.label}' has some outputs that exist ({resolvedRefs}) "
                                f"and others that don't ({unresolvedRefs})."
                            )
                        else:
                            # All outputs are already present; skip this
                            # quantum and continue to the next.
                            dataIdsToSkip.append(quantum.dataId)
                            continue
                # Update the input DatasetRefs to the resolved ones we already
                # searched for.
                for datasetType, refs in quantum.inputs.items():
                    for ref in task.inputs.extract(datasetType, refs.keys()):
                        refs[ref.dataId] = ref
                # Look up prerequisite datasets in the input collection(s).
                # These may have dimensions that extend beyond those we queried
                # for originally, because we want to permit those data ID
                # values to differ across quanta and dataset types.
                # For example, the same quantum may have a flat and bias with
                # a different calibration_label, or a refcat with a skypix
                # value that overlaps the quantum's data ID's region, but not
                # the user expression used for the initial query.
                for datasetType in task.prerequisites:
                    lookupFunction = lookupFunctions.get(datasetType.name)
                    if lookupFunction is not None:
                        refs = list(
                            lookupFunction(datasetType, registry, quantum.dataId, collections)
                        )
                    else:
                        refs = list(registry.queryDatasets(datasetType,
                                                           collections=collections,
                                                           dataId=quantum.dataId,
                                                           deduplicate=True).expanded())
                    quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs})
            # Actually remove any quanta that we decided to skip above.
            if dataIdsToSkip:
                _LOG.debug("Pruning %d quanta for task with label '%s' because all of their outputs exist.",
                           len(dataIdsToSkip), task.taskDef.label)
                for dataId in dataIdsToSkip:
                    del task.quanta[dataId]

    def makeQuantumGraph(self):
        """Create a `QuantumGraph` from the quanta already present in
        the scaffolding data structure.

        Returns
        -------
        graph : `QuantumGraph`
            The full `QuantumGraph`.
        """
        graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks)
        graph.initInputs = self.initInputs.unpackSingleRefs()
        graph.initOutputs = self.initOutputs.unpackSingleRefs()
        graph.initIntermediates = self.initIntermediates.unpackSingleRefs()
        return graph
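

# A compact sketch of the four-step sequence described in the
# _PipelineScaffolding docstring above (illustrative only; it mirrors what
# GraphBuilder.makeGraph does below, and assumes ``registry``, ``pipeline``,
# ``collections``, ``run``, and ``userQuery`` are provided by the caller):
#
#     scaffolding = _PipelineScaffolding(pipeline, registry=registry)      # step 1
#     with scaffolding.connectDataIds(registry, collections,
#                                     userQuery) as commonDataIds:         # step 2
#         scaffolding.resolveDatasetRefs(registry, collections, run,
#                                        commonDataIds, skipExisting=True) # step 3
#     qgraph = scaffolding.makeQuantumGraph()                              # step 4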


class _InstrumentFinder(TreeVisitor):
    """Implementation of `TreeVisitor` that looks for instrument names.

    An instrument restriction should be specified as a boolean expression of
    the form

        instrument = 'string'
        'string' = instrument

    so we only need to find a binary operator where the operator is "=",
    one side is a string literal, and the other side is an identifier.
    All visit methods return a tuple of (type, value); non-useful nodes
    return `None` for both type and value.
    """
    def __init__(self):
        self.instruments = []

    def visitNumericLiteral(self, value, node):
        # do not care about numbers
        return (None, None)

    def visitStringLiteral(self, value, node):
        # return type and value
        return ("str", value)

    def visitTimeLiteral(self, value, node):
        # do not care about these
        return (None, None)

    def visitRangeLiteral(self, start, stop, stride, node):
        # do not care about these
        return (None, None)

    def visitIdentifier(self, name, node):
        if name.lower() == "instrument":
            return ("id", "instrument")
        return (None, None)

    def visitUnaryOp(self, operator, operand, node):
        # do not care about these
        return (None, None)

    def visitBinaryOp(self, operator, lhs, rhs, node):
        if operator == "=":
            if lhs == ("id", "instrument") and rhs[0] == "str":
                self.instruments.append(rhs[1])
            elif rhs == ("id", "instrument") and lhs[0] == "str":
                self.instruments.append(lhs[1])
        return (None, None)

    def visitIsIn(self, lhs, values, not_in, node):
        # do not care about these
        return (None, None)

    def visitParens(self, expression, node):
        # do not care about these
        return (None, None)


def _findInstruments(queryStr):
    """Return the list of instrument names used in equality tests in the
    given user query string, using `_InstrumentFinder`.
    """
    parser = ParserYacc()
    finder = _InstrumentFinder()
    try:
        tree = parser.parse(queryStr)
    except ParseError as exc:
        raise ValueError(f"failed to parse query expression: {queryStr}") from exc
    tree.visit(finder)
    return finder.instruments
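

# Illustrative example of _findInstruments (the query values are hypothetical):
#
#     _findInstruments("instrument = 'HSC' AND visit = 12345")
#     # -> ['HSC']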


# ------------------------
# Exported definitions --
# ------------------------


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """
    pass


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """GraphBuilder class is responsible for building a task execution graph
    from a Pipeline.

    Parameters
    ----------
    registry : `~lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID and dataset
        queries.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """

    def __init__(self, registry, skipExisting=True):
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections : `lsst.daf.butler.CollectionSearch`
            Object representing the collections to search for input datasets.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String which defines user-defined selection for registry; should
            be empty or `None` if there are no restrictions on data selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        instrumentName = instrument.getName() if instrument else None
        userQuery = self._verifyInstrumentRestriction(instrumentName, userQuery)

        with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                           skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()

    @staticmethod
    def _verifyInstrumentRestriction(instrumentName, query):
        """Add an instrument restriction to the query if it does not have
        one, and verify that, if an instrument name is given, the query
        contains no conflicting instrument restrictions.

        Parameters
        ----------
        instrumentName : `str`
            The name of the instrument that should appear in the query.
        query : `str`
            The query string.

        Returns
        -------
        query : `str`
            The query string with the instrument added to it if needed.

        Raises
        ------
        RuntimeError
            If the pipeline names an instrument and the query contains more
            than one instrument or the name of the instrument in the query
            does not match the instrument named by the pipeline.
        """

        if not instrumentName:
            return query
        queryInstruments = _findInstruments(query)
        if len(queryInstruments) > 1:
            raise RuntimeError(f"When the pipeline has an instrument (\"{instrumentName}\") the query must "
                               "have zero instruments or one instrument that matches the pipeline. "
                               f"Found these instruments in the query: {queryInstruments}.")
        if not queryInstruments:
            # There is not an instrument in the query, add it:
            restriction = f"instrument = '{instrumentName}'"
            _LOG.debug("Adding restriction \"%s\" to query.", restriction)
            query = f"{restriction} AND ({query})"
        elif queryInstruments[0] != instrumentName:
            # Since there is an instrument in the query, it should match
            # the instrument in the pipeline.
            raise RuntimeError(f"The instrument named in the query (\"{queryInstruments[0]}\") does not "
                               f"match the instrument named by the pipeline (\"{instrumentName}\")")
        return query
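

# A minimal usage sketch for GraphBuilder (illustrative only; ``butler``,
# ``pipeline``, ``collections``, and the run name are assumed to come from the
# caller's environment, e.g. a command-line driver):
#
#     builder = GraphBuilder(butler.registry, skipExisting=True)
#     qgraph = builder.makeGraph(pipeline, collections, run="u/someone/run",
#                                userQuery="visit = 12345")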