1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ['GraphBuilder'] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32from collections import ChainMap 

33from contextlib import contextmanager 

34from dataclasses import dataclass 

35from typing import Dict, Iterable, Iterator, List 

36import logging 

37 

38 

39# ----------------------------- 

40# Imports for other modules -- 

41# ----------------------------- 

42from .connections import iterConnections 

43from .pipeline import PipelineDatasetTypes, TaskDatasetTypes, TaskDef, Pipeline 

44from .graph import QuantumGraph, QuantumGraphTaskNodes 

45from lsst.daf.butler import ( 

46 DataCoordinate, 

47 DatasetRef, 

48 DatasetType, 

49 DimensionGraph, 

50 DimensionUniverse, 

51 NamedKeyDict, 

52 Quantum, 

53) 

54from lsst.daf.butler.registry.queries.exprParser import ParseError, ParserYacc, TreeVisitor 

55from lsst.utils import doImport 

56 

57# ---------------------------------- 

58# Local non-exported definitions -- 

59# ---------------------------------- 

60 
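# Logger for this module; everything up to and including the first "." in
# ``__name__`` (the top-level package component) is stripped from the name.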

61_LOG = logging.getLogger(__name__.partition(".")[2]) 

62 

63 

64class _DatasetDict(NamedKeyDict[DatasetType, Dict[DataCoordinate, DatasetRef]]): 

65 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

66 the known `DatasetRef` instances of that type. 

67 

68 Parameters 

69 ---------- 

70 args 

71 Positional arguments are forwarded to the `dict` constructor. 

72 universe : `DimensionUniverse` 

73 Universe of all possible dimensions. 
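
Notes
-----
Each value is itself a dictionary keyed by `DataCoordinate`; a single ref
is looked up as ``self[datasetType][dataId]`` (see `extract`).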

74 """ 

75 def __init__(self, *args, universe: DimensionUniverse): 

76 super().__init__(*args) 

77 self.universe = universe 

78 

79 @classmethod 

80 def fromDatasetTypes(cls, datasetTypes: Iterable[DatasetType], *, 

81 universe: DimensionUniverse) -> _DatasetDict: 

82 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

83 

84 Parameters 

85 ---------- 

86 datasetTypes : `iterable` of `DatasetType` 

87 DatasetTypes to use as keys for the dict. Values will be empty 

88 dictionaries. 

89 universe : `DimensionUniverse` 

90 Universe of all possible dimensions. 

91 

92 Returns 

93 ------- 

94 dictionary : `_DatasetDict` 

95 A new `_DatasetDict` instance. 

96 """ 

97 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

98 

99 @classmethod 

100 def fromSubset(cls, datasetTypes: Iterable[DatasetType], first: _DatasetDict, *rest: _DatasetDict 

101 ) -> _DatasetDict: 

102 """Return a new dictionary by extracting items corresponding to the 

103 given keys from one or more existing dictionaries. 

104 

105 Parameters 

106 ---------- 

107 datasetTypes : `iterable` of `DatasetType` 

108 DatasetTypes to use as keys for the dict. Values will be obtained 

109 by lookups against ``first`` and ``rest``. 

110 first : `_DatasetDict` 

111 The first dictionary from which to extract values. 

112 rest 

113 Additional dictionaries from which to extract values. 

114 

115 Returns 

116 ------- 

117 dictionary : `_DatasetDict` 

118 A new dictionary instance. 

119 """ 

120 combined = ChainMap(first, *rest) 

121 return cls({datasetType: combined[datasetType] for datasetType in datasetTypes}, 

122 universe=first.universe) 

123 

124 @property 

125 def dimensions(self) -> DimensionGraph: 

126 """The union of all dimensions used by all dataset types in this 

127 dictionary, including implied dependencies (`DimensionGraph`). 

128 """ 

129 base = self.universe.empty 

130 if len(self) == 0: 

131 return base 

132 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

133 

134 def unpackSingleRefs(self) -> NamedKeyDict[DatasetType, DatasetRef]: 

135 """Unpack nested single-element `DatasetRef` dicts into a new 

136 mapping with `DatasetType` keys and `DatasetRef` values. 

137 

138 This method assumes that each nested dictionary contains exactly one 

139 item, as is the case for all "init" datasets. 

140 

141 Returns 

142 ------- 

143 dictionary : `NamedKeyDict` 

144 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

145 `DatasetType` instances and string names usable as keys. 

146 """ 

147 def getOne(refs: Dict[DataCoordinate, DatasetRef]) -> DatasetRef: 

148 ref, = refs.values() 

149 return ref 

150 return NamedKeyDict({datasetType: getOne(refs) for datasetType, refs in self.items()}) 

151 

152 def unpackMultiRefs(self) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

153 """Unpack nested multi-element `DatasetRef` dicts into a new 

154 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

155 

156 Returns 

157 ------- 

158 dictionary : `NamedKeyDict` 

159 Dictionary mapping `DatasetType` to a `list` of `DatasetRef`, with both 

160 `DatasetType` instances and string names usable as keys. 

161 """ 

162 return NamedKeyDict({datasetType: list(refs.values()) for datasetType, refs in self.items()}) 

163 

164 def extract(self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

165 ) -> Iterator[DatasetRef]: 

166 """Iterate over the contained `DatasetRef` instances that match the 

167 given `DatasetType` and data IDs. 

168 

169 Parameters 

170 ---------- 

171 datasetType : `DatasetType` 

172 Dataset type to match. 

173 dataIds : `Iterable` [ `DataCoordinate` ] 

174 Data IDs to match. 

175 

176 Returns 

177 ------- 

178 refs : `Iterator` [ `DatasetRef` ] 

179 DatasetRef instances for which ``ref.datasetType == datasetType`` 

180 and ``ref.dataId`` is in ``dataIds``. 

181 """ 

182 refs = self[datasetType] 

183 return (refs[dataId] for dataId in dataIds) 

184 

185 

186class _QuantumScaffolding: 

187 """Helper class aggregating information about a `Quantum`, used when 

188 constructing a `QuantumGraph`. 

189 

190 See `_PipelineScaffolding` for a top-down description of the full 

191 scaffolding data structure. 

192 

193 Parameters 

194 ---------- 

195 task : _TaskScaffolding 

196 Back-reference to the helper object for the `PipelineTask` this quantum 

197 represents an execution of. 

198 dataId : `DataCoordinate` 

199 Data ID for this quantum. 

200 """ 

201 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

202 self.task = task 

203 self.dataId = dataId 

204 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

205 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

206 self.prerequisites = _DatasetDict.fromDatasetTypes(task.prerequisites.keys(), 

207 universe=dataId.universe) 

208 

209 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

210 

211 def __repr__(self): 

212 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

213 

214 task: _TaskScaffolding 

215 """Back-reference to the helper object for the `PipelineTask` this quantum 

216 represents an execution of. 

217 """ 

218 

219 dataId: DataCoordinate 

220 """Data ID for this quantum. 

221 """ 

222 

223 inputs: _DatasetDict 

224 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

225 

226 This is initialized to map each `DatasetType` to an empty dictionary at 

227 construction. Those nested dictionaries are populated (with data IDs as 

228 keys) with unresolved `DatasetRef` instances in 

229 `_PipelineScaffolding.connectDataIds`. 

230 """ 

231 

232 outputs: _DatasetDict 

233 """Nested dictionary containing `DatasetRef` outputs this quantum. 

234 """ 

235 

236 prerequisites: _DatasetDict 

237 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

238 quantum. 

239 """ 

240 

241 def makeQuantum(self) -> Quantum: 

242 """Transform the scaffolding object into a true `Quantum` instance. 

243 

244 Returns 

245 ------- 

246 quantum : `Quantum` 

247 An actual `Quantum` instance. 

248 """ 

249 allInputs = self.inputs.unpackMultiRefs() 

250 allInputs.update(self.prerequisites.unpackMultiRefs()) 

251 # Give the task's Connections class an opportunity to remove some 

252 # inputs, or complain if they are unacceptable. 

253 # This will raise if one of the check conditions is not met, which is 

254 # the intended behavior. 

255 allInputs = self.task.taskDef.connections.adjustQuantum(allInputs) 

256 return Quantum( 

257 taskName=self.task.taskDef.taskName, 

258 taskClass=self.task.taskDef.taskClass, 

259 dataId=self.dataId, 

260 initInputs=self.task.initInputs.unpackSingleRefs(), 

261 predictedInputs=allInputs, 

262 outputs=self.outputs.unpackMultiRefs(), 

263 ) 

264 

265 

266@dataclass 

267class _TaskScaffolding: 

268 """Helper class aggregating information about a `PipelineTask`, used when 

269 constructing a `QuantumGraph`. 

270 

271 See `_PipelineScaffolding` for a top-down description of the full 

272 scaffolding data structure. 

273 

274 Parameters 

275 ---------- 

276 taskDef : `TaskDef` 

277 Data structure that identifies the task class and its config. 

278 parent : `_PipelineScaffolding` 

279 The parent data structure that will hold the instance being 

280 constructed. 

281 datasetTypes : `TaskDatasetTypes` 

282 Data structure that categorizes the dataset types used by this task. 

283 """ 

284 def __init__(self, taskDef: TaskDef, parent: _PipelineScaffolding, datasetTypes: TaskDatasetTypes): 

285 universe = parent.dimensions.universe 

286 self.taskDef = taskDef 

287 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

288 assert self.dimensions.issubset(parent.dimensions) 

289 # Initialize _DatasetDicts as subsets of the one or two 

290 # corresponding dicts in the parent _PipelineScaffolding. 

291 self.initInputs = _DatasetDict.fromSubset(datasetTypes.initInputs, parent.initInputs, 

292 parent.initIntermediates) 

293 self.initOutputs = _DatasetDict.fromSubset(datasetTypes.initOutputs, parent.initIntermediates, 

294 parent.initOutputs) 

295 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

296 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

297 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

298 self.dataIds = set() 

299 self.quanta = {} 

300 

301 def __repr__(self): 

302 # Default dataclass-injected __repr__ gets caught in an infinite loop 

303 # because of back-references. 

304 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

305 

306 taskDef: TaskDef 

307 """Data structure that identifies the task class and its config 

308 (`TaskDef`). 

309 """ 

310 

311 dimensions: DimensionGraph 

312 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

313 """ 

314 

315 initInputs: _DatasetDict 

316 """Dictionary containing information about datasets used to construct this 

317 task (`_DatasetDict`). 

318 """ 

319 

320 initOutputs: _DatasetDict 

321 """Dictionary containing information about datasets produced as a 

322 side-effect of constructing this task (`_DatasetDict`). 

323 """ 

324 

325 inputs: _DatasetDict 

326 """Dictionary containing information about datasets used as regular, 

327 graph-constraining inputs to this task (`_DatasetDict`). 

328 """ 

329 

330 outputs: _DatasetDict 

331 """Dictionary containing information about datasets produced by this task 

332 (`_DatasetDict`). 

333 """ 

334 

335 prerequisites: _DatasetDict 

336 """Dictionary containing information about input datasets that must be 

337 present in the repository before any Pipeline containing this task is run 

338 (`_DatasetDict`). 

339 """ 

340 

341 quanta: Dict[DataCoordinate, _QuantumScaffolding] 

342 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

343 this task with that data ID. 

344 """ 

345 

346 def makeQuantumGraphTaskNodes(self) -> QuantumGraphTaskNodes: 

347 """Create a `QuantumGraphTaskNodes` instance from the information in 

348 ``self``. 

349 

350 Returns 

351 ------- 

352 nodes : `QuantumGraphTaskNodes` 

353 The `QuantumGraph` elements corresponding to this task. 

354 """ 

355 return QuantumGraphTaskNodes( 

356 taskDef=self.taskDef, 

357 quanta=[q.makeQuantum() for q in self.quanta.values()], 

358 initInputs=self.initInputs.unpackSingleRefs(), 

359 initOutputs=self.initOutputs.unpackSingleRefs(), 

360 ) 

361 

362 

363@dataclass 

364class _PipelineScaffolding: 

365 """A helper data structure that organizes the information involved in 

366 constructing a `QuantumGraph` for a `Pipeline`. 

367 

368 Parameters 

369 ---------- 

370 pipeline : `Pipeline` 

371 Sequence of tasks from which a graph is to be constructed. Must 

372 have nested task classes already imported. 

373 universe : `DimensionUniverse` 

374 Universe of all possible dimensions. 

375 

376 Notes 

377 ----- 

378 The scaffolding data structure contains nested data structures for both 

379 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

380 data structures are shared between the pipeline-level structure (which 

381 aggregates all datasets and categorizes them from the perspective of the 

382 complete pipeline) and the individual tasks that use them as inputs and 

383 outputs. 

384 

385 `QuantumGraph` construction proceeds in four steps, with each corresponding 

386 to a different `_PipelineScaffolding` method: 

387 

388 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

389 the DatasetTypes used by the pipeline (delegating to 

390 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

391 nested `_TaskScaffolding` and `_DatasetDict` objects. 

392 

393 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

394 returns related tuples of all dimensions used to identify any regular 

395 input, output, and intermediate datasets (not prerequisites). We then 

396 iterate over these tuples of related dimensions, identifying the subsets 

397 that correspond to distinct data IDs for each task and dataset type, 

398 and then create `_QuantumScaffolding` objects. 

399 

400 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

401 dataset data IDs previously identified, transforming unresolved 

402 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

403 up prerequisite datasets for all quanta. 

404 

405 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

406 per-task `_QuantumScaffolding` objects. 
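
A rough sketch of how these steps are driven (this mirrors
`GraphBuilder.makeGraph` below; ``registry``, ``collections``, ``run``,
and ``userQuery`` are assumed to be supplied by the caller)::

    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    with scaffolding.connectDataIds(registry, collections, userQuery) as ids:
        scaffolding.resolveDatasetRefs(registry, collections, run, ids)
    graph = scaffolding.makeQuantumGraph()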

407 """ 

408 def __init__(self, pipeline, *, registry): 

409 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

410 self.tasks = [] 

411 # Aggregate and categorize the DatasetTypes in the Pipeline. 

412 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

413 # Construct dictionaries that map those DatasetTypes to structures 

414 # that will (later) hold additional information about them. 

415 for attr in ("initInputs", "initIntermediates", "initOutputs", 

416 "inputs", "intermediates", "outputs", "prerequisites"): 

417 setattr(self, attr, _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), 

418 universe=registry.dimensions)) 

419 # Aggregate all dimensions for all non-init, non-prerequisite 

420 # DatasetTypes. These are the ones we'll include in the big join query. 

421 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, 

422 self.outputs.dimensions) 

423 # Construct scaffolding nodes for each Task. Their _DatasetDicts are 

424 # built as subsets of the pipeline-level dicts above, so the nested 

425 # per-data-ID dictionaries for each DatasetType are shared between 

426 # _PipelineScaffolding and all _TaskScaffoldings that reference them. 

427 if isinstance(pipeline, Pipeline): 

428 pipeline = pipeline.toExpandedPipeline() 

429 self.tasks = [_TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

430 for taskDef, taskDatasetTypes in zip(pipeline, 

431 datasetTypes.byTask.values())] 

432 

433 def __repr__(self): 

434 # Default dataclass-injected __repr__ gets caught in an infinite loop 

435 # because of back-references. 

436 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

437 

438 tasks: List[_TaskScaffolding] 

439 """Scaffolding data structures for each task in the pipeline 

440 (`list` of `_TaskScaffolding`). 

441 """ 

442 

443 initInputs: _DatasetDict 

444 """Datasets consumed but not produced when constructing the tasks in this 

445 pipeline (`_DatasetDict`). 

446 """ 

447 

448 initIntermediates: _DatasetDict 

449 """Datasets that are both consumed and produced when constructing the tasks 

450 in this pipeline (`_DatasetDict`). 

451 """ 

452 

453 initOutputs: _DatasetDict 

454 """Datasets produced but not consumed when constructing the tasks in this 

455 pipeline (`_DatasetDict`). 

456 """ 

457 

458 inputs: _DatasetDict 

459 """Datasets that are consumed but not produced when running this pipeline 

460 (`_DatasetDict`). 

461 """ 

462 

463 intermediates: _DatasetDict 

464 """Datasets that are both produced and consumed when running this pipeline 

465 (`_DatasetDict`). 

466 """ 

467 

468 outputs: _DatasetDict 

469 """Datasets produced but not consumed when when running this pipeline 

470 (`_DatasetDict`). 

471 """ 

472 

473 prerequisites: _DatasetDict 

474 """Datasets that are consumed when running this pipeline and looked up 

475 per-Quantum when generating the graph (`_DatasetDict`). 

476 """ 

477 

478 dimensions: DimensionGraph 

479 """All dimensions used by any regular input, intermediate, or output 

480 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

481 Query" (`DimensionGraph`). 

482 

483 This is required to be a superset of all task quantum dimensions. 

484 """ 

485 

486 @contextmanager 

487 def connectDataIds(self, registry, collections, userQuery): 

488 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

489 

490 This method populates `_TaskScaffolding.quanta` and the nested data ID 

491 dictionaries of each `_DatasetDict` (except for those in `prerequisites`). 

492 

493 Parameters 

494 ---------- 

495 registry : `lsst.daf.butler.Registry` 

496 Registry for the data repository; used for all data ID queries. 

497 collections : `lsst.daf.butler.CollectionSearch` 

498 Object representing the collections to search for input datasets. 

499 userQuery : `str`, optional 

500 User-provided expression to limit the data IDs processed. 

501 

502 Returns 

503 ------- 

504 commonDataIds : \ 

505 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

506 An interface to a database temporary table containing all data IDs 

507 that will appear in this `QuantumGraph`. Returned inside a 

508 context manager, which will drop the temporary table at the end of 

509 the `with` block in which this method is called. 

510 """ 

511 _LOG.debug("Building query for data IDs.") 

512 # Initialization datasets always have empty data IDs. 

513 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

514 for datasetType, refs in itertools.chain(self.initInputs.items(), 

515 self.initIntermediates.items(), 

516 self.initOutputs.items()): 

517 refs[emptyDataId] = DatasetRef(datasetType, emptyDataId) 

518 # Run one big query for the data IDs for task dimensions and regular 

519 # inputs and outputs. We limit the query to only dimensions that are 

520 # associated with the input dataset types, but don't (yet) try to 

521 # obtain the dataset_ids for those inputs. 

522 _LOG.debug("Submitting data ID query and materializing results.") 

523 with registry.queryDataIds(self.dimensions, 

524 datasets=list(self.inputs), 

525 collections=collections, 

526 where=userQuery, 

527 ).materialize() as commonDataIds: 

528 _LOG.debug("Expanding data IDs.") 

529 commonDataIds = commonDataIds.expanded() 

530 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

531 # Iterate over query results, populating data IDs for datasets and 

532 # quanta and then connecting them to each other. 

533 n = 0 

534 for n, commonDataId in enumerate(commonDataIds, start=1): 

535 # Create DatasetRefs for all DatasetTypes from this result row, 

536 # noting that we might have created some already. 

537 # We remember both those that already existed and those that we 

538 # create now. 

539 refsForRow = {} 

540 for datasetType, refs in itertools.chain(self.inputs.items(), self.intermediates.items(), 

541 self.outputs.items()): 

542 datasetDataId = commonDataId.subset(datasetType.dimensions) 

543 ref = refs.get(datasetDataId) 

544 if ref is None: 

545 ref = DatasetRef(datasetType, datasetDataId) 

546 refs[datasetDataId] = ref 

547 refsForRow[datasetType.name] = ref 

548 # Create _QuantumScaffolding objects for all tasks from this result 

549 # row, noting that we might have created some already. 

550 for task in self.tasks: 

551 quantumDataId = commonDataId.subset(task.dimensions) 

552 quantum = task.quanta.get(quantumDataId) 

553 if quantum is None: 

554 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

555 task.quanta[quantumDataId] = quantum 

556 # Whether this is a new quantum or an existing one, we can now 

557 # associate the DatasetRefs for this row with it. The fact that 

558 # a Quantum data ID and a dataset data ID both came from the 

559 # same result row is what tells us they should be associated. 

560 # Many of these associations will be duplicates (because another 

561 # query row that differed from this one only in irrelevant 

562 # dimensions already added them); the dictionary assignments 

563 # below simply overwrite those. 

564 for datasetType in task.inputs: 

565 ref = refsForRow[datasetType.name] 

566 quantum.inputs[datasetType.name][ref.dataId] = ref 

567 for datasetType in task.outputs: 

568 ref = refsForRow[datasetType.name] 

569 quantum.outputs[datasetType.name][ref.dataId] = ref 

570 _LOG.debug("Finished processing %d rows from data ID query.", n) 

571 yield commonDataIds 

572 

573 def resolveDatasetRefs(self, registry, collections, run, commonDataIds, *, skipExisting=True): 

574 """Perform follow up queries for each dataset data ID produced in 

575 `fillDataIds`. 

576 

577 This method resolves the `DatasetRef` entries in the nested 

578 `_DatasetDict` dictionaries and looks up prerequisite inputs per quantum. 

579 

580 Parameters 

581 ---------- 

582 registry : `lsst.daf.butler.Registry` 

583 Registry for the data repository; used for all data ID queries. 

584 collections : `lsst.daf.butler.CollectionSearch` 

585 Object representing the collections to search for input datasets. 

586 run : `str`, optional 

587 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

588 output datasets, if it already exists. 

589 commonDataIds : \ 

590 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

591 Result of a previous call to `connectDataIds`. 

592 skipExisting : `bool`, optional 

593 If `True` (default), a Quantum is not created if all its outputs 

594 already exist in ``run``. Ignored if ``run`` is `None`. 

595 

596 Raises 

597 ------ 

598 OutputExistsError 

599 Raised if an output dataset already exists in the output run 

600 and ``skipExisting`` is `False`. The case where some but not all 

601 of a quantum's outputs are present and ``skipExisting`` is `True` 

602 cannot be identified at this stage, and is handled later in this 

603 method when each quantum's outputs are checked. 

604 """ 

605 # Look up [init] intermediate and output datasets in the output 

606 # collection, if there is an output collection. 

607 if run is not None: 

608 for datasetType, refs in itertools.chain(self.initIntermediates.items(), 

609 self.initOutputs.items(), 

610 self.intermediates.items(), 

611 self.outputs.items()): 

612 _LOG.debug("Resolving %d datasets for intermediate and/or output dataset %s.", 

613 len(refs), datasetType.name) 

614 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

615 resolvedRefQueryResults = commonDataIds.subset( 

616 datasetType.dimensions, 

617 unique=True 

618 ).findDatasets( 

619 datasetType, 

620 collections=run, 

621 deduplicate=True 

622 ) 

623 for resolvedRef in resolvedRefQueryResults: 

624 # TODO: we could easily support per-DatasetType 

625 # skipExisting and I could imagine that being useful - it's 

626 # probably required in order to support writing initOutputs 

627 # before QuantumGraph generation. 

628 assert resolvedRef.dataId in refs 

629 if skipExisting or isInit: 

630 refs[resolvedRef.dataId] = resolvedRef 

631 else: 

632 raise OutputExistsError(f"Output dataset {datasetType.name} already exists in " 

633 f"output RUN collection '{run}' with data ID" 

634 f" {resolvedRef.dataId}.") 

635 # Look up input and initInput datasets in the input collection(s). 

636 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

637 _LOG.debug("Resolving %d datasets for input dataset %s.", len(refs), datasetType.name) 

638 resolvedRefQueryResults = commonDataIds.subset( 

639 datasetType.dimensions, 

640 unique=True 

641 ).findDatasets( 

642 datasetType, 

643 collections=collections, 

644 deduplicate=True 

645 ) 

646 dataIdsNotFoundYet = set(refs.keys()) 

647 for resolvedRef in resolvedRefQueryResults: 

648 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

649 refs[resolvedRef.dataId] = resolvedRef 

650 if dataIdsNotFoundYet: 

651 raise RuntimeError( 

652 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

653 f"'{datasetType.name}' was/were present in a previous " 

654 f"query, but could not be found now." 

655 f"This is either a logic bug in QuantumGraph generation " 

656 f"or the input collections have been modified since " 

657 f"QuantumGraph generation began." 

658 ) 

659 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

660 # replacing the unresolved refs there, and then look up prerequisites. 

661 for task in self.tasks: 

662 _LOG.debug( 

663 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

664 len(task.quanta), 

665 task.taskDef.label 

666 ) 

667 lookupFunctions = { 

668 c.name: c.lookupFunction 

669 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

670 if c.lookupFunction is not None 

671 } 

672 dataIdsToSkip = [] 

673 for quantum in task.quanta.values(): 

674 # Process output datasets only if there is a run to look for 

675 # outputs in and skipExisting is True. Note that if skipExisting 

676 # is False, any output datasets that already exist would have 

677 # already caused an exception to be raised. 

678 # We never update the output DatasetRefs held by the quantum 

679 # itself, because those should remain unresolved. 

680 if run is not None and skipExisting: 

681 resolvedRefs = [] 

682 unresolvedRefs = [] 

683 for datasetType, originalRefs in quantum.outputs.items(): 

684 for ref in task.outputs.extract(datasetType, originalRefs.keys()): 

685 if ref.id is not None: 

686 resolvedRefs.append(ref) 

687 else: 

688 unresolvedRefs.append(ref) 

689 if resolvedRefs: 

690 if unresolvedRefs: 

691 raise OutputExistsError( 

692 f"Quantum {quantum.dataId} of task with label " 

693 f"'{quantum.taskDef.label}' has some outputs that exist ({resolvedRefs}) " 

694 f"and others that don't ({unresolvedRefs})." 

695 ) 

696 else: 

697 # All outputs are already present; skip this 

698 # quantum and continue to the next. 

699 dataIdsToSkip.append(quantum.dataId) 

700 continue 

701 # Update the input DatasetRefs to the resolved ones we already 

702 # searched for. 

703 for datasetType, refs in quantum.inputs.items(): 

704 for ref in task.inputs.extract(datasetType, refs.keys()): 

705 refs[ref.dataId] = ref 

706 # Look up prerequisite datasets in the input collection(s). 

707 # These may have dimensions that extend beyond those we queried 

708 # for originally, because we want to permit those data ID 

709 # values to differ across quanta and dataset types. 

710 # For example, the same quantum may have a flat and bias with 

711 # a different calibration_label, or a refcat with a skypix 

712 # value that overlaps the quantum's data ID's region, but not 

713 # the user expression used for the initial query. 

714 for datasetType in task.prerequisites: 

715 lookupFunction = lookupFunctions.get(datasetType.name) 

716 if lookupFunction is not None: 

717 refs = list( 

718 lookupFunction(datasetType, registry, quantum.dataId, collections) 

719 ) 

720 else: 

721 refs = list(registry.queryDatasets(datasetType, 

722 collections=collections, 

723 dataId=quantum.dataId, 

724 deduplicate=True).expanded()) 

725 quantum.prerequisites[datasetType].update({ref.dataId: ref for ref in refs}) 

726 # Actually remove any quanta that we decided to skip above. 

727 if dataIdsToSkip: 

728 _LOG.debug("Pruning %d quanta for task with label '%s' because all of their outputs exist.", 

729 len(dataIdsToSkip), task.taskDef.label) 

730 for dataId in dataIdsToSkip: 

731 del task.quanta[dataId] 

732 

733 def makeQuantumGraph(self): 

734 """Create a `QuantumGraph` from the quanta already present in 

735 the scaffolding data structure. 

736 

737 Returns 

738 ------- 

739 graph : `QuantumGraph` 

740 The full `QuantumGraph`. 

741 """ 

742 graph = QuantumGraph(task.makeQuantumGraphTaskNodes() for task in self.tasks) 

743 graph.initInputs = self.initInputs.unpackSingleRefs() 

744 graph.initOutputs = self.initOutputs.unpackSingleRefs() 

745 graph.initIntermediates = self.initIntermediates.unpackSingleRefs() 

746 return graph 

747 

748 

749class _InstrumentFinder(TreeVisitor): 

750 """Implementation of TreeVisitor which looks for instrument name 

751 

752 The instrument should be specified as a boolean expression of the form 

753 

754 instrument = 'string' 

755 'string' = instrument 

756 

757 so we only need to find a binary operator where the operator is "=", 

758 one side is a string literal, and the other side is an identifier. 

759 All visit methods return a tuple of (type, value); nodes that are not 

760 useful return `None` for both type and value. 

761 """ 

762 def __init__(self): 

763 self.instruments = [] 

764 

765 def visitNumericLiteral(self, value, node): 

766 # do not care about numbers 

767 return (None, None) 

768 

769 def visitStringLiteral(self, value, node): 

770 # return type and value 

771 return ("str", value) 

772 

773 def visitTimeLiteral(self, value, node): 

774 # do not care about these 

775 return (None, None) 

776 

777 def visitRangeLiteral(self, start, stop, stride, node): 

778 # do not care about these 

779 return (None, None) 

780 

781 def visitIdentifier(self, name, node): 

782 if name.lower() == "instrument": 

783 return ("id", "instrument") 

784 return (None, None) 

785 

786 def visitUnaryOp(self, operator, operand, node): 

787 # do not care about these 

788 return (None, None) 

789 

790 def visitBinaryOp(self, operator, lhs, rhs, node): 

791 if operator == "=": 

792 if lhs == ("id", "instrument") and rhs[0] == "str": 

793 self.instruments.append(rhs[1]) 

794 elif rhs == ("id", "instrument") and lhs[0] == "str": 

795 self.instruments.append(lhs[1]) 

796 return (None, None) 

797 

798 def visitIsIn(self, lhs, values, not_in, node): 

799 # do not care about these 

800 return (None, None) 

801 

802 def visitParens(self, expression, node): 

803 # do not care about these 

804 return (None, None) 

805 

806 

807def _findInstruments(queryStr): 

808 """Get the names of any instrument named in the query string by searching 

809 for "instrument = <value>" and similar patterns. 

810 

811 Parameters 

812 ---------- 

813 queryStr : `str` or None 

814 The query string to search, or None if there is no query. 

815 

816 Returns 

817 ------- 

818 instruments : `list` [`str`] 

819 The list of instrument names found in the query. 

820 

821 Raises 

822 ------ 

823 ValueError 

824 If the query expression cannot be parsed. 
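
Examples
--------
A minimal illustration (the query expression here is invented)::

    >>> _findInstruments("instrument = 'HSC' AND visit > 100")
    ['HSC']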

825 """ 

826 if not queryStr: 

827 return [] 

828 parser = ParserYacc() 

829 finder = _InstrumentFinder() 

830 try: 

831 tree = parser.parse(queryStr) 

832 except ParseError as exc: 

833 raise ValueError(f"failed to parse query expression: {queryStr}") from exc 

834 tree.visit(finder) 

835 return finder.instruments 

836 

837 

838# ------------------------ 

839# Exported definitions -- 

840# ------------------------ 

841 

842 

843class GraphBuilderError(Exception): 

844 """Base class for exceptions generated by graph builder. 

845 """ 

846 pass 

847 

848 

849class OutputExistsError(GraphBuilderError): 

850 """Exception generated when output datasets already exist. 

851 """ 

852 pass 

853 

854 

855class PrerequisiteMissingError(GraphBuilderError): 

856 """Exception generated when a prerequisite dataset does not exist. 

857 """ 

858 pass 

859 

860 

861class GraphBuilder(object): 

862 """GraphBuilder class is responsible for building task execution graph from 

863 a Pipeline. 

864 

865 Parameters 

866 ---------- 

867 registry : `~lsst.daf.butler.Registry` 

868 Registry for the data repository; used for all data ID queries. 

869 skipExisting : `bool`, optional 

870 If `True` (default), a Quantum is not created if all its outputs 

871 already exist. 
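
Examples
--------
A sketch of typical use (``butler``, ``pipeline``, ``collections``, and
``run`` are assumed to be set up by the caller)::

    builder = GraphBuilder(butler.registry, skipExisting=True)
    graph = builder.makeGraph(pipeline, collections, run,
                              userQuery="instrument = 'HSC'")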

872 """ 

873 

874 def __init__(self, registry, skipExisting=True): 

875 self.registry = registry 

876 self.dimensions = registry.dimensions 

877 self.skipExisting = skipExisting 

878 

879 def makeGraph(self, pipeline, collections, run, userQuery): 

880 """Create execution graph for a pipeline. 

881 

882 Parameters 

883 ---------- 

884 pipeline : `Pipeline` 

885 Pipeline definition, task names/classes and their configs. 

886 collections : `lsst.daf.butler.CollectionSearch` 

887 Object representing the collections to search for input datasets. 

888 run : `str`, optional 

889 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

890 output datasets, if it already exists. 

891 userQuery : `str` 

892 User-provided expression that restricts the data IDs processed; may be 

893 empty or `None` if there are no restrictions on data selection. 

894 

895 Returns 

896 ------- 

897 graph : `QuantumGraph` 

898 

899 Raises 

900 ------ 

901 UserExpressionError 

902 Raised when user expression cannot be parsed. 

903 OutputExistsError 

904 Raised when output datasets already exist. 

905 Exception 

906 Other exceptions types may be raised by underlying registry 

907 classes. 

908 """ 

909 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

910 

911 instrument = pipeline.getInstrument() 

912 if isinstance(instrument, str): 

913 instrument = doImport(instrument) 

914 instrumentName = instrument.getName() if instrument else None 

915 userQuery = self._verifyInstrumentRestriction(instrumentName, userQuery) 

916 

917 with scaffolding.connectDataIds(self.registry, collections, userQuery) as commonDataIds: 

918 scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds, 

919 skipExisting=self.skipExisting) 

920 return scaffolding.makeQuantumGraph() 

921 

922 @staticmethod 

923 def _verifyInstrumentRestriction(instrumentName, query): 

924 """Add an instrument restriction to the query if it does not have one, 

925 and verify that any instrument already named in the query matches the 

926 instrument named by the pipeline. 

927 

928 Parameters 

929 ---------- 

930 instrumentName : `str` 

931 The name of the instrument that should appear in the query. 

932 query : `str` 

933 The query string. 

934 

935 Returns 

936 ------- 

937 query : `str` 

938 The query string with the instrument added to it if needed. 

939 

940 Raises 

941 ------ 

942 RuntimeError 

943 If the pipeline names an instrument and the query contains more 

944 than one instrument or the name of the instrument in the query does 

945 not match the instrument named by the pipeline. 
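
Examples
--------
Illustrative cases (the instrument and query values are invented)::

    >>> GraphBuilder._verifyInstrumentRestriction("HSC", "visit > 100")
    "instrument = 'HSC' AND (visit > 100)"
    >>> GraphBuilder._verifyInstrumentRestriction("HSC", "")
    "instrument = 'HSC'"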

946 """ 

947 if not instrumentName: 

948 return query 

949 queryInstruments = _findInstruments(query) 

950 if len(queryInstruments) > 1: 

951 raise RuntimeError(f"When the pipeline has an instrument (\"{instrumentName}\") the query must " 

952 "have zero instruments or one instrument that matches the pipeline. " 

953 f"Found these instruments in the query: {queryInstruments}.") 

954 if not queryInstruments: 

955 # There is no instrument in the query; add it. 

956 restriction = f"instrument = '{instrumentName}'" 

957 _LOG.debug('Adding restriction "%s" to query.', restriction) 

958 query = f"{restriction} AND ({query})" if query else restriction # (there may not be a query) 

959 elif queryInstruments[0] != instrumentName: 

960 # Since there is an instrument in the query, it should match 

961 # the instrument in the pipeline. 

962 raise RuntimeError(f"The instrument named in the query (\"{queryInstruments[0]}\") does not " 

963 f"match the instrument named by the pipeline (\"{instrumentName}\")") 

964 return query