Coverage for python/lsst/pipe/base/graphBuilder.py: 15%

554 statements  

coverage.py v6.5.0, created at 2023-06-08 09:15 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap, defaultdict 

34from collections.abc import Collection, Iterable, Iterator, Mapping 

35from contextlib import contextmanager 

36from dataclasses import dataclass 

37from typing import Any, Optional 

38 

39from lsst.daf.butler import ( 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 NamedValueSet, 

50 Quantum, 

51 Registry, 

52) 

53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

55from lsst.daf.butler.registry.wildcards import CollectionWildcard 

56from lsst.utils import doImportType 

57 

58# ----------------------------- 

59# Imports for other modules -- 

60# ----------------------------- 

61from . import automatic_connection_constants as acc 

62from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

63from ._status import NoWorkFound 

64from .connections import AdjustQuantumHelper, iterConnections 

65from .graph import QuantumGraph 

66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

67 

68# ---------------------------------- 

69# Local non-exported definitions -- 

70# ---------------------------------- 

71 

72_LOG = logging.getLogger(__name__) 

73 

74 

75@dataclass 

76class _RefHolder: 

77 """Placeholder for `DatasetRef` representing a future resolved reference. 

78 

79 Since unresolved DatasetRefs have been eliminated, `None` is now used to 

80 represent a reference that has not yet been resolved. Information about the 

81 corresponding dataset type and data coordinate is stored in the `_DatasetDict` mapping. 

82 """ 

83 

84 dataset_type: DatasetType 

85 """Dataset type of the dataset to be created later. I need to store it here 

86 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared 

87 between different compatible dataset types.""" 

88 

89 ref: DatasetRef | None = None 

90 """Dataset reference, initially `None`, created when all datasets are 

91 resolved. 

92 """ 

93 

94 @property 

95 def resolved_ref(self) -> DatasetRef: 

96 """Access resolved reference, should only be called after the 

97 reference is set (`DatasetRef`).""" 

98 assert self.ref is not None, "Dataset reference is not set." 

99 return self.ref 

100 
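# Illustrative sketch (not part of the original module): the _RefHolder
# lifecycle described above.  ``dataset_type``, ``data_id`` and ``run`` are
# assumed to come from an existing data repository; the function name is an
# example only.
def _example_ref_holder_lifecycle(
    dataset_type: DatasetType, data_id: DataCoordinate, run: str
) -> DatasetRef:
    holder = _RefHolder(dataset_type)  # unresolved: holder.ref is None
    # ... later, once the dataset can be resolved against a RUN collection:
    holder.ref = DatasetRef(dataset_type, data_id, run=run, conform=False)
    return holder.resolved_ref  # safe to access only after ``ref`` is set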

101 

102class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]): 

103 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

104 the known `DatasetRef` instances of that type. 

105 

106 Parameters 

107 ---------- 

108 args 

109 Positional arguments are forwarded to the `dict` constructor. 

110 universe : `DimensionUniverse` 

111 Universe of all possible dimensions. 

112 """ 

113 

114 def __init__(self, *args: Any, universe: DimensionUniverse): 

115 super().__init__(*args) 

116 self.universe = universe 

117 

118 @classmethod 

119 def fromDatasetTypes( 

120 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

121 ) -> _DatasetDict: 

122 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

123 

124 Parameters 

125 ---------- 

126 datasetTypes : `iterable` of `DatasetType` 

127 DatasetTypes to use as keys for the dict. Values will be empty 

128 dictionaries. 

129 universe : `DimensionUniverse` 

130 Universe of all possible dimensions. 

131 

132 Returns 

133 ------- 

134 dictionary : `_DatasetDict` 

135 A new `_DatasetDict` instance. 

136 """ 

137 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

138 

139 @classmethod 

140 def fromSubset( 

141 cls, 

142 datasetTypes: Collection[DatasetType], 

143 first: _DatasetDict, 

144 *rest: _DatasetDict, 

145 ) -> _DatasetDict: 

146 """Return a new dictionary by extracting items corresponding to the 

147 given keys from one or more existing dictionaries. 

148 

149 Parameters 

150 ---------- 

151 datasetTypes : `iterable` of `DatasetType` 

152 DatasetTypes to use as keys for the dict. Values will be obtained 

153 by lookups against ``first`` and ``rest``. 

154 first : `_DatasetDict` 

155 Another dictionary from which to extract values. 

156 rest 

157 Additional dictionaries from which to extract values. 

158 

159 Returns 

160 ------- 

161 dictionary : `_DatasetDict` 

162 A new dictionary instance. 

163 """ 

164 combined = ChainMap(first, *rest) 

165 

166 # Dataset types known to match immediately can be processed 

167 # without checks. 

168 matches = combined.keys() & set(datasetTypes) 

169 _dict = {k: combined[k] for k in matches} 

170 

171 if len(_dict) < len(datasetTypes): 

172 # Work out which ones are missing. 

173 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

174 

175 # Get the known names for comparison. 

176 combined_by_name = {k.name: k for k in combined} 

177 

178 missing = set() 

179 incompatible = {} 

180 for datasetType in missing_datasetTypes: 

181 # The dataset type is not found. It may not be listed 

182 # or it may be that it is there with the same name 

183 # but different definition. 

184 if datasetType.name in combined_by_name: 

185 # This implies some inconsistency in definitions 

186 # for connections. If there is support for storage 

187 # class conversion we can let it slide. 

188 # At this point we do not know 

189 # where the inconsistency is but trust that down 

190 # stream code will be more explicit about input 

191 # vs output incompatibilities. 

192 existing = combined_by_name[datasetType.name] 

193 convertible_to_existing = existing.is_compatible_with(datasetType) 

194 convertible_from_existing = datasetType.is_compatible_with(existing) 

195 if convertible_to_existing and convertible_from_existing: 

196 _LOG.debug( 

197 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

198 datasetType.name, 

199 datasetType.storageClass_name, 

200 existing.storageClass_name, 

201 ) 

202 _dict[datasetType] = combined[existing] 

203 elif convertible_to_existing or convertible_from_existing: 

204 # We'd need to refactor a fair amount to recognize 

205 # whether this is an error or not, so I'm not going to 

206 # bother until we need to do that for other reasons 

207 # (it won't be too long). 

208 _LOG.info( 

209 "Dataset type %s is present with multiple only partially-compatible storage " 

210 "classes %s and %s.", 

211 datasetType.name, 

212 datasetType.storageClass_name, 

213 existing.storageClass_name, 

214 ) 

215 _dict[datasetType] = combined[existing] 

216 else: 

217 incompatible[datasetType] = existing 

218 else: 

219 missing.add(datasetType) 

220 

221 if missing or incompatible: 

222 reasons = [] 

223 if missing: 

224 reasons.append( 

225 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known " 

226 f"types: [{', '.join(d.name for d in combined)}]." 

227 ) 

228 if incompatible: 

229 for x, y in incompatible.items(): 

230 reasons.append(f"{x} incompatible with {y}") 

231 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

232 

233 return cls(_dict, universe=first.universe) 

234 

235 @property 

236 def dimensions(self) -> DimensionGraph: 

237 """The union of all dimensions used by all dataset types in this 

238 dictionary, including implied dependencies (`DimensionGraph`). 

239 """ 

240 base = self.universe.empty 

241 if len(self) == 0: 

242 return base 

243 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

244 

245 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]: 

246 """Unpack nested single-element `DatasetRef` dicts into a new 

247 mapping with `DatasetType` keys and `DatasetRef` values. 

248 

249 This method assumes that each nested dictionary contains exactly one item, as is the 

250 case for all "init" datasets. 

251 

252 Parameters 

253 ---------- 

254 storage_classes : `dict` [ `str`, `str` ] 

255 Mapping from dataset type name to the storage class to use for that 

256 dataset type. These are typically the storage classes declared 

257 for a particular task, which may differ from the data repository 

258 definitions. 

259 

260 Returns 

261 ------- 

262 dictionary : `NamedKeyDict` 

263 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

264 `DatasetType` instances and string names usable as keys. 

265 """ 

266 return NamedKeyDict( 

267 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()} 

268 ) 

269 

270 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

271 """Unpack nested multi-element `DatasetRef` dicts into a new 

272 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

273 

274 Parameters 

275 ---------- 

276 storage_classes : `dict` [ `str`, `str` ] 

277 Mapping from dataset type name to the storage class to use for that 

278 dataset type. These are typically the storage classes declared 

279 for a particular task, which may differ from the data repository 

280 definitions. 

281 

282 Returns 

283 ------- 

284 dictionary : `NamedKeyDict` 

285 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

286 both `DatasetType` instances and string names usable as keys. 

287 """ 

288 result = {} 

289 for dataset_type, holders in self.items(): 

290 if ( 

291 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name) 

292 ) != dataset_type.storageClass_name: 

293 dataset_type = dataset_type.overrideStorageClass(override) 

294 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()] 

295 else: 

296 refs = [holder.resolved_ref for holder in holders.values()] 

297 result[dataset_type] = refs 

298 return NamedKeyDict(result) 

299 

300 def extract( 

301 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

302 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]: 

303 """Iterate over the contained `DatasetRef` instances that match the 

304 given `DatasetType` and data IDs. 

305 

306 Parameters 

307 ---------- 

308 datasetType : `DatasetType` 

309 Dataset type to match. 

310 dataIds : `Iterable` [ `DataCoordinate` ] 

311 Data IDs to match. 

312 

313 Returns 

314 ------- 

315 refs : `Iterator` [ `tuple` [ `DataCoordinate`, `DatasetRef` or `None` ] ] 

316 Tuples of data ID and the matching `DatasetRef` (or `None` if it has 

317 not been resolved yet), for ``ref.datasetType == datasetType`` and data IDs in ``dataIds``. 

318 """ 

319 refs = self[datasetType] 

320 return ((dataId, refs[dataId].ref) for dataId in dataIds) 

321 

322 def isdisjoint(self, other: _DatasetDict) -> bool: 

323 """Test whether ``self`` and ``other`` have any datasets in common. 

324 

325 Datasets are considered in common if they have the same *parent* 

326 dataset type name and data ID; storage classes and components are not 

327 considered. 

328 """ 

329 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()} 

330 for k, v in other.items(): 

331 parent_name, _ = k.nameAndComponent() 

332 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()): 

333 return False 

334 return True 

335 

336 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

337 """Iterate over all DatasetRef instances held by this data structure, 

338 assuming that each `_RefHolder` already carries a resolved ref. 

339 """ 

340 for holders_by_data_id in self.values(): 

341 for holder in holders_by_data_id.values(): 

342 yield holder.resolved_ref 

343 

344 
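# Illustrative sketch (not part of the original module): building a
# _DatasetDict for a set of dataset types and slicing out a task-level
# subset, as _TaskScaffolding does with the parent _PipelineScaffolding
# dictionaries.  ``universe``, ``all_types``, ``task_types`` and
# ``storage_classes`` are assumed inputs.
def _example_dataset_dict_usage(
    universe: DimensionUniverse,
    all_types: Iterable[DatasetType],
    task_types: Collection[DatasetType],
    storage_classes: dict[str, str],
) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
    parent = _DatasetDict.fromDatasetTypes(all_types, universe=universe)
    # The nested per-data-ID dicts are shared between ``parent`` and the
    # subset, so refs resolved through one are visible through the other.
    subset = _DatasetDict.fromSubset(task_types, parent)
    # Once every _RefHolder has been resolved, unpack to plain DatasetRefs,
    # applying any task-declared storage class overrides.
    return subset.unpackMultiRefs(storage_classes)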

345class _QuantumScaffolding: 

346 """Helper class aggregating information about a `Quantum`, used when 

347 constructing a `QuantumGraph`. 

348 

349 See `_PipelineScaffolding` for a top-down description of the full 

350 scaffolding data structure. 

351 

352 Parameters 

353 ---------- 

354 task : _TaskScaffolding 

355 Back-reference to the helper object for the `PipelineTask` this quantum 

356 represents an execution of. 

357 dataId : `DataCoordinate` 

358 Data ID for this quantum. 

359 """ 

360 

361 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

362 self.task = task 

363 self.dataId = dataId 

364 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

365 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

366 self.prerequisites = _DatasetDict.fromDatasetTypes( 

367 task.prerequisites.keys(), universe=dataId.universe 

368 ) 

369 

370 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

371 

372 def __repr__(self) -> str: 

373 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

374 

375 task: _TaskScaffolding 

376 """Back-reference to the helper object for the `PipelineTask` this quantum 

377 represents an execution of. 

378 """ 

379 

380 dataId: DataCoordinate 

381 """Data ID for this quantum. 

382 """ 

383 

384 inputs: _DatasetDict 

385 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

386 

387 This is initialized to map each `DatasetType` to an empty dictionary at 

388 construction. Those nested dictionaries are populated (with data IDs as 

389 keys) with unresolved `DatasetRef` instances in 

390 `_PipelineScaffolding.connectDataIds`. 

391 """ 

392 

393 outputs: _DatasetDict 

394 """Nested dictionary containing `DatasetRef` outputs this quantum. 

395 """ 

396 

397 prerequisites: _DatasetDict 

398 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

399 quantum. 

400 """ 

401 

402 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

403 """Transform the scaffolding object into a true `Quantum` instance. 

404 

405 Parameters 

406 ---------- 

407 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

408 If not `None` then fill datastore records in each generated Quantum 

409 using the records from this structure. 

410 

411 Returns 

412 ------- 

413 quantum : `Quantum` 

414 An actual `Quantum` instance. 

415 """ 

416 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes) 

417 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes)) 

418 # Give the task's Connections class an opportunity to remove some 

419 # inputs, or complain if they are unacceptable. 

420 # This will raise if one of the check conditions is not met, which is 

421 # the intended behavior. 

422 # If it raises NoWorkFound, there is a bug in the QG algorithm 

423 # or the adjustQuantum is incorrectly trying to make a prerequisite 

424 # input behave like a regular input; adjustQuantum should only raise 

425 # NoWorkFound if a regular input is missing, and it shouldn't be 

426 # possible for us to have generated ``self`` if that's true. 

427 helper = AdjustQuantumHelper( 

428 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes) 

429 ) 

430 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

431 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes) 

432 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

433 if datastore_records is not None: 

434 quantum_records = {} 

435 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

436 input_refs += list(initInputs.values()) 

437 input_ids = set(ref.id for ref in input_refs) 

438 for datastore_name, records in datastore_records.items(): 

439 matching_records = records.subset(input_ids) 

440 if matching_records is not None: 

441 quantum_records[datastore_name] = matching_records 

442 return Quantum( 

443 taskName=self.task.taskDef.taskName, 

444 taskClass=self.task.taskDef.taskClass, 

445 dataId=self.dataId, 

446 initInputs=initInputs, 

447 inputs=helper.inputs, 

448 outputs=helper.outputs, 

449 datastore_records=quantum_records, 

450 ) 

451 
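# Illustrative sketch (not part of the original module): the record-filtering
# step makeQuantum performs when attaching datastore records to a Quantum.
# ``refs`` and ``datastore_records`` are assumed to come from a Datastore
# export, as in makeQuantumGraph further below.
def _example_subset_datastore_records(
    refs: Iterable[DatasetRef],
    datastore_records: Mapping[str, DatastoreRecordData],
) -> dict[str, DatastoreRecordData]:
    input_ids = {ref.id for ref in refs}
    matching: dict[str, DatastoreRecordData] = {}
    for datastore_name, records in datastore_records.items():
        # ``subset`` returns `None` when no records match these dataset IDs.
        if (records_subset := records.subset(input_ids)) is not None:
            matching[datastore_name] = records_subset
    return matching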

452 

453@dataclass 

454class _TaskScaffolding: 

455 """Helper class aggregating information about a `PipelineTask`, used when 

456 constructing a `QuantumGraph`. 

457 

458 See `_PipelineScaffolding` for a top-down description of the full 

459 scaffolding data structure. 

460 

461 Parameters 

462 ---------- 

463 taskDef : `TaskDef` 

464 Data structure that identifies the task class and its config. 

465 parent : `_PipelineScaffolding` 

466 The parent data structure that will hold the instance being 

467 constructed. 

468 datasetTypes : `TaskDatasetTypes` 

469 Data structure that categorizes the dataset types used by this task. 

470 """ 

471 

472 def __init__( 

473 self, 

474 taskDef: TaskDef, 

475 parent: _PipelineScaffolding, 

476 datasetTypes: TaskDatasetTypes, 

477 ): 

478 universe = parent.dimensions.universe 

479 self.taskDef = taskDef 

480 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

481 assert self.dimensions.issubset(parent.dimensions) 

482 # Initialize _DatasetDicts as subsets of the one or two 

483 # corresponding dicts in the parent _PipelineScaffolding. 

484 self.initInputs = _DatasetDict.fromSubset( 

485 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

486 ) 

487 self.initOutputs = _DatasetDict.fromSubset( 

488 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

489 ) 

490 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

491 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

492 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

493 self.dataIds: set[DataCoordinate] = set() 

494 self.quanta = {} 

495 self.storage_classes = { 

496 connection.name: connection.storageClass 

497 for connection in self.taskDef.connections.allConnections.values() 

498 } 

499 self.storage_classes[ 

500 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

501 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS 

502 self.storage_classes[ 

503 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

504 ] = acc.LOG_OUTPUT_STORAGE_CLASS 

505 self.storage_classes[ 

506 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

507 ] = acc.METADATA_OUTPUT_STORAGE_CLASS 

508 

509 def __repr__(self) -> str: 

510 # Default dataclass-injected __repr__ gets caught in an infinite loop 

511 # because of back-references. 

512 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

513 

514 taskDef: TaskDef 

515 """Data structure that identifies the task class and its config 

516 (`TaskDef`). 

517 """ 

518 

519 dimensions: DimensionGraph 

520 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

521 """ 

522 

523 initInputs: _DatasetDict 

524 """Dictionary containing information about datasets used to construct this 

525 task (`_DatasetDict`). 

526 """ 

527 

528 initOutputs: _DatasetDict 

529 """Dictionary containing information about datasets produced as a 

530 side-effect of constructing this task (`_DatasetDict`). 

531 """ 

532 

533 inputs: _DatasetDict 

534 """Dictionary containing information about datasets used as regular, 

535 graph-constraining inputs to this task (`_DatasetDict`). 

536 """ 

537 

538 outputs: _DatasetDict 

539 """Dictionary containing information about datasets produced by this task 

540 (`_DatasetDict`). 

541 """ 

542 

543 prerequisites: _DatasetDict 

544 """Dictionary containing information about input datasets that must be 

545 present in the repository before any Pipeline containing this task is run 

546 (`_DatasetDict`). 

547 """ 

548 

549 quanta: dict[DataCoordinate, _QuantumScaffolding] 

550 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

551 this task with that data ID. 

552 """ 

553 

554 storage_classes: dict[str, str] 

555 """Mapping from dataset type name to storage class declared by this task. 

556 """ 

557 

558 def makeQuantumSet( 

559 self, 

560 missing: _DatasetDict, 

561 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

562 ) -> set[Quantum]: 

563 """Create a `set` of `Quantum` from the information in ``self``. 

564 

565 Parameters 

566 ---------- 

567 missing : `_DatasetDict` 

568 Input datasets that have not been found. 

569 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

570 Records from the datastore to export with the quanta. 

571 

572 Returns 

573 ------- 

574 nodes : `set` of `Quantum` 

575 The `Quantum` elements corresponding to this task. 

576 """ 

577 outputs = set() 

578 for q in self.quanta.values(): 

579 try: 

580 tmpQuanta = q.makeQuantum(datastore_records) 

581 outputs.add(tmpQuanta) 

582 except (NoWorkFound, FileNotFoundError) as exc: 

583 if not missing.isdisjoint(q.inputs): 

584 # This is a node that is known to be pruned later and 

585 # should be left in even though some follow up queries 

586 # fail. This allows the pruning to start from this quantum 

587 # with known issues, and prune other nodes it touches. 

588 inputs = q.inputs.unpackMultiRefs(self.storage_classes) 

589 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes)) 

590 tmpQuantum = Quantum( 

591 taskName=q.task.taskDef.taskName, 

592 taskClass=q.task.taskDef.taskClass, 

593 dataId=q.dataId, 

594 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes), 

595 inputs=inputs, 

596 outputs=q.outputs.unpackMultiRefs(self.storage_classes), 

597 ) 

598 outputs.add(tmpQuantum) 

599 else: 

600 raise exc 

601 return outputs 

602 
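# Illustrative sketch (not part of the original module): assembling the
# per-task Quantum sets the same way makeQuantumGraph does further below,
# keeping quanta with known-missing inputs so that later pruning can start
# from them.  ``scaffolding`` is assumed to be a fully resolved
# _PipelineScaffolding.
def _example_make_all_quanta(
    scaffolding: _PipelineScaffolding,
    datastore_records: Mapping[str, DatastoreRecordData] | None = None,
) -> dict[TaskDef, set[Quantum]]:
    return {
        task.taskDef: task.makeQuantumSet(
            missing=scaffolding.missing, datastore_records=datastore_records
        )
        for task in scaffolding.tasks
    }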

603 

604class _DatasetIdMaker: 

605 """Helper class which generates random dataset UUIDs for unresolved 

606 datasets. 

607 """ 

608 

609 def __init__(self, run: str): 

610 self.run = run 

611 # Cache of dataset refs generated so far. 

612 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

613 

614 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef: 

615 # For components we need their parent dataset ID. 

616 if dataset_type.isComponent(): 

617 parent_type = dataset_type.makeCompositeDatasetType() 

618 # Parent should be resolved if this is an existing input, or it 

619 # should be in the cache already if it is an intermediate. 

620 key = parent_type, data_id 

621 if key not in self.resolved: 

622 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}") 

623 parent_ref = self.resolved[key] 

624 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False) 

625 

626 key = dataset_type, data_id 

627 if (resolved := self.resolved.get(key)) is None: 

628 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False) 

629 self.resolved[key] = resolved 

630 return resolved 

631 

632 def resolveDict(self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder]) -> None: 

633 """Resolve all unresolved references in the provided dictionary.""" 

634 for data_id, holder in refs.items(): 

635 if holder.ref is None: 

636 holder.ref = self.resolveRef(holder.dataset_type, data_id) 

637 
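# Illustrative sketch (not part of the original module): resolving every
# still-unresolved output holder against an output RUN collection, as
# resolveDatasetRefs does below.  ``run`` and ``outputs`` are assumed inputs.
def _example_resolve_output_refs(run: str, outputs: _DatasetDict) -> None:
    id_maker = _DatasetIdMaker(run)
    for dataset_type, holders_by_data_id in outputs.items():
        # Holders that already carry a resolved ref are left untouched.
        id_maker.resolveDict(dataset_type, holders_by_data_id)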

638 

639@dataclass 

640class _PipelineScaffolding: 

641 """A helper data structure that organizes the information involved in 

642 constructing a `QuantumGraph` for a `Pipeline`. 

643 

644 Parameters 

645 ---------- 

646 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

647 Sequence of tasks from which a graph is to be constructed. Must 

648 have nested task classes already imported. 

649 universe : `DimensionUniverse` 

650 Universe of all possible dimensions. 

651 

652 Notes 

653 ----- 

654 The scaffolding data structure contains nested data structures for both 

655 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

656 data structures are shared between the pipeline-level structure (which 

657 aggregates all datasets and categorizes them from the perspective of the 

658 complete pipeline) and the individual tasks that use them as inputs and 

659 outputs. 

660 

661 `QuantumGraph` construction proceeds in four steps, with each corresponding 

662 to a different `_PipelineScaffolding` method: 

663 

664 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

665 the DatasetTypes used by the pipeline (delegating to 

666 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

667 nested `_TaskScaffolding` and `_DatasetDict` objects. 

668 

669 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

670 returns related tuples of all dimensions used to identify any regular 

671 input, output, and intermediate datasets (not prerequisites). We then 

672 iterate over these tuples of related dimensions, identifying the subsets 

673 that correspond to distinct data IDs for each task and dataset type, 

674 and then create `_QuantumScaffolding` objects. 

675 

676 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

677 dataset data IDs previously identified, transforming unresolved 

678 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

679 up prerequisite datasets for all quanta. 

680 

681 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

682 per-task `_QuantumScaffolding` objects. 

683 """ 

684 
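# Illustrative sketch (not part of the original module): the four steps above
# in the order a caller such as GraphBuilder.makeGraph runs them, assuming a
# ``registry``, input ``collections``, an output ``run`` name, an (optional)
# user query string and an external data ID::
#
#     scaffolding = _PipelineScaffolding(pipeline, registry=registry)  # step 1
#     with scaffolding.connectDataIds(
#         registry, collections, userQuery, externalDataId
#     ) as commonDataIds:                                              # step 2
#         scaffolding.resolveDatasetRefs(
#             registry, collections, run, commonDataIds
#         )                                                            # step 3
#     qgraph = scaffolding.makeQuantumGraph(registry)                  # step 4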

685 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry): 

686 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

687 self.tasks = [] 

688 # Aggregate and categorize the DatasetTypes in the Pipeline. 

689 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

690 # Construct dictionaries that map those DatasetTypes to structures 

691 # that will (later) hold additional information about them. 

692 for attr in ( 

693 "initInputs", 

694 "initIntermediates", 

695 "initOutputs", 

696 "inputs", 

697 "intermediates", 

698 "outputs", 

699 "prerequisites", 

700 ): 

701 setattr( 

702 self, 

703 attr, 

704 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

705 ) 

706 self.missing = _DatasetDict(universe=registry.dimensions) 

707 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

708 # Aggregate all dimensions for all non-init, non-prerequisite 

709 # DatasetTypes. These are the ones we'll include in the big join 

710 # query. 

711 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

712 # Construct scaffolding nodes for each Task, and add backreferences 

713 # to the Task from each DatasetScaffolding node. 

714 # Note that there's only one scaffolding node for each DatasetType, 

715 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

716 # reference it. 

717 if isinstance(pipeline, Pipeline): 

718 pipeline = pipeline.toExpandedPipeline() 

719 self.tasks = [ 

720 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

721 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

722 ] 

723 

724 def __repr__(self) -> str: 

725 # Default dataclass-injected __repr__ gets caught in an infinite loop 

726 # because of back-references. 

727 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

728 

729 tasks: list[_TaskScaffolding] 

730 """Scaffolding data structures for each task in the pipeline 

731 (`list` of `_TaskScaffolding`). 

732 """ 

733 

734 initInputs: _DatasetDict 

735 """Datasets consumed but not produced when constructing the tasks in this 

736 pipeline (`_DatasetDict`). 

737 """ 

738 

739 initIntermediates: _DatasetDict 

740 """Datasets that are both consumed and produced when constructing the tasks 

741 in this pipeline (`_DatasetDict`). 

742 """ 

743 

744 initOutputs: _DatasetDict 

745 """Datasets produced but not consumed when constructing the tasks in this 

746 pipeline (`_DatasetDict`). 

747 """ 

748 

749 inputs: _DatasetDict 

750 """Datasets that are consumed but not produced when running this pipeline 

751 (`_DatasetDict`). 

752 """ 

753 

754 intermediates: _DatasetDict 

755 """Datasets that are both produced and consumed when running this pipeline 

756 (`_DatasetDict`). 

757 """ 

758 

759 outputs: _DatasetDict 

760 """Datasets produced but not consumed when when running this pipeline 

761 (`_DatasetDict`). 

762 """ 

763 

764 prerequisites: _DatasetDict 

765 """Datasets that are consumed when running this pipeline and looked up 

766 per-Quantum when generating the graph (`_DatasetDict`). 

767 """ 

768 

769 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

770 """Datasets that should be used as constraints in the initial query, 

771 according to tasks (`NamedValueSet`). 

772 """ 

773 

774 dimensions: DimensionGraph 

775 """All dimensions used by any regular input, intermediate, or output 

776 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

777 Query" (`DimensionGraph`). 

778 

779 This is required to be a superset of all task quantum dimensions. 

780 """ 

781 

782 missing: _DatasetDict 

783 """Datasets whose existence was originally predicted but were not 

784 actually found. 

785 

786 Quanta that require these datasets as inputs will be pruned (recursively) 

787 when actually constructing a `QuantumGraph` object. 

788 

789 These are currently populated only when the "initial dataset query 

790 constraint" does not include all overall-input dataset types, and hence the 

791 initial data ID query can include data IDs that it should not. 

792 """ 

793 

794 globalInitOutputs: _DatasetDict | None = None 

795 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

796 """ 

797 

798 @contextmanager 

799 def connectDataIds( 

800 self, 

801 registry: Registry, 

802 collections: Any, 

803 userQuery: Optional[str], 

804 externalDataId: DataCoordinate, 

805 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

806 bind: Optional[Mapping[str, Any]] = None, 

807 ) -> Iterator[DataCoordinateQueryResults]: 

808 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

809 

810 This method populates `_TaskScaffolding.dataIds` and 

811 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

812 

813 Parameters 

814 ---------- 

815 registry : `lsst.daf.butler.Registry` 

816 Registry for the data repository; used for all data ID queries. 

817 collections 

818 Expressions representing the collections to search for input 

819 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

820 userQuery : `str` or `None` 

821 User-provided expression to limit the data IDs processed. 

822 externalDataId : `DataCoordinate` 

823 Externally-provided data ID that should be used to restrict the 

824 results, just as if these constraints had been included via ``AND`` 

825 in ``userQuery``. This includes (at least) any instrument named 

826 in the pipeline definition. 

827 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

828 The query constraint variant that should be used to constrain the 

829 query based on dataset existence; defaults to 

830 `DatasetQueryConstraintVariant.ALL`. 

831 bind : `Mapping`, optional 

832 Mapping containing literal values that should be injected into the 

833 ``userQuery`` expression, keyed by the identifiers they replace. 

834 

835 Returns 

836 ------- 

837 commonDataIds : \ 

838 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

839 An interface to a database temporary table containing all data IDs 

840 that will appear in this `QuantumGraph`. Returned inside a 

841 context manager, which will drop the temporary table at the end of 

842 the `with` block in which this method is called. 

843 """ 

844 _LOG.debug("Building query for data IDs.") 

845 # Initialization datasets always have empty data IDs. 

846 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

847 for datasetType, refs in itertools.chain( 

848 self.initInputs.items(), 

849 self.initIntermediates.items(), 

850 self.initOutputs.items(), 

851 ): 

852 refs[emptyDataId] = _RefHolder(datasetType) 

853 # Run one big query for the data IDs for task dimensions and regular 

854 # inputs and outputs. We limit the query to only dimensions that are 

855 # associated with the input dataset types, but don't (yet) try to 

856 # obtain the dataset_ids for those inputs. 

857 _LOG.debug( 

858 "Submitting data ID query over dimensions %s and materializing results.", 

859 list(self.dimensions.names), 

860 ) 

861 queryArgs: dict[str, Any] = { 

862 "dimensions": self.dimensions, 

863 "where": userQuery, 

864 "dataId": externalDataId, 

865 "bind": bind, 

866 } 

867 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

868 _LOG.debug( 

869 "Constraining graph query using default of %s.", 

870 list(self.defaultDatasetQueryConstraints.names), 

871 ) 

872 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

873 queryArgs["collections"] = collections 

874 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

875 _LOG.debug("Not using dataset existence to constrain query.") 

876 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

877 constraint = set(datasetQueryConstraint) 

878 inputs = {k.name: k for k in self.inputs.keys()} 

879 if remainder := constraint.difference(inputs.keys()): 

880 raise ValueError( 

881 f"{remainder} dataset type(s) specified as a graph constraint, but" 

882 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

883 ) 

884 _LOG.debug("Constraining graph query using %s.", constraint) 

885 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

886 queryArgs["collections"] = collections 

887 else: 

888 raise ValueError( 

889 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

890 ) 

891 

892 if "datasets" in queryArgs: 

893 for i, dataset_type in enumerate(queryArgs["datasets"]): 

894 if dataset_type.isComponent(): 

895 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

896 

897 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

898 _LOG.debug("Expanding data IDs.") 

899 commonDataIds = commonDataIds.expanded() 

900 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

901 # Iterate over query results, populating data IDs for datasets and 

902 # quanta and then connecting them to each other. 

903 n = -1 

904 for n, commonDataId in enumerate(commonDataIds): 

905 # Create DatasetRefs for all DatasetTypes from this result row, 

906 # noting that we might have created some already. 

907 # We remember both those that already existed and those that we 

908 # create now. 

909 refsForRow = {} 

910 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {} 

911 for datasetType, refs in itertools.chain( 

912 self.inputs.items(), 

913 self.intermediates.items(), 

914 self.outputs.items(), 

915 ): 

916 datasetDataId: Optional[DataCoordinate] 

917 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

918 datasetDataId = commonDataId.subset(datasetType.dimensions) 

919 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

920 ref_holder = refs.get(datasetDataId) 

921 if ref_holder is None: 

922 ref_holder = _RefHolder(datasetType) 

923 refs[datasetDataId] = ref_holder 

924 refsForRow[datasetType.name] = ref_holder 

925 # Create _QuantumScaffolding objects for all tasks from this 

926 # result row, noting that we might have created some already. 

927 for task in self.tasks: 

928 quantumDataId = commonDataId.subset(task.dimensions) 

929 quantum = task.quanta.get(quantumDataId) 

930 if quantum is None: 

931 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

932 task.quanta[quantumDataId] = quantum 

933 # Whether this is a new quantum or an existing one, we can 

934 # now associate the DatasetRefs for this row with it. The 

935 # fact that a Quantum data ID and a dataset data ID both 

936 # came from the same result row is what tells us they 

937 # should be associated. 

938 # Many of these associations will be duplicates (because 

939 # another query row that differed from this one only in 

940 # irrelevant dimensions already added them); repeating the 

941 # assignment into the nested dicts is harmless. 

942 for datasetType in task.inputs: 

943 dataId = dataIdCacheForRow[datasetType.dimensions] 

944 ref_holder = refsForRow[datasetType.name] 

945 quantum.inputs[datasetType.name][dataId] = ref_holder 

946 for datasetType in task.outputs: 

947 dataId = dataIdCacheForRow[datasetType.dimensions] 

948 ref_holder = refsForRow[datasetType.name] 

949 quantum.outputs[datasetType.name][dataId] = ref_holder 

950 if n < 0: 

951 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

952 emptiness_explained = False 

953 for message in commonDataIds.explain_no_results(): 

954 _LOG.critical(message) 

955 emptiness_explained = True 

956 if not emptiness_explained: 

957 _LOG.critical( 

958 "To reproduce this query for debugging purposes, run " 

959 "Registry.queryDataIds with these arguments:" 

960 ) 

961 # We could just repr() the queryArgs dict to get something 

962 # the user could make sense of, but it's friendlier to 

963 # put these args in an easier-to-construct equivalent form 

964 # so they can read it more easily and copy and paste into 

965 # a Python terminal. 

966 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

967 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

968 if queryArgs["where"]: 

969 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

970 if "datasets" in queryArgs: 

971 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

972 if "collections" in queryArgs: 

973 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

974 _LOG.debug("Finished processing %d rows from data ID query.", n) 

975 yield commonDataIds 

976 
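# Illustrative sketch (not part of the original module): calling this method
# with dataset-existence constraints turned off.  Because the data ID query
# is then not constrained by the input datasets, resolveDatasetRefs should be
# told so via ``constrainedByAllDatasets=False`` so that unmatched data IDs
# are recorded in ``self.missing`` rather than treated as an error.  The
# temporary table behind ``commonDataIds`` only lives inside the ``with``
# block::
#
#     with scaffolding.connectDataIds(
#         registry,
#         collections,
#         userQuery=None,
#         externalDataId=DataCoordinate.makeEmpty(registry.dimensions),
#         datasetQueryConstraint=DatasetQueryConstraintVariant.OFF,
#     ) as commonDataIds:
#         scaffolding.resolveDatasetRefs(
#             registry, collections, run, commonDataIds, constrainedByAllDatasets=False
#         )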

977 def resolveDatasetRefs( 

978 self, 

979 registry: Registry, 

980 collections: Any, 

981 run: str, 

982 commonDataIds: DataCoordinateQueryResults, 

983 *, 

984 skipExistingIn: Any = None, 

985 clobberOutputs: bool = True, 

986 constrainedByAllDatasets: bool = True, 

987 ) -> None: 

988 """Perform follow up queries for each dataset data ID produced in 

989 `fillDataIds`. 

990 

991 This method populates the `_RefHolder.ref` attributes in the dataset 

992 dictionaries (except for those in `prerequisites`). 

993 

994 Parameters 

995 ---------- 

996 registry : `lsst.daf.butler.Registry` 

997 Registry for the data repository; used for all data ID queries. 

998 collections 

999 Expressions representing the collections to search for input 

1000 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1001 run : `str` 

1002 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1003 output datasets, if it already exists. 

1004 commonDataIds : \ 

1005 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

1006 Result of a previous call to `connectDataIds`. 

1007 skipExistingIn 

1008 Expressions representing the collections to search for existing 

1009 output datasets that should be skipped. See 

1010 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

1011 `None` or empty string/sequence disables skipping. 

1012 clobberOutputs : `bool`, optional 

1013 If `True` (default), allow quanta to be created even if outputs exist; 

1014 this requires the same behavior to be enabled when 

1015 executing. If ``skipExistingIn`` is not `None`, completed quanta 

1016 (those with metadata, or all outputs if there is no metadata 

1017 dataset configured) will be skipped rather than clobbered. 

1018 constrainedByAllDatasets : `bool`, optional 

1019 Indicates if the commonDataIds were generated with a constraint on 

1020 all dataset types. 

1021 

1022 Raises 

1023 ------ 

1024 OutputExistsError 

1025 Raised if an output dataset already exists in the output run 

1026 and ``skipExistingIn`` does not include output run, or if only 

1027 some outputs are present and ``clobberOutputs`` is `False`. 

1028 """ 

1029 # Run may be provided but it does not have to exist; in that case we 

1030 # use it for resolving references but don't check it for existing refs. 

1031 run_exists = False 

1032 if run: 

1033 try: 

1034 run_exists = bool(registry.queryCollections(run)) 

1035 except MissingCollectionError: 

1036 # An undocumented exception is raised if the collection does not exist. 

1037 pass 

1038 

1039 skip_collections_wildcard: CollectionWildcard | None = None 

1040 skipExistingInRun = False 

1041 if skipExistingIn: 

1042 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

1043 if run_exists: 

1044 # As an optimization, check the explicit list of names first. 

1045 skipExistingInRun = run in skip_collections_wildcard.strings 

1046 if not skipExistingInRun: 

1047 # need to flatten it and check again 

1048 skipExistingInRun = run in registry.queryCollections( 

1049 skipExistingIn, 

1050 collectionTypes=CollectionType.RUN, 

1051 ) 

1052 

1053 idMaker = _DatasetIdMaker(run) 

1054 

1055 resolvedRefQueryResults: Iterable[DatasetRef] 

1056 

1057 # Updating constrainedByAllDatasets here is not ideal, but we have a 

1058 # few different code paths that each transfer different pieces of 

1059 # information about what dataset query constraints were applied here, 

1060 # and none of them has the complete picture until we get here. We're 

1061 # long overdue for a QG generation rewrite that will make this go away 

1062 # entirely anyway. 

1063 constrainedByAllDatasets = ( 

1064 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

1065 ) 

1066 

1067 # Look up [init] intermediate and output datasets in the output 

1068 # collection, if there is an output collection. 

1069 if run_exists or skip_collections_wildcard is not None: 

1070 for datasetType, refs in itertools.chain( 

1071 self.initIntermediates.items(), 

1072 self.initOutputs.items(), 

1073 self.intermediates.items(), 

1074 self.outputs.items(), 

1075 ): 

1076 _LOG.debug( 

1077 "Resolving %d datasets for intermediate and/or output dataset %s.", 

1078 len(refs), 

1079 datasetType.name, 

1080 ) 

1081 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

1082 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

1083 # TODO: this assert incorrectly bans component inputs; 

1084 # investigate on DM-33027. 

1085 # assert not datasetType.isComponent(), \ 

1086 # "Output datasets cannot be components." 

1087 # 

1088 # Instead we have to handle them manually to avoid a 

1089 # deprecation warning, but it is at least confusing and 

1090 # possibly a bug for components to appear here at all. 

1091 if datasetType.isComponent(): 

1092 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1093 component = datasetType.component() 

1094 else: 

1095 parent_dataset_type = datasetType 

1096 component = None 

1097 

1098 # look at RUN collection first 

1099 if run_exists: 

1100 try: 

1101 resolvedRefQueryResults = subset.findDatasets( 

1102 parent_dataset_type, collections=run, findFirst=True 

1103 ) 

1104 except MissingDatasetTypeError: 

1105 resolvedRefQueryResults = [] 

1106 for resolvedRef in resolvedRefQueryResults: 

1107 # TODO: we could easily support per-DatasetType 

1108 # skipExisting and I could imagine that being useful - 

1109 # it's probably required in order to support writing 

1110 # initOutputs before QuantumGraph generation. 

1111 assert resolvedRef.dataId in refs 

1112 if not (skipExistingInRun or isInit or clobberOutputs): 

1113 raise OutputExistsError( 

1114 f"Output dataset {datasetType.name} already exists in " 

1115 f"output RUN collection '{run}' with data ID" 

1116 f" {resolvedRef.dataId}." 

1117 ) 

1118 # To resolve all outputs we have to remember existing 

1119 # ones to avoid generating new dataset IDs for them. 

1120 refs[resolvedRef.dataId].ref = ( 

1121 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1122 ) 

1123 

1124 # Also check skipExistingIn; the case where the RUN collection is 

1125 # part of it was already handled above. 

1126 if skip_collections_wildcard is not None: 

1127 try: 

1128 resolvedRefQueryResults = subset.findDatasets( 

1129 parent_dataset_type, 

1130 collections=skip_collections_wildcard, 

1131 findFirst=True, 

1132 ) 

1133 except MissingDatasetTypeError: 

1134 resolvedRefQueryResults = [] 

1135 for resolvedRef in resolvedRefQueryResults: 

1136 if resolvedRef.dataId not in refs: 

1137 continue 

1138 refs[resolvedRef.dataId].ref = ( 

1139 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1140 ) 

1141 

1142 # Look up input and initInput datasets in the input collection(s). We 

1143 # accumulate datasets in self.missing, if the common data IDs were not 

1144 # constrained on dataset type existence. 

1145 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1146 _LOG.debug( 

1147 "Resolving %d datasets for input dataset %s.", 

1148 len(refs), 

1149 datasetType.name, 

1150 ) 

1151 if datasetType.isComponent(): 

1152 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1153 component = datasetType.component() 

1154 else: 

1155 parent_dataset_type = datasetType 

1156 component = None 

1157 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {} 

1158 try: 

1159 resolvedRefQueryResults = commonDataIds.subset( 

1160 datasetType.dimensions, unique=True 

1161 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1162 except MissingDatasetTypeError: 

1163 resolvedRefQueryResults = [] 

1164 dataIdsNotFoundYet = set(refs.keys()) 

1165 for resolvedRef in resolvedRefQueryResults: 

1166 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1167 if resolvedRef.dataId not in refs: 

1168 continue 

1169 refs[resolvedRef.dataId].ref = ( 

1170 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1171 ) 

1172 if dataIdsNotFoundYet: 

1173 if constrainedByAllDatasets: 

1174 raise RuntimeError( 

1175 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1176 f"'{datasetType.name}' was/were present in a previous " 

1177 "query, but could not be found now. " 

1178 "This is either a logic bug in QuantumGraph generation " 

1179 "or the input collections have been modified since " 

1180 "QuantumGraph generation began." 

1181 ) 

1182 elif not datasetType.dimensions: 

1183 raise RuntimeError( 

1184 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1185 f"collections {collections}." 

1186 ) 

1187 else: 

1188 # If the common dataIds were not constrained using all the 

1189 # input dataset types, it is possible that some data ids 

1190 # found don't correspond to existing datasets. Mark these 

1191 # for later pruning from the quantum graph. 

1192 for k in dataIdsNotFoundYet: 

1193 missing_for_dataset_type[k] = refs[k] 

1194 if missing_for_dataset_type: 

1195 self.missing[datasetType] = missing_for_dataset_type 

1196 

1197 # Resolve the missing refs, just so they look like all of the others; 

1198 # in the end other code will make sure they never appear in the QG. 

1199 for dataset_type, refDict in self.missing.items(): 

1200 idMaker.resolveDict(dataset_type, refDict) 

1201 

1202 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1203 # replacing the unresolved refs there, and then look up prerequisites. 

1204 for task in self.tasks: 

1205 _LOG.debug( 

1206 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1207 len(task.quanta), 

1208 task.taskDef.label, 

1209 ) 

1210 # The way iterConnections is designed makes it impossible to 

1211 # annotate precisely enough to satisfy MyPy here. 

1212 lookupFunctions = { 

1213 c.name: c.lookupFunction # type: ignore 

1214 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1215 if c.lookupFunction is not None # type: ignore 

1216 } 

1217 dataIdsFailed = [] 

1218 dataIdsSucceeded = [] 

1219 for quantum in task.quanta.values(): 

1220 # Process output datasets only if skipExistingIn is not None 

1221 # or there is a run to look for outputs in and clobberOutputs 

1222 # is True. Note that if skipExistingIn is None, any output 

1223 # datasets that already exist would have already caused an 

1224 # exception to be raised. 

1225 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1226 resolvedRefs = [] 

1227 unresolvedDataIds = [] 

1228 haveMetadata = False 

1229 for datasetType, originalRefs in quantum.outputs.items(): 

1230 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1231 if ref is not None: 

1232 resolvedRefs.append(ref) 

1233 originalRefs[dataId].ref = ref 

1234 if datasetType.name == task.taskDef.metadataDatasetName: 

1235 haveMetadata = True 

1236 else: 

1237 unresolvedDataIds.append((datasetType, dataId)) 

1238 if resolvedRefs: 

1239 if haveMetadata or not unresolvedDataIds: 

1240 dataIdsSucceeded.append(quantum.dataId) 

1241 if skip_collections_wildcard is not None: 

1242 continue 

1243 else: 

1244 dataIdsFailed.append(quantum.dataId) 

1245 if not clobberOutputs: 

1246 raise OutputExistsError( 

1247 f"Quantum {quantum.dataId} of task with label " 

1248 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1249 f"({resolvedRefs}) " 

1250 f"and others that don't ({unresolvedDataIds}), with no metadata output, " 

1251 "and clobbering outputs was not enabled." 

1252 ) 

1253 # Update the input DatasetRefs to the resolved ones we already 

1254 # searched for. 

1255 for datasetType, input_refs in quantum.inputs.items(): 

1256 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()): 

1257 input_refs[data_id].ref = ref 

1258 # Look up prerequisite datasets in the input collection(s). 

1259 # These may have dimensions that extend beyond those we queried 

1260 # for originally, because we want to permit those data ID 

1261 # values to differ across quanta and dataset types. 

1262 for datasetType in task.prerequisites: 

1263 if datasetType.isComponent(): 

1264 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1265 component = datasetType.component() 

1266 else: 

1267 parent_dataset_type = datasetType 

1268 component = None 

1269 lookupFunction = lookupFunctions.get(datasetType.name) 

1270 if lookupFunction is not None: 

1271 # PipelineTask has provided its own function to do the 

1272 # lookup. This always takes precedence. 

1273 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1274 elif ( 

1275 datasetType.isCalibration() 

1276 and datasetType.dimensions <= quantum.dataId.graph 

1277 and quantum.dataId.graph.temporal 

1278 ): 

1279 # This is a master calibration lookup, which we have to 

1280 # handle specially because the query system can't do a 

1281 # temporal join on a non-dimension-based timespan yet. 

1282 timespan = quantum.dataId.timespan 

1283 try: 

1284 prereq_ref = registry.findDataset( 

1285 parent_dataset_type, 

1286 quantum.dataId, 

1287 collections=collections, 

1288 timespan=timespan, 

1289 ) 

1290 if prereq_ref is not None: 

1291 if component is not None: 

1292 prereq_ref = prereq_ref.makeComponentRef(component) 

1293 prereq_refs = [prereq_ref] 

1294 else: 

1295 prereq_refs = [] 

1296 except (KeyError, MissingDatasetTypeError): 

1297 # This dataset type is not present in the registry, 

1298 # which just means there are no datasets here. 

1299 prereq_refs = [] 

1300 else: 

1301 # Most general case. 

1302 prereq_refs = [ 

1303 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1304 for prereq_ref in registry.queryDatasets( 

1305 parent_dataset_type, 

1306 collections=collections, 

1307 dataId=quantum.dataId, 

1308 findFirst=True, 

1309 ).expanded() 

1310 ] 

1311 

1312 for ref in prereq_refs: 

1313 if ref is not None: 

1314 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1315 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1316 

1317 # Resolve all quantum inputs and outputs. 

1318 for datasetDict in (quantum.inputs, quantum.outputs): 

1319 for dataset_type, refDict in datasetDict.items(): 

1320 idMaker.resolveDict(dataset_type, refDict) 

1321 

1322 # Resolve task initInputs and initOutputs. 

1323 for datasetDict in (task.initInputs, task.initOutputs): 

1324 for dataset_type, refDict in datasetDict.items(): 

1325 idMaker.resolveDict(dataset_type, refDict) 

1326 

1327 # Actually remove any quanta that we decided to skip above. 

1328 if dataIdsSucceeded: 

1329 if skip_collections_wildcard is not None: 

1330 _LOG.debug( 

1331 "Pruning successful %d quanta for task with label '%s' because all of their " 

1332 "outputs exist or metadata was written successfully.", 

1333 len(dataIdsSucceeded), 

1334 task.taskDef.label, 

1335 ) 

1336 for dataId in dataIdsSucceeded: 

1337 del task.quanta[dataId] 

1338 elif clobberOutputs: 

1339 _LOG.info( 

1340 "Found %d successful quanta for task with label '%s' " 

1341 "that will need to be clobbered during execution.", 

1342 len(dataIdsSucceeded), 

1343 task.taskDef.label, 

1344 ) 

1345 else: 

1346 raise AssertionError("OutputExistsError should have already been raised.") 

1347 if dataIdsFailed: 

1348 if clobberOutputs: 

1349 _LOG.info( 

1350 "Found %d failed/incomplete quanta for task with label '%s' " 

1351 "that will need to be clobbered during execution.", 

1352 len(dataIdsFailed), 

1353 task.taskDef.label, 

1354 ) 

1355 else: 

1356 raise AssertionError("OutputExistsError should have already been raised.") 

1357 

1358 # Collect initOutputs that do not belong to any task. 

1359 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1360 for task in self.tasks: 

1361 global_dataset_types -= set(task.initOutputs) 

1362 if global_dataset_types: 

1363 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1364 for dataset_type, refDict in self.globalInitOutputs.items(): 

1365 idMaker.resolveDict(dataset_type, refDict) 

1366 
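# Illustrative sketch (not part of the original module): resolving refs while
# skipping quanta whose outputs already exist in previous runs.  The
# collection names here are arbitrary examples::
#
#     scaffolding.resolveDatasetRefs(
#         registry,
#         collections,
#         run,
#         commonDataIds,
#         skipExistingIn=[run, "u/example/previous_run"],
#         clobberOutputs=True,
#     )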

1367 def makeQuantumGraph( 

1368 self, 

1369 registry: Registry, 

1370 metadata: Optional[Mapping[str, Any]] = None, 

1371 datastore: Optional[Datastore] = None, 

1372 ) -> QuantumGraph: 

1373 """Create a `QuantumGraph` from the quanta already present in 

1374 the scaffolding data structure. 

1375 

1376 Parameters 

1377 ---------- 

1378 registry : `lsst.daf.butler.Registry` 

1379 Registry for the data repository; used for all data ID queries. 

1380 metadata : `Mapping` of `str` to primitives, optional 

1381 Extra metadata to carry with the graph. Entries in 

1382 this mapping should be serializable to 

1383 JSON. 

1384 datastore : `Datastore`, optional 

1385 If not `None` then fill datastore records in each generated 

1386 Quantum. 

1387 

1388 Returns 

1389 ------- 

1390 graph : `QuantumGraph` 

1391 The full `QuantumGraph`. 

1392 """ 

1393 

1394 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1395 """Extract all DatasetRefs from the dictionaries""" 

1396 for ref_dict in dataset_dict.values(): 

1397 for holder in ref_dict.values(): 

1398 yield holder.resolved_ref 

1399 

1400 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1401 if datastore is not None: 

1402 datastore_records = datastore.export_records( 

1403 itertools.chain( 

1404 _make_refs(self.inputs), 

1405 _make_refs(self.initInputs), 

1406 _make_refs(self.prerequisites), 

1407 ) 

1408 ) 

1409 

1410 graphInput: dict[TaskDef, set[Quantum]] = {} 

1411 for task in self.tasks: 

1412 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records) 

1413 graphInput[task.taskDef] = qset 

1414 

1415 taskInitInputs = { 

1416 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values() 

1417 for task in self.tasks 

1418 } 

1419 taskInitOutputs = { 

1420 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values() 

1421 for task in self.tasks 

1422 } 

1423 

1424 globalInitOutputs: list[DatasetRef] = [] 

1425 if self.globalInitOutputs is not None: 

1426 for refs_dict in self.globalInitOutputs.values(): 

1427 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values()) 

1428 

1429 graph = QuantumGraph( 

1430 graphInput, 

1431 metadata=metadata, 

1432 pruneRefs=list(self.missing.iter_resolved_refs()), 

1433 universe=self.dimensions.universe, 

1434 initInputs=taskInitInputs, 

1435 initOutputs=taskInitOutputs, 

1436 globalInitOutputs=globalInitOutputs, 

1437 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1438 ) 

1439 return graph 

1440 
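# A minimal usage sketch for makeQuantumGraph above (illustrative only; it
# assumes a populated scaffolding named ``scaffolding`` and a ``butler``
# exposing ``registry`` and ``datastore``, neither of which is defined here):
#
#     qgraph = scaffolding.makeQuantumGraph(
#         registry=butler.registry,
#         metadata={"comment": "example build"},
#         datastore=butler.datastore,
#     )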

1441 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1442 """Make a list of all dataset types used by a graph as defined in 

1443 registry. 

1444 """ 

1445 chain = [ 

1446 self.initInputs, 

1447 self.initIntermediates, 

1448 self.initOutputs, 

1449 self.inputs, 

1450 self.intermediates, 

1451 self.outputs, 

1452 self.prerequisites, 

1453 ] 

1454 if self.globalInitOutputs is not None: 

1455 chain.append(self.globalInitOutputs) 

1456 

1457 # Collect names of all dataset types. 

1458 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain)) 

1459 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1460 

1461 # Check for types that do not exist in registry yet: 

1462 # - inputs must exist 

1463 # - intermediates and outputs may not exist, but there must not be 

1464 # more than one definition (e.g. differing in storage class) 

1465 # - prerequisites may not exist; treat them the same as outputs here 
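# (Illustrative example: the same not-yet-registered dataset type name
# appearing here with two different storage classes would trigger the
# ValueError raised below.)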

1466 for dstype in itertools.chain(self.initInputs, self.inputs): 

1467 if dstype.name not in dataset_types: 

1468 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1469 

1470 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1471 chain = [ 

1472 self.initIntermediates, 

1473 self.initOutputs, 

1474 self.intermediates, 

1475 self.outputs, 

1476 self.prerequisites, 

1477 ] 

1478 if self.globalInitOutputs is not None: 

1479 chain.append(self.globalInitOutputs) 

1480 for dstype in itertools.chain(*chain): 

1481 if dstype.name not in dataset_types: 

1482 new_outputs[dstype.name].add(dstype) 

1483 for name, dstypes in new_outputs.items(): 

1484 if len(dstypes) > 1: 

1485 raise ValueError( 

1486 "Pipeline contains multiple definitions for a dataset type " 

1487 f"which is not defined in registry yet: {dstypes}" 

1488 ) 

1489 elif len(dstypes) == 1: 

1490 dataset_types[name] = dstypes.pop() 

1491 

1492 return dataset_types.values() 

1493 

1494 

1495# ------------------------ 

1496# Exported definitions -- 

1497# ------------------------ 

1498 

1499 

1500class GraphBuilderError(Exception): 

1501 """Base class for exceptions generated by graph builder.""" 

1502 

1503 pass 

1504 

1505 

1506class OutputExistsError(GraphBuilderError): 

1507 """Exception generated when output datasets already exist.""" 

1508 

1509 pass 

1510 

1511 

1512class PrerequisiteMissingError(GraphBuilderError): 

1513 """Exception generated when a prerequisite dataset does not exist.""" 

1514 

1515 pass 

1516 

1517 

1518class GraphBuilder: 

1519 """GraphBuilder class is responsible for building task execution graph from 

1520 a Pipeline. 

1521 

1522 Parameters 

1523 ---------- 

1524 registry : `~lsst.daf.butler.Registry` 

1525 Registry for the data repository; used for all data ID queries. 

1526 skipExistingIn 

1527 Expressions representing the collections to search for existing 

1528 output datasets that should be skipped. See 

1529 :ref:`daf_butler_ordered_collection_searches`. 

1530 clobberOutputs : `bool`, optional 

1531 If `True` (default), allow quanta to be created even if partial 

1532 outputs exist; this requires the same behavior to be enabled when 

1533 executing. 

1534 datastore : `Datastore`, optional 

1535 If not `None` then fill datastore records in each generated Quantum. 

1536 """ 

1537 

1538 def __init__( 

1539 self, 

1540 registry: Registry, 

1541 skipExistingIn: Any = None, 

1542 clobberOutputs: bool = True, 

1543 datastore: Optional[Datastore] = None, 

1544 ): 

1545 self.registry = registry 

1546 self.dimensions = registry.dimensions 

1547 self.skipExistingIn = skipExistingIn 

1548 self.clobberOutputs = clobberOutputs 

1549 self.datastore = datastore 

1550 

1551 def makeGraph( 

1552 self, 

1553 pipeline: Pipeline | Iterable[TaskDef], 

1554 collections: Any, 

1555 run: str, 

1556 userQuery: Optional[str], 

1557 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1558 metadata: Optional[Mapping[str, Any]] = None, 

1559 bind: Optional[Mapping[str, Any]] = None, 

1560 ) -> QuantumGraph: 

1561 """Create execution graph for a pipeline. 

1562 

1563 Parameters 

1564 ---------- 

1565 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1566 Pipeline definition, task names/classes and their configs. 

1567 collections 

1568 Expressions representing the collections to search for input 

1569 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1570 run : `str` 

1571 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1572 output datasets. The collection does not have to exist; it will be 

1573 created when the graph is executed. 

1574 userQuery : `str` or `None` 

1575 String that defines a user-provided selection for the registry; may be 

1576 empty or `None` if there are no restrictions on data selection. 

1577 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1578 The query constraint variant that should be used to constrain the 

1579 query based on dataset existence; defaults to 

1580 `DatasetQueryConstraintVariant.ALL`. 

1581 metadata : `Mapping` of `str` to primitives, optional 

1582 Extra metadata to carry with the graph. Entries in 

1583 this mapping should be serializable to 

1584 JSON. 

1585 bind : `Mapping`, optional 

1586 Mapping containing literal values that should be injected into the 

1587 ``userQuery`` expression, keyed by the identifiers they replace. 

1588 

1589 Returns 

1590 ------- 

1591 graph : `QuantumGraph` 

1592 The constructed execution graph. 

1593 Raises 

1594 ------ 

1595 UserExpressionError 

1596 Raised when the user expression cannot be parsed. 

1597 OutputExistsError 

1598 Raised when output datasets already exist. 

1599 Exception 

1600 Other exception types may be raised by the underlying registry 

1601 classes. 

1602 """ 

1603 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1604 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1605 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1606 instrument_class: Optional[Any] = None 

1607 if isinstance(pipeline, Pipeline): 

1608 instrument_class_name = pipeline.getInstrument() 

1609 if instrument_class_name is not None: 

1610 instrument_class = doImportType(instrument_class_name) 

1611 pipeline = list(pipeline.toExpandedPipeline()) 

1612 if instrument_class is not None: 

1613 dataId = DataCoordinate.standardize( 

1614 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1615 ) 

1616 else: 

1617 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1618 with scaffolding.connectDataIds( 

1619 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1620 ) as commonDataIds: 

1621 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1622 scaffolding.resolveDatasetRefs( 

1623 self.registry, 

1624 collections, 

1625 run, 

1626 commonDataIds, 

1627 skipExistingIn=self.skipExistingIn, 

1628 clobberOutputs=self.clobberOutputs, 

1629 constrainedByAllDatasets=condition, 

1630 ) 

1631 return scaffolding.makeQuantumGraph( 

1632 registry=self.registry, metadata=metadata, datastore=self.datastore 

1633 )
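
# A minimal end-to-end sketch of driving GraphBuilder (illustrative only;
# ``/repo/example``, ``pipeline.yaml``, the collection names, and the user
# query below are placeholders, not values defined in this module):
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import Pipeline
#
#     butler = Butler("/repo/example")
#     pipeline = Pipeline.fromFile("pipeline.yaml")
#     builder = GraphBuilder(butler.registry, datastore=butler.datastore)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["refcats", "ExampleCam/defaults"],
#         run="u/example/run",
#         userQuery="instrument = 'ExampleCam' AND visit = 12345",
#     )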