Coverage for python/lsst/pipe/base/graphBuilder.py: 15%

555 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-17 02:45 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap, defaultdict 

34from collections.abc import Collection, Iterable, Iterator, Mapping 

35from contextlib import contextmanager 

36from dataclasses import dataclass 

37from typing import Any, Optional 

38 

39from lsst.daf.butler import ( 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 NamedValueSet, 

50 Quantum, 

51 Registry, 

52) 

53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

55from lsst.daf.butler.registry.wildcards import CollectionWildcard 

56from lsst.utils import doImportType 

57 

58# ----------------------------- 

59# Imports for other modules -- 

60# ----------------------------- 

61from . import automatic_connection_constants as acc 

62from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

63from ._status import NoWorkFound 

64from .connections import AdjustQuantumHelper, iterConnections 

65from .graph import QuantumGraph 

66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

67 

68# ---------------------------------- 

69# Local non-exported definitions -- 

70# ---------------------------------- 

71 

72_LOG = logging.getLogger(__name__) 

73 

74 

75@dataclass 

76class _RefHolder: 

77 """Placeholder for `DatasetRef` representing a future resolved reference. 

78 

79 As we eliminated unresolved DatasetRefs we now use `None` to represent 

80 a reference that is yet to be resolved. Information about its corresponding 

81 dataset type and coordinate is stored in `_DatasetDict` mapping. 

82 """ 

83 

84 dataset_type: DatasetType 

85 """Dataset type of the dataset to be created later. I need to store it here 

86 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared 

87 between different compatible dataset types.""" 

88 

89 ref: DatasetRef | None = None 

90 """Dataset reference, initially `None`, created when all datasets are 

91 resolved. 

92 """ 

93 

94 @property 

95 def resolved_ref(self) -> DatasetRef: 

96 """Access resolved reference, should only be called after the 

97 reference is set (`DatasetRef`).""" 

98 assert self.ref is not None, "Dataset reference is not set." 

99 return self.ref 

100 

101 

102class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]): 

103 """A custom dictionary that maps `DatasetType` to a nested dictionary of 

104 the known `DatasetRef` instances of that type. 

105 

106 Parameters 

107 ---------- 

108 args 

109 Positional arguments are forwarded to the `dict` constructor. 

110 universe : `DimensionUniverse` 

111 Universe of all possible dimensions. 

112 """ 

113 

114 def __init__(self, *args: Any, universe: DimensionUniverse): 

115 super().__init__(*args) 

116 self.universe = universe 

117 

118 @classmethod 

119 def fromDatasetTypes( 

120 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

121 ) -> _DatasetDict: 

122 """Construct a dictionary from a flat iterable of `DatasetType` keys. 

123 

124 Parameters 

125 ---------- 

126 datasetTypes : `iterable` of `DatasetType` 

127 DatasetTypes to use as keys for the dict. Values will be empty 

128 dictionaries. 

129 universe : `DimensionUniverse` 

130 Universe of all possible dimensions. 

131 

132 Returns 

133 ------- 

134 dictionary : `_DatasetDict` 

135 A new `_DatasetDict` instance. 

136 """ 

137 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

138 

139 @classmethod 

140 def fromSubset( 

141 cls, 

142 datasetTypes: Collection[DatasetType], 

143 first: _DatasetDict, 

144 *rest: _DatasetDict, 

145 ) -> _DatasetDict: 

146 """Return a new dictionary by extracting items corresponding to the 

147 given keys from one or more existing dictionaries. 

148 

149 Parameters 

150 ---------- 

151 datasetTypes : `iterable` of `DatasetType` 

152 DatasetTypes to use as keys for the dict. Values will be obtained 

153 by lookups against ``first`` and ``rest``. 

154 first : `_DatasetDict` 

155 Another dictionary from which to extract values. 

156 rest 

157 Additional dictionaries from which to extract values. 

158 

159 Returns 

160 ------- 

161 dictionary : `_DatasetDict` 

162 A new dictionary instance. 

163 """ 

164 combined = ChainMap(first, *rest) 

165 

166 # Dataset types known to match immediately can be processed 

167 # without checks. 

168 matches = combined.keys() & set(datasetTypes) 

169 _dict = {k: combined[k] for k in matches} 

170 

171 if len(_dict) < len(datasetTypes): 

172 # Work out which ones are missing. 

173 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

174 

175 # Get the known names for comparison. 

176 combined_by_name = {k.name: k for k in combined} 

177 

178 missing = set() 

179 incompatible = {} 

180 for datasetType in missing_datasetTypes: 

181 # The dataset type is not found. It may not be listed 

182 # or it may be that it is there with the same name 

183 # but different definition. 

184 if datasetType.name in combined_by_name: 

185 # This implies some inconsistency in definitions 

186 # for connections. If there is support for storage 

187 # class conversion we can let it slide. 

188 # At this point we do not know 

189 # where the inconsistency is but trust that down 

190 # stream code will be more explicit about input 

191 # vs output incompatibilities. 

192 existing = combined_by_name[datasetType.name] 

193 convertible_to_existing = existing.is_compatible_with(datasetType) 

194 convertible_from_existing = datasetType.is_compatible_with(existing) 

195 if convertible_to_existing and convertible_from_existing: 

196 _LOG.debug( 

197 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

198 datasetType.name, 

199 datasetType.storageClass_name, 

200 existing.storageClass_name, 

201 ) 

202 _dict[datasetType] = combined[existing] 

203 elif convertible_to_existing or convertible_from_existing: 

204 # We'd need to refactor a fair amount to recognize 

205 # whether this is an error or not, so I'm not going to 

206 # bother until we need to do that for other reasons 

207 # (it won't be too long). 

208 _LOG.info( 

209 "Dataset type %s is present with multiple only partially-compatible storage " 

210 "classes %s and %s.", 

211 datasetType.name, 

212 datasetType.storageClass_name, 

213 existing.storageClass_name, 

214 ) 

215 _dict[datasetType] = combined[existing] 

216 else: 

217 incompatible[datasetType] = existing 

218 else: 

219 missing.add(datasetType) 

220 

221 if missing or incompatible: 

222 reasons = [] 

223 if missing: 

224 reasons.append( 

225 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known " 

226 f"types: [{', '.join(d.name for d in combined)}]." 

227 ) 

228 if incompatible: 

229 for x, y in incompatible.items(): 

230 reasons.append(f"{x} incompatible with {y}") 

231 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

232 

233 return cls(_dict, universe=first.universe) 

234 

235 @property 

236 def dimensions(self) -> DimensionGraph: 

237 """The union of all dimensions used by all dataset types in this 

238 dictionary, including implied dependencies (`DimensionGraph`). 

239 """ 

240 base = self.universe.empty 

241 if len(self) == 0: 

242 return base 

243 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

244 

245 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]: 

246 """Unpack nested single-element `DatasetRef` dicts into a new 

247 mapping with `DatasetType` keys and `DatasetRef` values. 

248 

249 This method assumes that each nest contains exactly one item, as is the 

250 case for all "init" datasets. 

251 

252 Parameters 

253 ---------- 

254 storage_classes : `dict` [ `str`, `str` ] 

255 Mapping from dataset type name to the storage class to use for that 

256 dataset type. These are typically the storage classes declared 

257 for a particular task, which may differ rom the data repository 

258 definitions. 

259 

260 Returns 

261 ------- 

262 dictionary : `NamedKeyDict` 

263 Dictionary mapping `DatasetType` to `DatasetRef`, with both 

264 `DatasetType` instances and string names usable as keys. 

265 """ 

266 return NamedKeyDict( 

267 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()} 

268 ) 

269 

270 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

271 """Unpack nested multi-element `DatasetRef` dicts into a new 

272 mapping with `DatasetType` keys and `list` of `DatasetRef` values. 

273 

274 Parameters 

275 ---------- 

276 storage_classes : `dict` [ `str`, `str` ] 

277 Mapping from dataset type name to the storage class to use for that 

278 dataset type. These are typically the storage classes declared 

279 for a particular task, which may differ rom the data repository 

280 definitions. 

281 

282 Returns 

283 ------- 

284 dictionary : `NamedKeyDict` 

285 Dictionary mapping `DatasetType` to `list` of `DatasetRef`, with 

286 both `DatasetType` instances and string names usable as keys. 

287 """ 

288 result = {} 

289 for dataset_type, holders in self.items(): 

290 if ( 

291 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name) 

292 ) != dataset_type.storageClass_name: 

293 dataset_type = dataset_type.overrideStorageClass(override) 

294 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()] 

295 else: 

296 refs = [holder.resolved_ref for holder in holders.values()] 

297 result[dataset_type] = refs 

298 return NamedKeyDict(result) 

299 

300 def extract( 

301 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

302 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]: 

303 """Iterate over the contained `DatasetRef` instances that match the 

304 given `DatasetType` and data IDs. 

305 

306 Parameters 

307 ---------- 

308 datasetType : `DatasetType` 

309 Dataset type to match. 

310 dataIds : `Iterable` [ `DataCoordinate` ] 

311 Data IDs to match. 

312 

313 Returns 

314 ------- 

315 refs : `Iterator` [ `DatasetRef` ] 

316 DatasetRef instances for which ``ref.datasetType == datasetType`` 

317 and ``ref.dataId`` is in ``dataIds``. 

318 """ 

319 refs = self[datasetType] 

320 return ((dataId, refs[dataId].ref) for dataId in dataIds) 

321 

322 def isdisjoint(self, other: _DatasetDict) -> bool: 

323 """Test whether ``self`` and ``other`` have any datasets in common. 

324 

325 Datasets are considered in common if they have the same *parent* 

326 dataset type name and data ID; storage classes and components are not 

327 considered. 

328 """ 

329 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()} 

330 for k, v in other.items(): 

331 parent_name, _ = k.nameAndComponent() 

332 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()): 

333 return False 

334 return True 

335 

336 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

337 """Iterate over all DatasetRef instances held by this data structure, 

338 assuming that each `_RefHolder` already carries are resolved ref. 

339 """ 

340 for holders_by_data_id in self.values(): 

341 for holder in holders_by_data_id.values(): 

342 yield holder.resolved_ref 

343 

344 

345class _QuantumScaffolding: 

346 """Helper class aggregating information about a `Quantum`, used when 

347 constructing a `QuantumGraph`. 

348 

349 See `_PipelineScaffolding` for a top-down description of the full 

350 scaffolding data structure. 

351 

352 Parameters 

353 ---------- 

354 task : _TaskScaffolding 

355 Back-reference to the helper object for the `PipelineTask` this quantum 

356 represents an execution of. 

357 dataId : `DataCoordinate` 

358 Data ID for this quantum. 

359 """ 

360 

361 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

362 self.task = task 

363 self.dataId = dataId 

364 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

365 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

366 self.prerequisites = _DatasetDict.fromDatasetTypes( 

367 task.prerequisites.keys(), universe=dataId.universe 

368 ) 

369 

370 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

371 

372 def __repr__(self) -> str: 

373 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

374 

375 task: _TaskScaffolding 

376 """Back-reference to the helper object for the `PipelineTask` this quantum 

377 represents an execution of. 

378 """ 

379 

380 dataId: DataCoordinate 

381 """Data ID for this quantum. 

382 """ 

383 

384 inputs: _DatasetDict 

385 """Nested dictionary containing `DatasetRef` inputs to this quantum. 

386 

387 This is initialized to map each `DatasetType` to an empty dictionary at 

388 construction. Those nested dictionaries are populated (with data IDs as 

389 keys) with unresolved `DatasetRef` instances in 

390 `_PipelineScaffolding.connectDataIds`. 

391 """ 

392 

393 outputs: _DatasetDict 

394 """Nested dictionary containing `DatasetRef` outputs this quantum. 

395 """ 

396 

397 prerequisites: _DatasetDict 

398 """Nested dictionary containing `DatasetRef` prerequisite inputs to this 

399 quantum. 

400 """ 

401 

402 def makeQuantum(self, datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None) -> Quantum: 

403 """Transform the scaffolding object into a true `Quantum` instance. 

404 

405 Parameters 

406 ---------- 

407 datastore_records : `dict` [ `str`, `DatastoreRecordData` ], optional 

408 If not `None` then fill datastore records in each generated Quantum 

409 using the records from this structure. 

410 

411 Returns 

412 ------- 

413 quantum : `Quantum` 

414 An actual `Quantum` instance. 

415 """ 

416 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes) 

417 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes)) 

418 # Give the task's Connections class an opportunity to remove some 

419 # inputs, or complain if they are unacceptable. 

420 # This will raise if one of the check conditions is not met, which is 

421 # the intended behavior. 

422 # If it raises NotWorkFound, there is a bug in the QG algorithm 

423 # or the adjustQuantum is incorrectly trying to make a prerequisite 

424 # input behave like a regular input; adjustQuantum should only raise 

425 # NoWorkFound if a regular input is missing, and it shouldn't be 

426 # possible for us to have generated ``self`` if that's true. 

427 helper = AdjustQuantumHelper( 

428 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes) 

429 ) 

430 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

431 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes) 

432 quantum_records: Optional[Mapping[str, DatastoreRecordData]] = None 

433 if datastore_records is not None: 

434 quantum_records = {} 

435 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

436 input_refs += list(initInputs.values()) 

437 input_ids = set(ref.id for ref in input_refs if ref.id is not None) 

438 for datastore_name, records in datastore_records.items(): 

439 matching_records = records.subset(input_ids) 

440 if matching_records is not None: 

441 quantum_records[datastore_name] = matching_records 

442 return Quantum( 

443 taskName=self.task.taskDef.taskName, 

444 taskClass=self.task.taskDef.taskClass, 

445 dataId=self.dataId, 

446 initInputs=initInputs, 

447 inputs=helper.inputs, 

448 outputs=helper.outputs, 

449 datastore_records=quantum_records, 

450 ) 

451 

452 

453@dataclass 

454class _TaskScaffolding: 

455 """Helper class aggregating information about a `PipelineTask`, used when 

456 constructing a `QuantumGraph`. 

457 

458 See `_PipelineScaffolding` for a top-down description of the full 

459 scaffolding data structure. 

460 

461 Parameters 

462 ---------- 

463 taskDef : `TaskDef` 

464 Data structure that identifies the task class and its config. 

465 parent : `_PipelineScaffolding` 

466 The parent data structure that will hold the instance being 

467 constructed. 

468 datasetTypes : `TaskDatasetTypes` 

469 Data structure that categorizes the dataset types used by this task. 

470 """ 

471 

472 def __init__( 

473 self, 

474 taskDef: TaskDef, 

475 parent: _PipelineScaffolding, 

476 datasetTypes: TaskDatasetTypes, 

477 ): 

478 universe = parent.dimensions.universe 

479 self.taskDef = taskDef 

480 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

481 assert self.dimensions.issubset(parent.dimensions) 

482 # Initialize _DatasetDicts as subsets of the one or two 

483 # corresponding dicts in the parent _PipelineScaffolding. 

484 self.initInputs = _DatasetDict.fromSubset( 

485 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

486 ) 

487 self.initOutputs = _DatasetDict.fromSubset( 

488 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

489 ) 

490 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

491 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

492 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

493 self.dataIds: set[DataCoordinate] = set() 

494 self.quanta = {} 

495 self.storage_classes = { 

496 connection.name: connection.storageClass 

497 for connection in self.taskDef.connections.allConnections.values() 

498 } 

499 self.storage_classes[ 

500 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

501 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS 

502 self.storage_classes[ 

503 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

504 ] = acc.LOG_OUTPUT_STORAGE_CLASS 

505 self.storage_classes[ 

506 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

507 ] = acc.METADATA_OUTPUT_STORAGE_CLASS 

508 

509 def __repr__(self) -> str: 

510 # Default dataclass-injected __repr__ gets caught in an infinite loop 

511 # because of back-references. 

512 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

513 

514 taskDef: TaskDef 

515 """Data structure that identifies the task class and its config 

516 (`TaskDef`). 

517 """ 

518 

519 dimensions: DimensionGraph 

520 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

521 """ 

522 

523 initInputs: _DatasetDict 

524 """Dictionary containing information about datasets used to construct this 

525 task (`_DatasetDict`). 

526 """ 

527 

528 initOutputs: _DatasetDict 

529 """Dictionary containing information about datasets produced as a 

530 side-effect of constructing this task (`_DatasetDict`). 

531 """ 

532 

533 inputs: _DatasetDict 

534 """Dictionary containing information about datasets used as regular, 

535 graph-constraining inputs to this task (`_DatasetDict`). 

536 """ 

537 

538 outputs: _DatasetDict 

539 """Dictionary containing information about datasets produced by this task 

540 (`_DatasetDict`). 

541 """ 

542 

543 prerequisites: _DatasetDict 

544 """Dictionary containing information about input datasets that must be 

545 present in the repository before any Pipeline containing this task is run 

546 (`_DatasetDict`). 

547 """ 

548 

549 quanta: dict[DataCoordinate, _QuantumScaffolding] 

550 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

551 this task with that data ID. 

552 """ 

553 

554 storage_classes: dict[str, str] 

555 """Mapping from dataset type name to storage class declared by this task. 

556 """ 

557 

558 def makeQuantumSet( 

559 self, 

560 missing: _DatasetDict, 

561 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

562 ) -> set[Quantum]: 

563 """Create a `set` of `Quantum` from the information in ``self``. 

564 

565 Parameters 

566 ---------- 

567 missing : `_DatasetDict` 

568 Input datasets that have not been found. 

569 datastore_records : `dict` 

570 Record from the datastore to export with quanta. 

571 

572 Returns 

573 ------- 

574 nodes : `set` of `Quantum` 

575 The `Quantum` elements corresponding to this task. 

576 """ 

577 outputs = set() 

578 for q in self.quanta.values(): 

579 try: 

580 tmpQuanta = q.makeQuantum(datastore_records) 

581 outputs.add(tmpQuanta) 

582 except (NoWorkFound, FileNotFoundError) as exc: 

583 if not missing.isdisjoint(q.inputs): 

584 # This is a node that is known to be pruned later and 

585 # should be left in even though some follow up queries 

586 # fail. This allows the pruning to start from this quantum 

587 # with known issues, and prune other nodes it touches. 

588 inputs = q.inputs.unpackMultiRefs(self.storage_classes) 

589 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes)) 

590 tmpQuantum = Quantum( 

591 taskName=q.task.taskDef.taskName, 

592 taskClass=q.task.taskDef.taskClass, 

593 dataId=q.dataId, 

594 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes), 

595 inputs=inputs, 

596 outputs=q.outputs.unpackMultiRefs(self.storage_classes), 

597 ) 

598 outputs.add(tmpQuantum) 

599 else: 

600 raise exc 

601 return outputs 

602 

603 

604class _DatasetIdMaker: 

605 """Helper class which generates random dataset UUIDs for unresolved 

606 datasets. 

607 """ 

608 

609 def __init__(self, run: str): 

610 self.run = run 

611 # Cache of dataset refs generated so far. 

612 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

613 

614 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef: 

615 # For components we need their parent dataset ID. 

616 if dataset_type.isComponent(): 

617 parent_type = dataset_type.makeCompositeDatasetType() 

618 # Parent should be resolved if this is an existing input, or it 

619 # should be in the cache already if it is an intermediate. 

620 key = parent_type, data_id 

621 if key not in self.resolved: 

622 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}") 

623 parent_ref = self.resolved[key] 

624 assert parent_ref.id is not None and parent_ref.run is not None, "parent ref must be resolved" 

625 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False) 

626 

627 key = dataset_type, data_id 

628 if (resolved := self.resolved.get(key)) is None: 

629 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False) 

630 self.resolved[key] = resolved 

631 return resolved 

632 

633 def resolveDict(self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder]) -> None: 

634 """Resolve all unresolved references in the provided dictionary.""" 

635 for data_id, holder in refs.items(): 

636 if holder.ref is None: 

637 holder.ref = self.resolveRef(holder.dataset_type, data_id) 

638 

639 

640@dataclass 

641class _PipelineScaffolding: 

642 """A helper data structure that organizes the information involved in 

643 constructing a `QuantumGraph` for a `Pipeline`. 

644 

645 Parameters 

646 ---------- 

647 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

648 Sequence of tasks from which a graph is to be constructed. Must 

649 have nested task classes already imported. 

650 universe : `DimensionUniverse` 

651 Universe of all possible dimensions. 

652 

653 Notes 

654 ----- 

655 The scaffolding data structure contains nested data structures for both 

656 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

657 data structures are shared between the pipeline-level structure (which 

658 aggregates all datasets and categorizes them from the perspective of the 

659 complete pipeline) and the individual tasks that use them as inputs and 

660 outputs. 

661 

662 `QuantumGraph` construction proceeds in four steps, with each corresponding 

663 to a different `_PipelineScaffolding` method: 

664 

665 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

666 the DatasetTypes used by the pipeline (delegating to 

667 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

668 nested `_TaskScaffolding` and `_DatasetDict` objects. 

669 

670 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

671 returns related tuples of all dimensions used to identify any regular 

672 input, output, and intermediate datasets (not prerequisites). We then 

673 iterate over these tuples of related dimensions, identifying the subsets 

674 that correspond to distinct data IDs for each task and dataset type, 

675 and then create `_QuantumScaffolding` objects. 

676 

677 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

678 dataset data IDs previously identified, transforming unresolved 

679 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

680 up prerequisite datasets for all quanta. 

681 

682 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

683 per-task `_QuantumScaffolding` objects. 

684 """ 

685 

686 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry): 

687 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

688 self.tasks = [] 

689 # Aggregate and categorize the DatasetTypes in the Pipeline. 

690 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

691 # Construct dictionaries that map those DatasetTypes to structures 

692 # that will (later) hold additional information about them. 

693 for attr in ( 

694 "initInputs", 

695 "initIntermediates", 

696 "initOutputs", 

697 "inputs", 

698 "intermediates", 

699 "outputs", 

700 "prerequisites", 

701 ): 

702 setattr( 

703 self, 

704 attr, 

705 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

706 ) 

707 self.missing = _DatasetDict(universe=registry.dimensions) 

708 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

709 # Aggregate all dimensions for all non-init, non-prerequisite 

710 # DatasetTypes. These are the ones we'll include in the big join 

711 # query. 

712 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

713 # Construct scaffolding nodes for each Task, and add backreferences 

714 # to the Task from each DatasetScaffolding node. 

715 # Note that there's only one scaffolding node for each DatasetType, 

716 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

717 # reference it. 

718 if isinstance(pipeline, Pipeline): 

719 pipeline = pipeline.toExpandedPipeline() 

720 self.tasks = [ 

721 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

722 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

723 ] 

724 

725 def __repr__(self) -> str: 

726 # Default dataclass-injected __repr__ gets caught in an infinite loop 

727 # because of back-references. 

728 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

729 

730 tasks: list[_TaskScaffolding] 

731 """Scaffolding data structures for each task in the pipeline 

732 (`list` of `_TaskScaffolding`). 

733 """ 

734 

735 initInputs: _DatasetDict 

736 """Datasets consumed but not produced when constructing the tasks in this 

737 pipeline (`_DatasetDict`). 

738 """ 

739 

740 initIntermediates: _DatasetDict 

741 """Datasets that are both consumed and produced when constructing the tasks 

742 in this pipeline (`_DatasetDict`). 

743 """ 

744 

745 initOutputs: _DatasetDict 

746 """Datasets produced but not consumed when constructing the tasks in this 

747 pipeline (`_DatasetDict`). 

748 """ 

749 

750 inputs: _DatasetDict 

751 """Datasets that are consumed but not produced when running this pipeline 

752 (`_DatasetDict`). 

753 """ 

754 

755 intermediates: _DatasetDict 

756 """Datasets that are both produced and consumed when running this pipeline 

757 (`_DatasetDict`). 

758 """ 

759 

760 outputs: _DatasetDict 

761 """Datasets produced but not consumed when when running this pipeline 

762 (`_DatasetDict`). 

763 """ 

764 

765 prerequisites: _DatasetDict 

766 """Datasets that are consumed when running this pipeline and looked up 

767 per-Quantum when generating the graph (`_DatasetDict`). 

768 """ 

769 

770 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

771 """Datasets that should be used as constraints in the initial query, 

772 according to tasks (`NamedValueSet`). 

773 """ 

774 

775 dimensions: DimensionGraph 

776 """All dimensions used by any regular input, intermediate, or output 

777 (not prerequisite) dataset; the set of dimension used in the "Big Join 

778 Query" (`DimensionGraph`). 

779 

780 This is required to be a superset of all task quantum dimensions. 

781 """ 

782 

783 missing: _DatasetDict 

784 """Datasets whose existence was originally predicted but were not 

785 actually found. 

786 

787 Quanta that require these datasets as inputs will be pruned (recursively) 

788 when actually constructing a `QuantumGraph` object. 

789 

790 These are currently populated only when the "initial dataset query 

791 constraint" does not include all overall-input dataset types, and hence the 

792 initial data ID query can include data IDs that it should not. 

793 """ 

794 

795 globalInitOutputs: _DatasetDict | None = None 

796 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

797 """ 

798 

799 @contextmanager 

800 def connectDataIds( 

801 self, 

802 registry: Registry, 

803 collections: Any, 

804 userQuery: Optional[str], 

805 externalDataId: DataCoordinate, 

806 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

807 bind: Optional[Mapping[str, Any]] = None, 

808 ) -> Iterator[DataCoordinateQueryResults]: 

809 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

810 

811 This method populates `_TaskScaffolding.dataIds` and 

812 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

813 

814 Parameters 

815 ---------- 

816 registry : `lsst.daf.butler.Registry` 

817 Registry for the data repository; used for all data ID queries. 

818 collections 

819 Expressions representing the collections to search for input 

820 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

821 userQuery : `str` or `None` 

822 User-provided expression to limit the data IDs processed. 

823 externalDataId : `DataCoordinate` 

824 Externally-provided data ID that should be used to restrict the 

825 results, just as if these constraints had been included via ``AND`` 

826 in ``userQuery``. This includes (at least) any instrument named 

827 in the pipeline definition. 

828 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

829 The query constraint variant that should be used to constraint the 

830 query based on dataset existance, defaults to 

831 `DatasetQueryConstraintVariant.ALL`. 

832 bind : `Mapping`, optional 

833 Mapping containing literal values that should be injected into the 

834 ``userQuery`` expression, keyed by the identifiers they replace. 

835 

836 Returns 

837 ------- 

838 commonDataIds : \ 

839 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

840 An interface to a database temporary table containing all data IDs 

841 that will appear in this `QuantumGraph`. Returned inside a 

842 context manager, which will drop the temporary table at the end of 

843 the `with` block in which this method is called. 

844 """ 

845 _LOG.debug("Building query for data IDs.") 

846 # Initialization datasets always have empty data IDs. 

847 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

848 for datasetType, refs in itertools.chain( 

849 self.initInputs.items(), 

850 self.initIntermediates.items(), 

851 self.initOutputs.items(), 

852 ): 

853 refs[emptyDataId] = _RefHolder(datasetType) 

854 # Run one big query for the data IDs for task dimensions and regular 

855 # inputs and outputs. We limit the query to only dimensions that are 

856 # associated with the input dataset types, but don't (yet) try to 

857 # obtain the dataset_ids for those inputs. 

858 _LOG.debug( 

859 "Submitting data ID query over dimensions %s and materializing results.", 

860 list(self.dimensions.names), 

861 ) 

862 queryArgs: dict[str, Any] = { 

863 "dimensions": self.dimensions, 

864 "where": userQuery, 

865 "dataId": externalDataId, 

866 "bind": bind, 

867 } 

868 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

869 _LOG.debug( 

870 "Constraining graph query using default of %s.", 

871 list(self.defaultDatasetQueryConstraints.names), 

872 ) 

873 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

874 queryArgs["collections"] = collections 

875 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

876 _LOG.debug("Not using dataset existence to constrain query.") 

877 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

878 constraint = set(datasetQueryConstraint) 

879 inputs = {k.name: k for k in self.inputs.keys()} 

880 if remainder := constraint.difference(inputs.keys()): 

881 raise ValueError( 

882 f"{remainder} dataset type(s) specified as a graph constraint, but" 

883 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

884 ) 

885 _LOG.debug(f"Constraining graph query using {constraint}") 

886 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

887 queryArgs["collections"] = collections 

888 else: 

889 raise ValueError( 

890 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

891 ) 

892 

893 if "datasets" in queryArgs: 

894 for i, dataset_type in enumerate(queryArgs["datasets"]): 

895 if dataset_type.isComponent(): 

896 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

897 

898 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

899 _LOG.debug("Expanding data IDs.") 

900 commonDataIds = commonDataIds.expanded() 

901 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

902 # Iterate over query results, populating data IDs for datasets and 

903 # quanta and then connecting them to each other. 

904 n = -1 

905 for n, commonDataId in enumerate(commonDataIds): 

906 # Create DatasetRefs for all DatasetTypes from this result row, 

907 # noting that we might have created some already. 

908 # We remember both those that already existed and those that we 

909 # create now. 

910 refsForRow = {} 

911 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {} 

912 for datasetType, refs in itertools.chain( 

913 self.inputs.items(), 

914 self.intermediates.items(), 

915 self.outputs.items(), 

916 ): 

917 datasetDataId: Optional[DataCoordinate] 

918 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

919 datasetDataId = commonDataId.subset(datasetType.dimensions) 

920 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

921 ref_holder = refs.get(datasetDataId) 

922 if ref_holder is None: 

923 ref_holder = _RefHolder(datasetType) 

924 refs[datasetDataId] = ref_holder 

925 refsForRow[datasetType.name] = ref_holder 

926 # Create _QuantumScaffolding objects for all tasks from this 

927 # result row, noting that we might have created some already. 

928 for task in self.tasks: 

929 quantumDataId = commonDataId.subset(task.dimensions) 

930 quantum = task.quanta.get(quantumDataId) 

931 if quantum is None: 

932 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

933 task.quanta[quantumDataId] = quantum 

934 # Whether this is a new quantum or an existing one, we can 

935 # now associate the DatasetRefs for this row with it. The 

936 # fact that a Quantum data ID and a dataset data ID both 

937 # came from the same result row is what tells us they 

938 # should be associated. 

939 # Many of these associates will be duplicates (because 

940 # another query row that differed from this one only in 

941 # irrelevant dimensions already added them), and we use 

942 # sets to skip. 

943 for datasetType in task.inputs: 

944 dataId = dataIdCacheForRow[datasetType.dimensions] 

945 ref_holder = refsForRow[datasetType.name] 

946 quantum.inputs[datasetType.name][dataId] = ref_holder 

947 for datasetType in task.outputs: 

948 dataId = dataIdCacheForRow[datasetType.dimensions] 

949 ref_holder = refsForRow[datasetType.name] 

950 quantum.outputs[datasetType.name][dataId] = ref_holder 

951 if n < 0: 

952 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

953 emptiness_explained = False 

954 for message in commonDataIds.explain_no_results(): 

955 _LOG.critical(message) 

956 emptiness_explained = True 

957 if not emptiness_explained: 

958 _LOG.critical( 

959 "To reproduce this query for debugging purposes, run " 

960 "Registry.queryDataIds with these arguments:" 

961 ) 

962 # We could just repr() the queryArgs dict to get something 

963 # the user could make sense of, but it's friendlier to 

964 # put these args in an easier-to-construct equivalent form 

965 # so they can read it more easily and copy and paste into 

966 # a Python terminal. 

967 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

968 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

969 if queryArgs["where"]: 

970 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

971 if "datasets" in queryArgs: 

972 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

973 if "collections" in queryArgs: 

974 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

975 _LOG.debug("Finished processing %d rows from data ID query.", n) 

976 yield commonDataIds 

977 

978 def resolveDatasetRefs( 

979 self, 

980 registry: Registry, 

981 collections: Any, 

982 run: str, 

983 commonDataIds: DataCoordinateQueryResults, 

984 *, 

985 skipExistingIn: Any = None, 

986 clobberOutputs: bool = True, 

987 constrainedByAllDatasets: bool = True, 

988 ) -> None: 

989 """Perform follow up queries for each dataset data ID produced in 

990 `fillDataIds`. 

991 

992 This method populates `_DatasetScaffolding.refs` (except for those in 

993 `prerequisites`). 

994 

995 Parameters 

996 ---------- 

997 registry : `lsst.daf.butler.Registry` 

998 Registry for the data repository; used for all data ID queries. 

999 collections 

1000 Expressions representing the collections to search for input 

1001 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1002 run : `str` 

1003 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1004 output datasets, if it already exists. 

1005 commonDataIds : \ 

1006 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

1007 Result of a previous call to `connectDataIds`. 

1008 skipExistingIn 

1009 Expressions representing the collections to search for existing 

1010 output datasets that should be skipped. See 

1011 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

1012 `None` or empty string/sequence disables skipping. 

1013 clobberOutputs : `bool`, optional 

1014 If `True` (default), allow quanta to created even if outputs exist; 

1015 this requires the same behavior behavior to be enabled when 

1016 executing. If ``skipExistingIn`` is not `None`, completed quanta 

1017 (those with metadata, or all outputs if there is no metadata 

1018 dataset configured) will be skipped rather than clobbered. 

1019 constrainedByAllDatasets : `bool`, optional 

1020 Indicates if the commonDataIds were generated with a constraint on 

1021 all dataset types. 

1022 

1023 Raises 

1024 ------ 

1025 OutputExistsError 

1026 Raised if an output dataset already exists in the output run 

1027 and ``skipExistingIn`` does not include output run, or if only 

1028 some outputs are present and ``clobberOutputs`` is `False`. 

1029 """ 

1030 # Run may be provided but it does not have to exist, in that case we 

1031 # use it for resolving references but don't check it for existing refs. 

1032 run_exists = False 

1033 if run: 

1034 try: 

1035 run_exists = bool(registry.queryCollections(run)) 

1036 except MissingCollectionError: 

1037 # Undocumented exception is raise if it does not exist 

1038 pass 

1039 

1040 skip_collections_wildcard: CollectionWildcard | None = None 

1041 skipExistingInRun = False 

1042 if skipExistingIn: 

1043 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

1044 if run_exists: 

1045 # as optimization check in the explicit list of names first 

1046 skipExistingInRun = run in skip_collections_wildcard.strings 

1047 if not skipExistingInRun: 

1048 # need to flatten it and check again 

1049 skipExistingInRun = run in registry.queryCollections( 

1050 skipExistingIn, 

1051 collectionTypes=CollectionType.RUN, 

1052 ) 

1053 

1054 idMaker = _DatasetIdMaker(run) 

1055 

1056 resolvedRefQueryResults: Iterable[DatasetRef] 

1057 

1058 # Updating constrainedByAllDatasets here is not ideal, but we have a 

1059 # few different code paths that each transfer different pieces of 

1060 # information about what dataset query constraints were applied here, 

1061 # and none of them has the complete picture until we get here. We're 

1062 # long overdue for a QG generation rewrite that will make this go away 

1063 # entirely anyway. 

1064 constrainedByAllDatasets = ( 

1065 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

1066 ) 

1067 

1068 # Look up [init] intermediate and output datasets in the output 

1069 # collection, if there is an output collection. 

1070 if run_exists or skip_collections_wildcard is not None: 

1071 for datasetType, refs in itertools.chain( 

1072 self.initIntermediates.items(), 

1073 self.initOutputs.items(), 

1074 self.intermediates.items(), 

1075 self.outputs.items(), 

1076 ): 

1077 _LOG.debug( 

1078 "Resolving %d datasets for intermediate and/or output dataset %s.", 

1079 len(refs), 

1080 datasetType.name, 

1081 ) 

1082 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

1083 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

1084 # TODO: this assert incorrectly bans component inputs; 

1085 # investigate on DM-33027. 

1086 # assert not datasetType.isComponent(), \ 

1087 # "Output datasets cannot be components." 

1088 # 

1089 # Instead we have to handle them manually to avoid a 

1090 # deprecation warning, but it is at least confusing and 

1091 # possibly a bug for components to appear here at all. 

1092 if datasetType.isComponent(): 

1093 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1094 component = datasetType.component() 

1095 else: 

1096 parent_dataset_type = datasetType 

1097 component = None 

1098 

1099 # look at RUN collection first 

1100 if run_exists: 

1101 try: 

1102 resolvedRefQueryResults = subset.findDatasets( 

1103 parent_dataset_type, collections=run, findFirst=True 

1104 ) 

1105 except MissingDatasetTypeError: 

1106 resolvedRefQueryResults = [] 

1107 for resolvedRef in resolvedRefQueryResults: 

1108 # TODO: we could easily support per-DatasetType 

1109 # skipExisting and I could imagine that being useful - 

1110 # it's probably required in order to support writing 

1111 # initOutputs before QuantumGraph generation. 

1112 assert resolvedRef.dataId in refs 

1113 if not (skipExistingInRun or isInit or clobberOutputs): 

1114 raise OutputExistsError( 

1115 f"Output dataset {datasetType.name} already exists in " 

1116 f"output RUN collection '{run}' with data ID" 

1117 f" {resolvedRef.dataId}." 

1118 ) 

1119 # To resolve all outputs we have to remember existing 

1120 # ones to avoid generating new dataset IDs for them. 

1121 refs[resolvedRef.dataId].ref = ( 

1122 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1123 ) 

1124 

1125 # And check skipExistingIn too, if RUN collection is in 

1126 # it is handled above 

1127 if skip_collections_wildcard is not None: 

1128 try: 

1129 resolvedRefQueryResults = subset.findDatasets( 

1130 parent_dataset_type, 

1131 collections=skip_collections_wildcard, 

1132 findFirst=True, 

1133 ) 

1134 except MissingDatasetTypeError: 

1135 resolvedRefQueryResults = [] 

1136 for resolvedRef in resolvedRefQueryResults: 

1137 if resolvedRef.dataId not in refs: 

1138 continue 

1139 refs[resolvedRef.dataId].ref = ( 

1140 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1141 ) 

1142 

1143 # Look up input and initInput datasets in the input collection(s). We 

1144 # accumulate datasets in self.missing, if the common data IDs were not 

1145 # constrained on dataset type existence. 

1146 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1147 _LOG.debug( 

1148 "Resolving %d datasets for input dataset %s.", 

1149 len(refs), 

1150 datasetType.name, 

1151 ) 

1152 if datasetType.isComponent(): 

1153 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1154 component = datasetType.component() 

1155 else: 

1156 parent_dataset_type = datasetType 

1157 component = None 

1158 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {} 

1159 try: 

1160 resolvedRefQueryResults = commonDataIds.subset( 

1161 datasetType.dimensions, unique=True 

1162 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1163 except MissingDatasetTypeError: 

1164 resolvedRefQueryResults = [] 

1165 dataIdsNotFoundYet = set(refs.keys()) 

1166 for resolvedRef in resolvedRefQueryResults: 

1167 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1168 if resolvedRef.dataId not in refs: 

1169 continue 

1170 refs[resolvedRef.dataId].ref = ( 

1171 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1172 ) 

1173 if dataIdsNotFoundYet: 

1174 if constrainedByAllDatasets: 

1175 raise RuntimeError( 

1176 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1177 f"'{datasetType.name}' was/were present in a previous " 

1178 "query, but could not be found now. " 

1179 "This is either a logic bug in QuantumGraph generation " 

1180 "or the input collections have been modified since " 

1181 "QuantumGraph generation began." 

1182 ) 

1183 elif not datasetType.dimensions: 

1184 raise RuntimeError( 

1185 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1186 f"collections {collections}." 

1187 ) 

1188 else: 

1189 # If the common dataIds were not constrained using all the 

1190 # input dataset types, it is possible that some data ids 

1191 # found don't correspond to existing datasets. Mark these 

1192 # for later pruning from the quantum graph. 

1193 for k in dataIdsNotFoundYet: 

1194 missing_for_dataset_type[k] = refs[k] 

1195 if missing_for_dataset_type: 

1196 self.missing[datasetType] = missing_for_dataset_type 

1197 

1198 # Resolve the missing refs, just so they look like all of the others; 

1199 # in the end other code will make sure they never appear in the QG. 

1200 for dataset_type, refDict in self.missing.items(): 

1201 idMaker.resolveDict(dataset_type, refDict) 

1202 

1203 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1204 # replacing the unresolved refs there, and then look up prerequisites. 

1205 for task in self.tasks: 

1206 _LOG.debug( 

1207 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1208 len(task.quanta), 

1209 task.taskDef.label, 

1210 ) 

1211 # The way iterConnections is designed makes it impossible to 

1212 # annotate precisely enough to satisfy MyPy here. 

1213 lookupFunctions = { 

1214 c.name: c.lookupFunction # type: ignore 

1215 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1216 if c.lookupFunction is not None # type: ignore 

1217 } 

1218 dataIdsFailed = [] 

1219 dataIdsSucceeded = [] 

1220 for quantum in task.quanta.values(): 

1221 # Process outputs datasets only if skipExistingIn is not None 

1222 # or there is a run to look for outputs in and clobberOutputs 

1223 # is True. Note that if skipExistingIn is None, any output 

1224 # datasets that already exist would have already caused an 

1225 # exception to be raised. 

1226 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1227 resolvedRefs = [] 

1228 unresolvedDataIds = [] 

1229 haveMetadata = False 

1230 for datasetType, originalRefs in quantum.outputs.items(): 

1231 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1232 if ref is not None: 

1233 resolvedRefs.append(ref) 

1234 originalRefs[dataId].ref = ref 

1235 if datasetType.name == task.taskDef.metadataDatasetName: 

1236 haveMetadata = True 

1237 else: 

1238 unresolvedDataIds.append((datasetType, dataId)) 

1239 if resolvedRefs: 

1240 if haveMetadata or not unresolvedDataIds: 

1241 dataIdsSucceeded.append(quantum.dataId) 

1242 if skip_collections_wildcard is not None: 

1243 continue 

1244 else: 

1245 dataIdsFailed.append(quantum.dataId) 

1246 if not clobberOutputs: 

1247 raise OutputExistsError( 

1248 f"Quantum {quantum.dataId} of task with label " 

1249 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1250 f"({resolvedRefs}) " 

1251 f"and others that don't ({unresolvedDataIds}), with no metadata output, " 

1252 "and clobbering outputs was not enabled." 

1253 ) 

1254 # Update the input DatasetRefs to the resolved ones we already 

1255 # searched for. 

1256 for datasetType, input_refs in quantum.inputs.items(): 

1257 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()): 

1258 input_refs[data_id].ref = ref 

1259 # Look up prerequisite datasets in the input collection(s). 

1260 # These may have dimensions that extend beyond those we queried 

1261 # for originally, because we want to permit those data ID 

1262 # values to differ across quanta and dataset types. 

1263 for datasetType in task.prerequisites: 

1264 if datasetType.isComponent(): 

1265 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1266 component = datasetType.component() 

1267 else: 

1268 parent_dataset_type = datasetType 

1269 component = None 

1270 lookupFunction = lookupFunctions.get(datasetType.name) 

1271 if lookupFunction is not None: 

1272 # PipelineTask has provided its own function to do the 

1273 # lookup. This always takes precedence. 

1274 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1275 elif ( 

1276 datasetType.isCalibration() 

1277 and datasetType.dimensions <= quantum.dataId.graph 

1278 and quantum.dataId.graph.temporal 

1279 ): 

1280 # This is a master calibration lookup, which we have to 

1281 # handle specially because the query system can't do a 

1282 # temporal join on a non-dimension-based timespan yet. 

1283 timespan = quantum.dataId.timespan 

1284 try: 

1285 prereq_ref = registry.findDataset( 

1286 parent_dataset_type, 

1287 quantum.dataId, 

1288 collections=collections, 

1289 timespan=timespan, 

1290 ) 

1291 if prereq_ref is not None: 

1292 if component is not None: 

1293 prereq_ref = prereq_ref.makeComponentRef(component) 

1294 prereq_refs = [prereq_ref] 

1295 else: 

1296 prereq_refs = [] 

1297 except (KeyError, MissingDatasetTypeError): 

1298 # This dataset type is not present in the registry, 

1299 # which just means there are no datasets here. 

1300 prereq_refs = [] 

1301 else: 

1302 # Most general case. 

1303 prereq_refs = [ 

1304 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1305 for prereq_ref in registry.queryDatasets( 

1306 parent_dataset_type, 

1307 collections=collections, 

1308 dataId=quantum.dataId, 

1309 findFirst=True, 

1310 ).expanded() 

1311 ] 

1312 

1313 for ref in prereq_refs: 

1314 if ref is not None: 

1315 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1316 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1317 

1318 # Resolve all quantum inputs and outputs. 

1319 for datasetDict in (quantum.inputs, quantum.outputs): 

1320 for dataset_type, refDict in datasetDict.items(): 

1321 idMaker.resolveDict(dataset_type, refDict) 

1322 

1323 # Resolve task initInputs and initOutputs. 

1324 for datasetDict in (task.initInputs, task.initOutputs): 

1325 for dataset_type, refDict in datasetDict.items(): 

1326 idMaker.resolveDict(dataset_type, refDict) 

1327 

1328 # Actually remove any quanta that we decided to skip above. 

1329 if dataIdsSucceeded: 

1330 if skip_collections_wildcard is not None: 

1331 _LOG.debug( 

1332 "Pruning successful %d quanta for task with label '%s' because all of their " 

1333 "outputs exist or metadata was written successfully.", 

1334 len(dataIdsSucceeded), 

1335 task.taskDef.label, 

1336 ) 

1337 for dataId in dataIdsSucceeded: 

1338 del task.quanta[dataId] 

1339 elif clobberOutputs: 

1340 _LOG.info( 

1341 "Found %d successful quanta for task with label '%s' " 

1342 "that will need to be clobbered during execution.", 

1343 len(dataIdsSucceeded), 

1344 task.taskDef.label, 

1345 ) 

1346 else: 

1347 raise AssertionError("OutputExistsError should have already been raised.") 

1348 if dataIdsFailed: 

1349 if clobberOutputs: 

1350 _LOG.info( 

1351 "Found %d failed/incomplete quanta for task with label '%s' " 

1352 "that will need to be clobbered during execution.", 

1353 len(dataIdsFailed), 

1354 task.taskDef.label, 

1355 ) 

1356 else: 

1357 raise AssertionError("OutputExistsError should have already been raised.") 

1358 

1359 # Collect initOutputs that do not belong to any task. 

1360 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1361 for task in self.tasks: 

1362 global_dataset_types -= set(task.initOutputs) 

1363 if global_dataset_types: 

1364 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1365 for dataset_type, refDict in self.globalInitOutputs.items(): 

1366 idMaker.resolveDict(dataset_type, refDict) 

1367 

1368 def makeQuantumGraph( 

1369 self, 

1370 registry: Registry, 

1371 metadata: Optional[Mapping[str, Any]] = None, 

1372 datastore: Optional[Datastore] = None, 

1373 ) -> QuantumGraph: 

1374 """Create a `QuantumGraph` from the quanta already present in 

1375 the scaffolding data structure. 

1376 

1377 Parameters 

1378 --------- 

1379 registry : `lsst.daf.butler.Registry` 

1380 Registry for the data repository; used for all data ID queries. 

1381 metadata : Optional Mapping of `str` to primitives 

1382 This is an optional parameter of extra data to carry with the 

1383 graph. Entries in this mapping should be able to be serialized in 

1384 JSON. 

1385 datastore : `Datastore`, optional 

1386 If not `None` then fill datastore records in each generated 

1387 Quantum. 

1388 

1389 Returns 

1390 ------- 

1391 graph : `QuantumGraph` 

1392 The full `QuantumGraph`. 

1393 """ 

1394 

1395 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1396 """Extract all DatasetRefs from the dictionaries""" 

1397 for ref_dict in dataset_dict.values(): 

1398 for holder in ref_dict.values(): 

1399 yield holder.resolved_ref 

1400 

1401 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None 

1402 if datastore is not None: 

1403 datastore_records = datastore.export_records( 

1404 itertools.chain( 

1405 _make_refs(self.inputs), 

1406 _make_refs(self.initInputs), 

1407 _make_refs(self.prerequisites), 

1408 ) 

1409 ) 

1410 

1411 graphInput: dict[TaskDef, set[Quantum]] = {} 

1412 for task in self.tasks: 

1413 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records) 

1414 graphInput[task.taskDef] = qset 

1415 

1416 taskInitInputs = { 

1417 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values() 

1418 for task in self.tasks 

1419 } 

1420 taskInitOutputs = { 

1421 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values() 

1422 for task in self.tasks 

1423 } 

1424 

1425 globalInitOutputs: list[DatasetRef] = [] 

1426 if self.globalInitOutputs is not None: 

1427 for refs_dict in self.globalInitOutputs.values(): 

1428 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values()) 

1429 

1430 graph = QuantumGraph( 

1431 graphInput, 

1432 metadata=metadata, 

1433 pruneRefs=list(self.missing.iter_resolved_refs()), 

1434 universe=self.dimensions.universe, 

1435 initInputs=taskInitInputs, 

1436 initOutputs=taskInitOutputs, 

1437 globalInitOutputs=globalInitOutputs, 

1438 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1439 ) 

1440 return graph 

1441 

1442 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1443 """Make a list of all dataset types used by a graph as defined in 

1444 registry. 

1445 """ 

1446 chain = [ 

1447 self.initInputs, 

1448 self.initIntermediates, 

1449 self.initOutputs, 

1450 self.inputs, 

1451 self.intermediates, 

1452 self.outputs, 

1453 self.prerequisites, 

1454 ] 

1455 if self.globalInitOutputs is not None: 

1456 chain.append(self.globalInitOutputs) 

1457 

1458 # Collect names of all dataset types. 

1459 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain)) 

1460 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1461 

1462 # Check for types that do not exist in registry yet: 

1463 # - inputs must exist 

1464 # - intermediates and outputs may not exist, but there must not be 

1465 # more than one definition (e.g. differing in storage class) 

1466 # - prerequisites may not exist, treat it the same as outputs here 

1467 for dstype in itertools.chain(self.initInputs, self.inputs): 

1468 if dstype.name not in dataset_types: 

1469 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1470 

1471 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1472 chain = [ 

1473 self.initIntermediates, 

1474 self.initOutputs, 

1475 self.intermediates, 

1476 self.outputs, 

1477 self.prerequisites, 

1478 ] 

1479 if self.globalInitOutputs is not None: 

1480 chain.append(self.globalInitOutputs) 

1481 for dstype in itertools.chain(*chain): 

1482 if dstype.name not in dataset_types: 

1483 new_outputs[dstype.name].add(dstype) 

1484 for name, dstypes in new_outputs.items(): 

1485 if len(dstypes) > 1: 

1486 raise ValueError( 

1487 "Pipeline contains multiple definitions for a dataset type " 

1488 f"which is not defined in registry yet: {dstypes}" 

1489 ) 

1490 elif len(dstypes) == 1: 

1491 dataset_types[name] = dstypes.pop() 

1492 

1493 return dataset_types.values() 

1494 

1495 

1496# ------------------------ 

1497# Exported definitions -- 

1498# ------------------------ 

1499 

1500 

1501class GraphBuilderError(Exception): 

1502 """Base class for exceptions generated by graph builder.""" 

1503 

1504 pass 

1505 

1506 

1507class OutputExistsError(GraphBuilderError): 

1508 """Exception generated when output datasets already exist.""" 

1509 

1510 pass 

1511 

1512 

1513class PrerequisiteMissingError(GraphBuilderError): 

1514 """Exception generated when a prerequisite dataset does not exist.""" 

1515 

1516 pass 

1517 

1518 

1519class GraphBuilder: 

1520 """GraphBuilder class is responsible for building task execution graph from 

1521 a Pipeline. 

1522 

1523 Parameters 

1524 ---------- 

1525 registry : `~lsst.daf.butler.Registry` 

1526 Data butler instance. 

1527 skipExistingIn 

1528 Expressions representing the collections to search for existing 

1529 output datasets that should be skipped. See 

1530 :ref:`daf_butler_ordered_collection_searches`. 

1531 clobberOutputs : `bool`, optional 

1532 If `True` (default), allow quanta to created even if partial outputs 

1533 exist; this requires the same behavior behavior to be enabled when 

1534 executing. 

1535 datastore : `Datastore`, optional 

1536 If not `None` then fill datastore records in each generated Quantum. 

1537 """ 

1538 

1539 def __init__( 

1540 self, 

1541 registry: Registry, 

1542 skipExistingIn: Any = None, 

1543 clobberOutputs: bool = True, 

1544 datastore: Optional[Datastore] = None, 

1545 ): 

1546 self.registry = registry 

1547 self.dimensions = registry.dimensions 

1548 self.skipExistingIn = skipExistingIn 

1549 self.clobberOutputs = clobberOutputs 

1550 self.datastore = datastore 

1551 

1552 def makeGraph( 

1553 self, 

1554 pipeline: Pipeline | Iterable[TaskDef], 

1555 collections: Any, 

1556 run: str, 

1557 userQuery: Optional[str], 

1558 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1559 metadata: Optional[Mapping[str, Any]] = None, 

1560 bind: Optional[Mapping[str, Any]] = None, 

1561 ) -> QuantumGraph: 

1562 """Create execution graph for a pipeline. 

1563 

1564 Parameters 

1565 ---------- 

1566 pipeline : `Pipeline` or `Iterable` [ `TaskDef` ] 

1567 Pipeline definition, task names/classes and their configs. 

1568 collections 

1569 Expressions representing the collections to search for input 

1570 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1571 run : `str` 

1572 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1573 output datasets. Collection does not have to exist and it will be 

1574 created when graph is executed. 

1575 userQuery : `str` 

1576 String which defines user-defined selection for registry, should be 

1577 empty or `None` if there is no restrictions on data selection. 

1578 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1579 The query constraint variant that should be used to constraint the 

1580 query based on dataset existance, defaults to 

1581 `DatasetQueryConstraintVariant.ALL`. 

1582 metadata : Optional Mapping of `str` to primitives 

1583 This is an optional parameter of extra data to carry with the 

1584 graph. Entries in this mapping should be able to be serialized in 

1585 JSON. 

1586 bind : `Mapping`, optional 

1587 Mapping containing literal values that should be injected into the 

1588 ``userQuery`` expression, keyed by the identifiers they replace. 

1589 

1590 Returns 

1591 ------- 

1592 graph : `QuantumGraph` 

1593 

1594 Raises 

1595 ------ 

1596 UserExpressionError 

1597 Raised when user expression cannot be parsed. 

1598 OutputExistsError 

1599 Raised when output datasets already exist. 

1600 Exception 

1601 Other exceptions types may be raised by underlying registry 

1602 classes. 

1603 """ 

1604 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1605 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1606 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1607 instrument_class: Optional[Any] = None 

1608 if isinstance(pipeline, Pipeline): 

1609 instrument_class_name = pipeline.getInstrument() 

1610 if instrument_class_name is not None: 

1611 instrument_class = doImportType(instrument_class_name) 

1612 pipeline = list(pipeline.toExpandedPipeline()) 

1613 if instrument_class is not None: 

1614 dataId = DataCoordinate.standardize( 

1615 instrument=instrument_class.getName(), universe=self.registry.dimensions 

1616 ) 

1617 else: 

1618 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1619 with scaffolding.connectDataIds( 

1620 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1621 ) as commonDataIds: 

1622 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1623 scaffolding.resolveDatasetRefs( 

1624 self.registry, 

1625 collections, 

1626 run, 

1627 commonDataIds, 

1628 skipExistingIn=self.skipExistingIn, 

1629 clobberOutputs=self.clobberOutputs, 

1630 constrainedByAllDatasets=condition, 

1631 ) 

1632 return scaffolding.makeQuantumGraph( 

1633 registry=self.registry, metadata=metadata, datastore=self.datastore 

1634 )