Coverage for python/lsst/pipe/base/graphBuilder.py: 15%

548 statements  

coverage.py v7.2.7, created at 2023-06-25 09:14 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap, defaultdict 

34from collections.abc import Collection, Iterable, Iterator, Mapping 

35from contextlib import contextmanager 

36from dataclasses import dataclass 

37from typing import Any 

38 

39from lsst.daf.butler import ( 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 NamedValueSet, 

50 Quantum, 

51 Registry, 

52) 

53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

55from lsst.daf.butler.registry.wildcards import CollectionWildcard 

56 

57# ----------------------------- 

58# Imports for other modules -- 

59# ----------------------------- 

60from . import automatic_connection_constants as acc 

61from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

62from ._status import NoWorkFound 

63from .connections import AdjustQuantumHelper, iterConnections 

64from .graph import QuantumGraph 

65from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

66 

67# ---------------------------------- 

68# Local non-exported definitions -- 

69# ---------------------------------- 

70 

71_LOG = logging.getLogger(__name__) 

72 

73 

74@dataclass 

75class _RefHolder: 

76 r"""Placeholder for `~lsst.daf.butler.DatasetRef` representing a future 

77 resolved reference. 

78 

79 Since unresolved `~lsst.daf.butler.DatasetRef`\s have been eliminated, we now use 

80 `None` to represent a reference that is yet to be resolved. Information 

81 about its corresponding dataset type and coordinate is stored in 

82 `_DatasetDict` mapping. 

83 """ 

84 

85 dataset_type: DatasetType 

86 """Dataset type of the dataset to be created later. I need to store it here 

87 rather than inferred from `_DatasetDict` because `_RefHolder` can be shared 

88 between different compatible dataset types.""" 

89 

90 ref: DatasetRef | None = None 

91 """Dataset reference, initially `None`, created when all datasets are 

92 resolved. 

93 """ 

94 

95 @property 

96 def resolved_ref(self) -> DatasetRef: 

97 """Access resolved reference, should only be called after the 

98 reference is set (`~lsst.daf.butler.DatasetRef`). 

99 """ 

100 assert self.ref is not None, "Dataset reference is not set." 

101 return self.ref 

102 

103 
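# A minimal lifecycle sketch for `_RefHolder` (illustrative only; assumes
# `dataset_type` is an existing `DatasetType`, `data_id` a matching
# `DataCoordinate`, and "some_run" a hypothetical output RUN collection name):
#
#     holder = _RefHolder(dataset_type)          # unresolved placeholder
#     assert holder.ref is None
#     holder.ref = DatasetRef(dataset_type, data_id, run="some_run")
#     ref = holder.resolved_ref                  # safe only once resolved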

104class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]): 

105 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested 

106 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that 

107 type. 

108 

109 Parameters 

110 ---------- 

111 args 

112 Positional arguments are forwarded to the `dict` constructor. 

113 universe : `~lsst.daf.butler.DimensionUniverse` 

114 Universe of all possible dimensions. 

115 """ 

116 

117 def __init__(self, *args: Any, universe: DimensionUniverse): 

118 super().__init__(*args) 

119 self.universe = universe 

120 

121 @classmethod 

122 def fromDatasetTypes( 

123 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

124 ) -> _DatasetDict: 

125 """Construct a dictionary from a flat iterable of 

126 `~lsst.daf.butler.DatasetType` keys. 

127 

128 Parameters 

129 ---------- 

130 datasetTypes : `~collections.abc.Iterable` of \ 

131 `~lsst.daf.butler.DatasetType` 

132 DatasetTypes to use as keys for the dict. Values will be empty 

133 dictionaries. 

134 universe : `~lsst.daf.butler.DimensionUniverse` 

135 Universe of all possible dimensions. 

136 

137 Returns 

138 ------- 

139 dictionary : `_DatasetDict` 

140 A new `_DatasetDict` instance. 

141 """ 

142 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

143 
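# Construction sketch (illustrative; `universe` is the repository's
# `DimensionUniverse`, `dataset_types` any iterable of `DatasetType`, and
# `some_dataset_type`/`some_data_id` stand in for one of those types and a
# matching data ID):
#
#     dd = _DatasetDict.fromDatasetTypes(dataset_types, universe=universe)
#     # Values start as empty dicts; holders are added later, e.g.:
#     dd[some_dataset_type][some_data_id] = _RefHolder(some_dataset_type)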

144 @classmethod 

145 def fromSubset( 

146 cls, 

147 datasetTypes: Collection[DatasetType], 

148 first: _DatasetDict, 

149 *rest: _DatasetDict, 

150 ) -> _DatasetDict: 

151 """Return a new dictionary by extracting items corresponding to the 

152 given keys from one or more existing dictionaries. 

153 

154 Parameters 

155 ---------- 

156 datasetTypes : `~collections.abc.Iterable` of \ 

157 `~lsst.daf.butler.DatasetType` 

158 DatasetTypes to use as keys for the dict. Values will be obtained 

159 by lookups against ``first`` and ``rest``. 

160 first : `_DatasetDict` 

161 Another dictionary from which to extract values. 

162 rest 

163 Additional dictionaries from which to extract values. 

164 

165 Returns 

166 ------- 

167 dictionary : `_DatasetDict` 

168 A new dictionary instance. 

169 """ 

170 combined = ChainMap(first, *rest) 

171 

172 # Dataset types known to match immediately can be processed 

173 # without checks. 

174 matches = combined.keys() & set(datasetTypes) 

175 _dict = {k: combined[k] for k in matches} 

176 

177 if len(_dict) < len(datasetTypes): 

178 # Work out which ones are missing. 

179 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

180 

181 # Get the known names for comparison. 

182 combined_by_name = {k.name: k for k in combined} 

183 

184 missing = set() 

185 incompatible = {} 

186 for datasetType in missing_datasetTypes: 

187 # The dataset type is not found. It may not be listed 

188 # or it may be that it is there with the same name 

189 # but different definition. 

190 if datasetType.name in combined_by_name: 

191 # This implies some inconsistency in definitions 

192 # for connections. If there is support for storage 

193 # class conversion we can let it slide. 

194 # At this point we do not know 

195 # where the inconsistency is but trust that downstream 

196 # code will be more explicit about input 

197 # vs output incompatibilities. 

198 existing = combined_by_name[datasetType.name] 

199 convertible_to_existing = existing.is_compatible_with(datasetType) 

200 convertible_from_existing = datasetType.is_compatible_with(existing) 

201 if convertible_to_existing and convertible_from_existing: 

202 _LOG.debug( 

203 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

204 datasetType.name, 

205 datasetType.storageClass_name, 

206 existing.storageClass_name, 

207 ) 

208 _dict[datasetType] = combined[existing] 

209 elif convertible_to_existing or convertible_from_existing: 

210 # We'd need to refactor a fair amount to recognize 

211 # whether this is an error or not, so I'm not going to 

212 # bother until we need to do that for other reasons 

213 # (it won't be too long). 

214 _LOG.info( 

215 "Dataset type %s is present with multiple only partially-compatible storage " 

216 "classes %s and %s.", 

217 datasetType.name, 

218 datasetType.storageClass_name, 

219 existing.storageClass_name, 

220 ) 

221 _dict[datasetType] = combined[existing] 

222 else: 

223 incompatible[datasetType] = existing 

224 else: 

225 missing.add(datasetType) 

226 

227 if missing or incompatible: 

228 reasons = [] 

229 if missing: 

230 reasons.append( 

231 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known " 

232 f"types: [{', '.join(d.name for d in combined)}]." 

233 ) 

234 if incompatible: 

235 for x, y in incompatible.items(): 

236 reasons.append(f"{x} incompatible with {y}") 

237 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

238 

239 return cls(_dict, universe=first.universe) 

240 

241 @property 

242 def dimensions(self) -> DimensionGraph: 

243 """The union of all dimensions used by all dataset types in this 

244 dictionary, including implied dependencies (`DimensionGraph`). 

245 """ 

246 base = self.universe.empty 

247 if len(self) == 0: 

248 return base 

249 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

250 

251 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]: 

252 """Unpack nested single-element `~lsst.daf.butler.DatasetRef` dicts 

253 into a new mapping with `~lsst.daf.butler.DatasetType` keys and 

254 `~lsst.daf.butler.DatasetRef` values. 

255 

256 This method assumes that each nested dictionary contains exactly one item, as is the 

257 case for all "init" datasets. 

258 

259 Parameters 

260 ---------- 

261 storage_classes : `dict` [ `str`, `str` ] 

262 Mapping from dataset type name to the storage class to use for that 

263 dataset type. These are typically the storage classes declared 

264 for a particular task, which may differ from the data repository 

265 definitions. 

266 

267 Returns 

268 ------- 

269 dictionary : `~lsst.daf.butler.NamedKeyDict` 

270 Dictionary mapping `~lsst.daf.butler.DatasetType` to 

271 `~lsst.daf.butler.DatasetRef`, with both 

272 `~lsst.daf.butler.DatasetType` instances and string names usable 

273 as keys. 

274 """ 

275 return NamedKeyDict( 

276 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()} 

277 ) 

278 

279 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

280 """Unpack nested multi-element `~lsst.daf.butler.DatasetRef` dicts into 

281 a new mapping with `~lsst.daf.butler.DatasetType` keys and `list` of 

282 `~lsst.daf.butler.DatasetRef` values. 

283 

284 Parameters 

285 ---------- 

286 storage_classes : `dict` [ `str`, `str` ] 

287 Mapping from dataset type name to the storage class to use for that 

288 dataset type. These are typically the storage classes declared 

289 for a particular task, which may differ from the data repository 

290 definitions. 

291 

292 Returns 

293 ------- 

294 dictionary : `~lsst.daf.butler.NamedKeyDict` 

295 Dictionary mapping `~lsst.daf.butler.DatasetType` to `list` of 

296 `~lsst.daf.butler.DatasetRef`, with both 

297 `~lsst.daf.butler.DatasetType` instances and string names usable 

298 as keys. 

299 """ 

300 result = {} 

301 for dataset_type, holders in self.items(): 

302 if ( 

303 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name) 

304 ) != dataset_type.storageClass_name: 

305 dataset_type = dataset_type.overrideStorageClass(override) 

306 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()] 

307 else: 

308 refs = [holder.resolved_ref for holder in holders.values()] 

309 result[dataset_type] = refs 

310 return NamedKeyDict(result) 

311 
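# Unpacking sketch (illustrative; `quantum_inputs` stands in for a populated
# `_DatasetDict` whose holders are all resolved, and `task_storage_classes`
# for the name -> storage class mapping built in `_TaskScaffolding.__init__`):
#
#     refs_by_type = quantum_inputs.unpackMultiRefs(task_storage_classes)
#     # Dataset types whose task-declared storage class differs from the
#     # repository definition come back with the DatasetType and each ref
#     # converted via overrideStorageClass().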

312 def extract( 

313 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

314 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]: 

315 """Iterate over the contained `~lsst.daf.butler.DatasetRef` instances 

316 that match the given `~lsst.daf.butler.DatasetType` and data IDs. 

317 

318 Parameters 

319 ---------- 

320 datasetType : `~lsst.daf.butler.DatasetType` 

321 Dataset type to match. 

322 dataIds : `~collections.abc.Iterable` \ 

323 [ `~lsst.daf.butler.DataCoordinate` ] 

324 Data IDs to match. 

325 

326 Returns 

327 ------- 

328 refs : `~collections.abc.Iterator` [ `~lsst.daf.butler.DatasetRef` ] 

329 DatasetRef instances for which ``ref.datasetType == datasetType`` 

330 and ``ref.dataId`` is in ``dataIds``. 

331 """ 

332 refs = self[datasetType] 

333 return ((dataId, refs[dataId].ref) for dataId in dataIds) 

334 

335 def isdisjoint(self, other: _DatasetDict) -> bool: 

336 """Test whether ``self`` and ``other`` have any datasets in common. 

337 

338 Datasets are considered in common if they have the same *parent* 

339 dataset type name and data ID; storage classes and components are not 

340 considered. 

341 """ 

342 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()} 

343 for k, v in other.items(): 

344 parent_name, _ = k.nameAndComponent() 

345 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()): 

346 return False 

347 return True 

348 
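# Conceptual sketch of the parent-name matching (hypothetical dataset types
# `a` and `b`):
#
#     # one dict holds "calexp" @ data_id, the other "calexp.wcs" @ data_id
#     a.isdisjoint(b)   # -> False: the component shares the parent name
#                       #    "calexp" and the same data ID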

349 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

350 """Iterate over all DatasetRef instances held by this data structure, 

351 assuming that each `_RefHolder` already carries a resolved ref. 

352 """ 

353 for holders_by_data_id in self.values(): 

354 for holder in holders_by_data_id.values(): 

355 yield holder.resolved_ref 

356 

357 

358class _QuantumScaffolding: 

359 """Helper class aggregating information about a `Quantum`, used when 

360 constructing a `QuantumGraph`. 

361 

362 See `_PipelineScaffolding` for a top-down description of the full 

363 scaffolding data structure. 

364 

365 Parameters 

366 ---------- 

367 task : _TaskScaffolding 

368 Back-reference to the helper object for the `PipelineTask` this quantum 

369 represents an execution of. 

370 dataId : `~lsst.daf.butler.DataCoordinate` 

371 Data ID for this quantum. 

372 """ 

373 

374 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

375 self.task = task 

376 self.dataId = dataId 

377 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

378 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

379 self.prerequisites = _DatasetDict.fromDatasetTypes( 

380 task.prerequisites.keys(), universe=dataId.universe 

381 ) 

382 

383 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

384 

385 def __repr__(self) -> str: 

386 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

387 

388 task: _TaskScaffolding 

389 """Back-reference to the helper object for the `PipelineTask` this quantum 

390 represents an execution of. 

391 """ 

392 

393 dataId: DataCoordinate 

394 """Data ID for this quantum. 

395 """ 

396 

397 inputs: _DatasetDict 

398 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` inputs to 

399 this quantum. 

400 

401 This is initialized to map each `~lsst.daf.butler.DatasetType` to an empty 

402 dictionary at construction. Those nested dictionaries are populated 

403 (with data IDs as keys) with unresolved `~lsst.daf.butler.DatasetRef` 

404 instances in `_PipelineScaffolding.connectDataIds`. 

405 """ 

406 

407 outputs: _DatasetDict 

408 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` outputs this 

409 quantum. 

410 """ 

411 

412 prerequisites: _DatasetDict 

413 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` prerequisite 

414 inputs to this quantum. 

415 """ 

416 

417 def makeQuantum(self, datastore_records: Mapping[str, DatastoreRecordData] | None = None) -> Quantum: 

418 """Transform the scaffolding object into a true `Quantum` instance. 

419 

420 Parameters 

421 ---------- 

422 datastore_records : `~collections.abc.Mapping` [ `str`, \ 

423 `~lsst.daf.butler.DatastoreRecordData` ], optional 

424 If not `None` then fill datastore records in each generated Quantum 

425 using the records from this structure. 

426 

427 Returns 

428 ------- 

429 quantum : `Quantum` 

430 An actual `Quantum` instance. 

431 """ 

432 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes) 

433 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes)) 

434 # Give the task's Connections class an opportunity to remove some 

435 # inputs, or complain if they are unacceptable. 

436 # This will raise if one of the check conditions is not met, which is 

437 # the intended behavior. 

438 # If it raises NoWorkFound, there is a bug in the QG algorithm 

439 # or the adjustQuantum is incorrectly trying to make a prerequisite 

440 # input behave like a regular input; adjustQuantum should only raise 

441 # NoWorkFound if a regular input is missing, and it shouldn't be 

442 # possible for us to have generated ``self`` if that's true. 

443 helper = AdjustQuantumHelper( 

444 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes) 

445 ) 

446 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

447 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes) 

448 quantum_records: Mapping[str, DatastoreRecordData] | None = None 

449 if datastore_records is not None: 

450 quantum_records = {} 

451 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

452 input_refs += list(initInputs.values()) 

453 input_ids = set(ref.id for ref in input_refs) 

454 for datastore_name, records in datastore_records.items(): 

455 matching_records = records.subset(input_ids) 

456 if matching_records is not None: 

457 quantum_records[datastore_name] = matching_records 

458 return Quantum( 

459 taskName=self.task.taskDef.taskName, 

460 taskClass=self.task.taskDef.taskClass, 

461 dataId=self.dataId, 

462 initInputs=initInputs, 

463 inputs=helper.inputs, 

464 outputs=helper.outputs, 

465 datastore_records=quantum_records, 

466 ) 

467 
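# Usage sketch (illustrative; `scaffolding` is a populated `_QuantumScaffolding`
# whose input/output holders have been resolved, and `records` an optional
# mapping previously obtained from `Datastore.export_records`):
#
#     quantum = scaffolding.makeQuantum(datastore_records=records)
#     # quantum.inputs / quantum.outputs reflect any adjustments applied by
#     # the task's adjustQuantum() through AdjustQuantumHelper.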

468 

469@dataclass 

470class _TaskScaffolding: 

471 """Helper class aggregating information about a `PipelineTask`, used when 

472 constructing a `QuantumGraph`. 

473 

474 See `_PipelineScaffolding` for a top-down description of the full 

475 scaffolding data structure. 

476 

477 Parameters 

478 ---------- 

479 taskDef : `TaskDef` 

480 Data structure that identifies the task class and its config. 

481 parent : `_PipelineScaffolding` 

482 The parent data structure that will hold the instance being 

483 constructed. 

484 datasetTypes : `TaskDatasetTypes` 

485 Data structure that categorizes the dataset types used by this task. 

486 """ 

487 

488 def __init__( 

489 self, 

490 taskDef: TaskDef, 

491 parent: _PipelineScaffolding, 

492 datasetTypes: TaskDatasetTypes, 

493 ): 

494 universe = parent.dimensions.universe 

495 self.taskDef = taskDef 

496 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

497 assert self.dimensions.issubset(parent.dimensions) 

498 # Initialize _DatasetDicts as subsets of the one or two 

499 # corresponding dicts in the parent _PipelineScaffolding. 

500 self.initInputs = _DatasetDict.fromSubset( 

501 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

502 ) 

503 self.initOutputs = _DatasetDict.fromSubset( 

504 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

505 ) 

506 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

507 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

508 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

509 self.dataIds: set[DataCoordinate] = set() 

510 self.quanta = {} 

511 self.storage_classes = { 

512 connection.name: connection.storageClass 

513 for connection in self.taskDef.connections.allConnections.values() 

514 } 

515 self.storage_classes[ 

516 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

517 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS 

518 self.storage_classes[ 

519 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

520 ] = acc.LOG_OUTPUT_STORAGE_CLASS 

521 self.storage_classes[ 

522 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

523 ] = acc.METADATA_OUTPUT_STORAGE_CLASS 

524 

525 def __repr__(self) -> str: 

526 # Default dataclass-injected __repr__ gets caught in an infinite loop 

527 # because of back-references. 

528 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

529 

530 taskDef: TaskDef 

531 """Data structure that identifies the task class and its config 

532 (`TaskDef`). 

533 """ 

534 

535 dimensions: DimensionGraph 

536 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

537 """ 

538 

539 initInputs: _DatasetDict 

540 """Dictionary containing information about datasets used to construct this 

541 task (`_DatasetDict`). 

542 """ 

543 

544 initOutputs: _DatasetDict 

545 """Dictionary containing information about datasets produced as a 

546 side-effect of constructing this task (`_DatasetDict`). 

547 """ 

548 

549 inputs: _DatasetDict 

550 """Dictionary containing information about datasets used as regular, 

551 graph-constraining inputs to this task (`_DatasetDict`). 

552 """ 

553 

554 outputs: _DatasetDict 

555 """Dictionary containing information about datasets produced by this task 

556 (`_DatasetDict`). 

557 """ 

558 

559 prerequisites: _DatasetDict 

560 """Dictionary containing information about input datasets that must be 

561 present in the repository before any Pipeline containing this task is run 

562 (`_DatasetDict`). 

563 """ 

564 

565 quanta: dict[DataCoordinate, _QuantumScaffolding] 

566 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

567 this task with that data ID. 

568 """ 

569 

570 storage_classes: dict[str, str] 

571 """Mapping from dataset type name to storage class declared by this task. 

572 """ 

573 

574 def makeQuantumSet( 

575 self, 

576 missing: _DatasetDict, 

577 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

578 ) -> set[Quantum]: 

579 """Create a `set` of `Quantum` from the information in ``self``. 

580 

581 Parameters 

582 ---------- 

583 missing : `_DatasetDict` 

584 Input datasets that have not been found. 

585 datastore_records : `dict` 

586 Record from the datastore to export with quanta. 

587 

588 Returns 

589 ------- 

590 nodes : `set` of `Quantum` 

591 The `Quantum` elements corresponding to this task. 

592 """ 

593 outputs = set() 

594 for q in self.quanta.values(): 

595 try: 

596 tmpQuanta = q.makeQuantum(datastore_records) 

597 outputs.add(tmpQuanta) 

598 except (NoWorkFound, FileNotFoundError) as exc: 

599 if not missing.isdisjoint(q.inputs): 

600 # This is a node that is known to be pruned later and 

601 # should be left in even though some follow up queries 

602 # fail. This allows the pruning to start from this quantum 

603 # with known issues, and prune other nodes it touches. 

604 inputs = q.inputs.unpackMultiRefs(self.storage_classes) 

605 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes)) 

606 tmpQuantum = Quantum( 

607 taskName=q.task.taskDef.taskName, 

608 taskClass=q.task.taskDef.taskClass, 

609 dataId=q.dataId, 

610 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes), 

611 inputs=inputs, 

612 outputs=q.outputs.unpackMultiRefs(self.storage_classes), 

613 ) 

614 outputs.add(tmpQuantum) 

615 else: 

616 raise exc 

617 return outputs 

618 

619 

620class _DatasetIdMaker: 

621 """Helper class which generates random dataset UUIDs for unresolved 

622 datasets. 

623 """ 

624 

625 def __init__(self, run: str): 

626 self.run = run 

627 # Cache of dataset refs generated so far. 

628 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

629 

630 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef: 

631 # For components we need their parent dataset ID. 

632 if dataset_type.isComponent(): 

633 parent_type = dataset_type.makeCompositeDatasetType() 

634 # Parent should be resolved if this is an existing input, or it 

635 # should be in the cache already if it is an intermediate. 

636 key = parent_type, data_id 

637 if key not in self.resolved: 

638 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}") 

639 parent_ref = self.resolved[key] 

640 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False) 

641 

642 key = dataset_type, data_id 

643 if (resolved := self.resolved.get(key)) is None: 

644 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False) 

645 self.resolved[key] = resolved 

646 return resolved 

647 

648 def resolveDict( 

649 self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder], is_output: bool 

650 ) -> None: 

651 """Resolve all unresolved references in the provided dictionary.""" 

652 for data_id, holder in refs.items(): 

653 if holder.ref is None or (is_output and holder.ref.run != self.run): 

654 holder.ref = self.resolveRef(holder.dataset_type, data_id) 

655 
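# Usage sketch (illustrative; `dataset_type` and `refs` stand in for one key
# and its dict[DataCoordinate, _RefHolder] value of a `_DatasetDict`, and
# "u/user/run" for the output RUN collection name):
#
#     maker = _DatasetIdMaker(run="u/user/run")
#     maker.resolveDict(dataset_type, refs, is_output=True)
#     # Every holder now carries a resolved DatasetRef in that run; component
#     # dataset types reuse their parent's dataset ID.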

656 

657@dataclass 

658class _PipelineScaffolding: 

659 """A helper data structure that organizes the information involved in 

660 constructing a `QuantumGraph` for a `Pipeline`. 

661 

662 Parameters 

663 ---------- 

664 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

665 Sequence of tasks from which a graph is to be constructed. Must 

666 have nested task classes already imported. 

667 universe : `~lsst.daf.butler.DimensionUniverse` 

668 Universe of all possible dimensions. 

669 

670 Notes 

671 ----- 

672 The scaffolding data structure contains nested data structures for both 

673 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

674 data structures are shared between the pipeline-level structure (which 

675 aggregates all datasets and categorizes them from the perspective of the 

676 complete pipeline) and the individual tasks that use them as inputs and 

677 outputs. 

678 

679 `QuantumGraph` construction proceeds in four steps, with each corresponding 

680 to a different `_PipelineScaffolding` method: 

681 

682 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

683 the DatasetTypes used by the pipeline (delegating to 

684 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

685 nested `_TaskScaffolding` and `_DatasetDict` objects. 

686 

687 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

688 returns related tuples of all dimensions used to identify any regular 

689 input, output, and intermediate datasets (not prerequisites). We then 

690 iterate over these tuples of related dimensions, identifying the subsets 

691 that correspond to distinct data IDs for each task and dataset type, 

692 and then create `_QuantumScaffolding` objects. 

693 

694 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

695 dataset data IDs previously identified, transforming unresolved 

696 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

697 up prerequisite datasets for all quanta. 

698 

699 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

700 per-task `_QuantumScaffolding` objects. 

701 """ 

702 
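# The four steps above are typically driven as in this sketch (illustrative;
# `pipeline`, `registry`, `collections`, `run`, `where`, and `data_id` are
# assumed to be supplied by the caller, e.g. GraphBuilder.makeGraph):
#
#     scaffolding = _PipelineScaffolding(pipeline, registry=registry)
#     with scaffolding.connectDataIds(
#         registry, collections, userQuery=where, externalDataId=data_id
#     ) as commonDataIds:
#         scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
#     qgraph = scaffolding.makeQuantumGraph(registry)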

703 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry): 

704 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

705 self.tasks = [] 

706 # Aggregate and categorize the DatasetTypes in the Pipeline. 

707 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

708 # Construct dictionaries that map those DatasetTypes to structures 

709 # that will (later) hold additional information about them. 

710 for attr in ( 

711 "initInputs", 

712 "initIntermediates", 

713 "initOutputs", 

714 "inputs", 

715 "intermediates", 

716 "outputs", 

717 "prerequisites", 

718 ): 

719 setattr( 

720 self, 

721 attr, 

722 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

723 ) 

724 self.missing = _DatasetDict(universe=registry.dimensions) 

725 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

726 # Aggregate all dimensions for all non-init, non-prerequisite 

727 # DatasetTypes. These are the ones we'll include in the big join 

728 # query. 

729 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

730 # Construct scaffolding nodes for each Task, and add backreferences 

731 # to the Task from each DatasetScaffolding node. 

732 # Note that there's only one scaffolding node for each DatasetType, 

733 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

734 # reference it. 

735 if isinstance(pipeline, Pipeline): 

736 pipeline = pipeline.toExpandedPipeline() 

737 self.tasks = [ 

738 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

739 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

740 ] 

741 

742 def __repr__(self) -> str: 

743 # Default dataclass-injected __repr__ gets caught in an infinite loop 

744 # because of back-references. 

745 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

746 

747 tasks: list[_TaskScaffolding] 

748 """Scaffolding data structures for each task in the pipeline 

749 (`list` of `_TaskScaffolding`). 

750 """ 

751 

752 initInputs: _DatasetDict 

753 """Datasets consumed but not produced when constructing the tasks in this 

754 pipeline (`_DatasetDict`). 

755 """ 

756 

757 initIntermediates: _DatasetDict 

758 """Datasets that are both consumed and produced when constructing the tasks 

759 in this pipeline (`_DatasetDict`). 

760 """ 

761 

762 initOutputs: _DatasetDict 

763 """Datasets produced but not consumed when constructing the tasks in this 

764 pipeline (`_DatasetDict`). 

765 """ 

766 

767 inputs: _DatasetDict 

768 """Datasets that are consumed but not produced when running this pipeline 

769 (`_DatasetDict`). 

770 """ 

771 

772 intermediates: _DatasetDict 

773 """Datasets that are both produced and consumed when running this pipeline 

774 (`_DatasetDict`). 

775 """ 

776 

777 outputs: _DatasetDict 

778 """Datasets produced but not consumed when when running this pipeline 

779 (`_DatasetDict`). 

780 """ 

781 

782 prerequisites: _DatasetDict 

783 """Datasets that are consumed when running this pipeline and looked up 

784 per-Quantum when generating the graph (`_DatasetDict`). 

785 """ 

786 

787 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

788 """Datasets that should be used as constraints in the initial query, 

789 according to tasks (`~lsst.daf.butler.NamedValueSet`). 

790 """ 

791 

792 dimensions: DimensionGraph 

793 """All dimensions used by any regular input, intermediate, or output 

794 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

795 Query" (`~lsst.daf.butler.DimensionGraph`). 

796 

797 This is required to be a superset of all task quantum dimensions. 

798 """ 

799 

800 missing: _DatasetDict 

801 """Datasets whose existence was originally predicted but were not 

802 actually found. 

803 

804 Quanta that require these datasets as inputs will be pruned (recursively) 

805 when actually constructing a `QuantumGraph` object. 

806 

807 These are currently populated only when the "initial dataset query 

808 constraint" does not include all overall-input dataset types, and hence the 

809 initial data ID query can include data IDs that it should not. 

810 """ 

811 

812 globalInitOutputs: _DatasetDict | None = None 

813 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

814 """ 

815 

816 @contextmanager 

817 def connectDataIds( 

818 self, 

819 registry: Registry, 

820 collections: Any, 

821 userQuery: str | None, 

822 externalDataId: DataCoordinate, 

823 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

824 bind: Mapping[str, Any] | None = None, 

825 ) -> Iterator[DataCoordinateQueryResults]: 

826 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

827 

828 This method populates `_TaskScaffolding.dataIds` and 

829 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

830 

831 Parameters 

832 ---------- 

833 registry : `lsst.daf.butler.Registry` 

834 Registry for the data repository; used for all data ID queries. 

835 collections 

836 Expressions representing the collections to search for input 

837 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

838 userQuery : `str` or `None` 

839 User-provided expression to limit the data IDs processed. 

840 externalDataId : `~lsst.daf.butler.DataCoordinate` 

841 Externally-provided data ID that should be used to restrict the 

842 results, just as if these constraints had been included via ``AND`` 

843 in ``userQuery``. This includes (at least) any instrument named 

844 in the pipeline definition. 

845 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

846 The query constraint variant that should be used to constrain the 

847 query based on dataset existence, defaults to 

848 `DatasetQueryConstraintVariant.ALL`. 

849 bind : `~collections.abc.Mapping`, optional 

850 Mapping containing literal values that should be injected into the 

851 ``userQuery`` expression, keyed by the identifiers they replace. 

852 

853 Returns 

854 ------- 

855 commonDataIds : \ 

856 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

857 An interface to a database temporary table containing all data IDs 

858 that will appear in this `QuantumGraph`. Returned inside a 

859 context manager, which will drop the temporary table at the end of 

860 the `with` block in which this method is called. 

861 """ 

862 _LOG.debug("Building query for data IDs.") 

863 # Initialization datasets always have empty data IDs. 

864 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

865 for datasetType, refs in itertools.chain( 

866 self.initInputs.items(), 

867 self.initIntermediates.items(), 

868 self.initOutputs.items(), 

869 ): 

870 refs[emptyDataId] = _RefHolder(datasetType) 

871 # Run one big query for the data IDs for task dimensions and regular 

872 # inputs and outputs. We limit the query to only dimensions that are 

873 # associated with the input dataset types, but don't (yet) try to 

874 # obtain the dataset_ids for those inputs. 

875 _LOG.debug( 

876 "Submitting data ID query over dimensions %s and materializing results.", 

877 list(self.dimensions.names), 

878 ) 

879 queryArgs: dict[str, Any] = { 

880 "dimensions": self.dimensions, 

881 "where": userQuery, 

882 "dataId": externalDataId, 

883 "bind": bind, 

884 } 

885 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

886 _LOG.debug( 

887 "Constraining graph query using default of %s.", 

888 list(self.defaultDatasetQueryConstraints.names), 

889 ) 

890 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

891 queryArgs["collections"] = collections 

892 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

893 _LOG.debug("Not using dataset existence to constrain query.") 

894 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

895 constraint = set(datasetQueryConstraint) 

896 inputs = {k.name: k for k in self.inputs.keys()} 

897 if remainder := constraint.difference(inputs.keys()): 

898 raise ValueError( 

899 f"{remainder} dataset type(s) specified as a graph constraint, but" 

900 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

901 ) 

902 _LOG.debug("Constraining graph query using %s", constraint) 

903 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

904 queryArgs["collections"] = collections 

905 else: 

906 raise ValueError( 

907 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

908 ) 

909 

910 if "datasets" in queryArgs: 

911 for i, dataset_type in enumerate(queryArgs["datasets"]): 

912 if dataset_type.isComponent(): 

913 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

914 

915 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

916 _LOG.debug("Expanding data IDs.") 

917 commonDataIds = commonDataIds.expanded() 

918 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

919 # Iterate over query results, populating data IDs for datasets and 

920 # quanta and then connecting them to each other. 

921 n = -1 

922 for n, commonDataId in enumerate(commonDataIds): 

923 # Create DatasetRefs for all DatasetTypes from this result row, 

924 # noting that we might have created some already. 

925 # We remember both those that already existed and those that we 

926 # create now. 

927 refsForRow = {} 

928 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {} 

929 for datasetType, refs in itertools.chain( 

930 self.inputs.items(), 

931 self.intermediates.items(), 

932 self.outputs.items(), 

933 ): 

934 datasetDataId: DataCoordinate | None 

935 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

936 datasetDataId = commonDataId.subset(datasetType.dimensions) 

937 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

938 ref_holder = refs.get(datasetDataId) 

939 if ref_holder is None: 

940 ref_holder = _RefHolder(datasetType) 

941 refs[datasetDataId] = ref_holder 

942 refsForRow[datasetType.name] = ref_holder 

943 # Create _QuantumScaffolding objects for all tasks from this 

944 # result row, noting that we might have created some already. 

945 for task in self.tasks: 

946 quantumDataId = commonDataId.subset(task.dimensions) 

947 quantum = task.quanta.get(quantumDataId) 

948 if quantum is None: 

949 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

950 task.quanta[quantumDataId] = quantum 

951 # Whether this is a new quantum or an existing one, we can 

952 # now associate the DatasetRefs for this row with it. The 

953 # fact that a Quantum data ID and a dataset data ID both 

954 # came from the same result row is what tells us they 

955 # should be associated. 

956 # Many of these associations will be duplicates (because 

957 # another query row that differed from this one only in 

958 # irrelevant dimensions already added them), and we use 

959 # sets to skip. 

960 for datasetType in task.inputs: 

961 dataId = dataIdCacheForRow[datasetType.dimensions] 

962 ref_holder = refsForRow[datasetType.name] 

963 quantum.inputs[datasetType.name][dataId] = ref_holder 

964 for datasetType in task.outputs: 

965 dataId = dataIdCacheForRow[datasetType.dimensions] 

966 ref_holder = refsForRow[datasetType.name] 

967 quantum.outputs[datasetType.name][dataId] = ref_holder 

968 if n < 0: 

969 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

970 emptiness_explained = False 

971 for message in commonDataIds.explain_no_results(): 

972 _LOG.critical(message) 

973 emptiness_explained = True 

974 if not emptiness_explained: 

975 _LOG.critical( 

976 "To reproduce this query for debugging purposes, run " 

977 "Registry.queryDataIds with these arguments:" 

978 ) 

979 # We could just repr() the queryArgs dict to get something 

980 # the user could make sense of, but it's friendlier to 

981 # put these args in an easier-to-construct equivalent form 

982 # so they can read it more easily and copy and paste into 

983 # a Python terminal. 

984 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

985 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

986 if queryArgs["where"]: 

987 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

988 if "datasets" in queryArgs: 

989 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

990 if "collections" in queryArgs: 

991 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

992 _LOG.debug("Finished processing %d rows from data ID query.", n) 

993 yield commonDataIds 

994 

995 def resolveDatasetRefs( 

996 self, 

997 registry: Registry, 

998 collections: Any, 

999 run: str, 

1000 commonDataIds: DataCoordinateQueryResults, 

1001 *, 

1002 skipExistingIn: Any = None, 

1003 clobberOutputs: bool = True, 

1004 constrainedByAllDatasets: bool = True, 

1005 ) -> None: 

1006 """Perform follow up queries for each dataset data ID produced in 

1007 `connectDataIds`. 

1008 

1009 This method populates `_DatasetScaffolding.refs` (except for those in 

1010 `prerequisites`). 

1011 

1012 Parameters 

1013 ---------- 

1014 registry : `lsst.daf.butler.Registry` 

1015 Registry for the data repository; used for all data ID queries. 

1016 collections 

1017 Expressions representing the collections to search for input 

1018 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1019 run : `str` 

1020 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1021 output datasets, if it already exists. 

1022 commonDataIds : \ 

1023 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

1024 Result of a previous call to `connectDataIds`. 

1025 skipExistingIn 

1026 Expressions representing the collections to search for existing 

1027 output datasets that should be skipped. See 

1028 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

1029 `None` or empty string/sequence disables skipping. 

1030 clobberOutputs : `bool`, optional 

1031 If `True` (default), allow quanta to be created even if outputs exist; 

1032 this requires the same behavior to be enabled when 

1033 executing. If ``skipExistingIn`` is not `None`, completed quanta 

1034 (those with metadata, or all outputs if there is no metadata 

1035 dataset configured) will be skipped rather than clobbered. 

1036 constrainedByAllDatasets : `bool`, optional 

1037 Indicates if the commonDataIds were generated with a constraint on 

1038 all dataset types. 

1039 

1040 Raises 

1041 ------ 

1042 OutputExistsError 

1043 Raised if an output dataset already exists in the output run 

1044 and ``skipExistingIn`` does not include output run, or if only 

1045 some outputs are present and ``clobberOutputs`` is `False`. 

1046 """ 

1047 # Run may be provided but it does not have to exist; in that case we 

1048 # use it for resolving references but don't check it for existing refs. 

1049 run_exists = False 

1050 if run: 

1051 try: 

1052 run_exists = bool(registry.queryCollections(run)) 

1053 except MissingCollectionError: 

1054 # An undocumented exception is raised if it does not exist. 

1055 pass 

1056 

1057 skip_collections_wildcard: CollectionWildcard | None = None 

1058 skipExistingInRun = False 

1059 if skipExistingIn: 

1060 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

1061 if run_exists: 

1062 # As an optimization, check the explicit list of names first. 

1063 skipExistingInRun = run in skip_collections_wildcard.strings 

1064 if not skipExistingInRun: 

1065 # need to flatten it and check again 

1066 skipExistingInRun = run in registry.queryCollections( 

1067 skipExistingIn, 

1068 collectionTypes=CollectionType.RUN, 

1069 ) 

1070 

1071 idMaker = _DatasetIdMaker(run) 

1072 

1073 resolvedRefQueryResults: Iterable[DatasetRef] 

1074 

1075 # Updating constrainedByAllDatasets here is not ideal, but we have a 

1076 # few different code paths that each transfer different pieces of 

1077 # information about what dataset query constraints were applied here, 

1078 # and none of them has the complete picture until we get here. We're 

1079 # long overdue for a QG generation rewrite that will make this go away 

1080 # entirely anyway. 

1081 constrainedByAllDatasets = ( 

1082 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

1083 ) 

1084 

1085 # Look up [init] intermediate and output datasets in the output 

1086 # collection, if there is an output collection. 

1087 if run_exists or skip_collections_wildcard is not None: 

1088 for datasetType, refs in itertools.chain( 

1089 self.initIntermediates.items(), 

1090 self.initOutputs.items(), 

1091 self.intermediates.items(), 

1092 self.outputs.items(), 

1093 ): 

1094 _LOG.debug( 

1095 "Resolving %d datasets for intermediate and/or output dataset %s.", 

1096 len(refs), 

1097 datasetType.name, 

1098 ) 

1099 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

1100 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

1101 # TODO: this assert incorrectly bans component inputs; 

1102 # investigate on DM-33027. 

1103 # assert not datasetType.isComponent(), \ 

1104 # "Output datasets cannot be components." 

1105 # 

1106 # Instead we have to handle them manually to avoid a 

1107 # deprecation warning, but it is at least confusing and 

1108 # possibly a bug for components to appear here at all. 

1109 if datasetType.isComponent(): 

1110 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1111 component = datasetType.component() 

1112 else: 

1113 parent_dataset_type = datasetType 

1114 component = None 

1115 

1116 # look at RUN collection first 

1117 if run_exists: 

1118 try: 

1119 resolvedRefQueryResults = subset.findDatasets( 

1120 parent_dataset_type, collections=run, findFirst=True 

1121 ) 

1122 except MissingDatasetTypeError: 

1123 resolvedRefQueryResults = [] 

1124 for resolvedRef in resolvedRefQueryResults: 

1125 # TODO: we could easily support per-DatasetType 

1126 # skipExisting and I could imagine that being useful - 

1127 # it's probably required in order to support writing 

1128 # initOutputs before QuantumGraph generation. 

1129 assert resolvedRef.dataId in refs 

1130 if not (skipExistingInRun or isInit or clobberOutputs): 

1131 raise OutputExistsError( 

1132 f"Output dataset {datasetType.name} already exists in " 

1133 f"output RUN collection '{run}' with data ID" 

1134 f" {resolvedRef.dataId}." 

1135 ) 

1136 # To resolve all outputs we have to remember existing 

1137 # ones to avoid generating new dataset IDs for them. 

1138 refs[resolvedRef.dataId].ref = ( 

1139 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1140 ) 

1141 

1142 # Also check skipExistingIn; the case where the RUN collection is 

1143 # in it is handled above. 

1144 if skip_collections_wildcard is not None: 

1145 try: 

1146 resolvedRefQueryResults = subset.findDatasets( 

1147 parent_dataset_type, 

1148 collections=skip_collections_wildcard, 

1149 findFirst=True, 

1150 ) 

1151 except MissingDatasetTypeError: 

1152 resolvedRefQueryResults = [] 

1153 for resolvedRef in resolvedRefQueryResults: 

1154 if resolvedRef.dataId not in refs: 

1155 continue 

1156 refs[resolvedRef.dataId].ref = ( 

1157 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1158 ) 

1159 

1160 # Look up input and initInput datasets in the input collection(s). We 

1161 # accumulate datasets in self.missing, if the common data IDs were not 

1162 # constrained on dataset type existence. 

1163 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1164 _LOG.debug( 

1165 "Resolving %d datasets for input dataset %s.", 

1166 len(refs), 

1167 datasetType.name, 

1168 ) 

1169 if datasetType.isComponent(): 

1170 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1171 component = datasetType.component() 

1172 else: 

1173 parent_dataset_type = datasetType 

1174 component = None 

1175 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {} 

1176 try: 

1177 resolvedRefQueryResults = commonDataIds.subset( 

1178 datasetType.dimensions, unique=True 

1179 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1180 except MissingDatasetTypeError: 

1181 resolvedRefQueryResults = [] 

1182 dataIdsNotFoundYet = set(refs.keys()) 

1183 for resolvedRef in resolvedRefQueryResults: 

1184 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1185 if resolvedRef.dataId not in refs: 

1186 continue 

1187 refs[resolvedRef.dataId].ref = ( 

1188 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1189 ) 

1190 if dataIdsNotFoundYet: 

1191 if constrainedByAllDatasets: 

1192 raise RuntimeError( 

1193 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1194 f"'{datasetType.name}' was/were present in a previous " 

1195 "query, but could not be found now. " 

1196 "This is either a logic bug in QuantumGraph generation " 

1197 "or the input collections have been modified since " 

1198 "QuantumGraph generation began." 

1199 ) 

1200 elif not datasetType.dimensions: 

1201 raise RuntimeError( 

1202 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1203 f"collections {collections}." 

1204 ) 

1205 else: 

1206 # If the common dataIds were not constrained using all the 

1207 # input dataset types, it is possible that some data ids 

1208 # found don't correspond to existing datasets. Mark these 

1209 # for later pruning from the quantum graph. 

1210 for k in dataIdsNotFoundYet: 

1211 missing_for_dataset_type[k] = refs[k] 

1212 if missing_for_dataset_type: 

1213 self.missing[datasetType] = missing_for_dataset_type 

1214 

1215 # Resolve the missing refs, just so they look like all of the others; 

1216 # in the end other code will make sure they never appear in the QG. 

1217 for dataset_type, refDict in self.missing.items(): 

1218 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1219 

1220 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1221 # replacing the unresolved refs there, and then look up prerequisites. 

1222 for task in self.tasks: 

1223 _LOG.debug( 

1224 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1225 len(task.quanta), 

1226 task.taskDef.label, 

1227 ) 

1228 # The way iterConnections is designed makes it impossible to 

1229 # annotate precisely enough to satisfy MyPy here. 

1230 lookupFunctions = { 

1231 c.name: c.lookupFunction # type: ignore 

1232 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1233 if c.lookupFunction is not None # type: ignore 

1234 } 

1235 dataIdsFailed = [] 

1236 dataIdsSucceeded = [] 

1237 for quantum in task.quanta.values(): 

1238 # Process output datasets only if skipExistingIn is not None 

1239 # or there is a run to look for outputs in and clobberOutputs 

1240 # is True. Note that if skipExistingIn is None, any output 

1241 # datasets that already exist would have already caused an 

1242 # exception to be raised. 

1243 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1244 resolvedRefs = [] 

1245 unresolvedDataIds = [] 

1246 haveMetadata = False 

1247 for datasetType, originalRefs in quantum.outputs.items(): 

1248 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1249 if ref is not None: 

1250 resolvedRefs.append(ref) 

1251 originalRefs[dataId].ref = ref 

1252 if datasetType.name == task.taskDef.metadataDatasetName: 

1253 haveMetadata = True 

1254 else: 

1255 unresolvedDataIds.append((datasetType, dataId)) 

1256 if resolvedRefs: 

1257 if haveMetadata or not unresolvedDataIds: 

1258 dataIdsSucceeded.append(quantum.dataId) 

1259 if skip_collections_wildcard is not None: 

1260 continue 

1261 else: 

1262 dataIdsFailed.append(quantum.dataId) 

1263 if not clobberOutputs and run_exists: 

1264 raise OutputExistsError( 

1265 f"Quantum {quantum.dataId} of task with label " 

1266 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1267 f"({resolvedRefs}) " 

1268 f"and others that don't ({unresolvedDataIds}), with no metadata output, " 

1269 "and clobbering outputs was not enabled." 

1270 ) 

1271 # Update the input DatasetRefs to the resolved ones we already 

1272 # searched for. 

1273 for datasetType, input_refs in quantum.inputs.items(): 

1274 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()): 

1275 input_refs[data_id].ref = ref 

1276 # Look up prerequisite datasets in the input collection(s). 

1277 # These may have dimensions that extend beyond those we queried 

1278 # for originally, because we want to permit those data ID 

1279 # values to differ across quanta and dataset types. 

1280 for datasetType in task.prerequisites: 

1281 if datasetType.isComponent(): 

1282 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1283 component = datasetType.component() 

1284 else: 

1285 parent_dataset_type = datasetType 

1286 component = None 

1287 lookupFunction = lookupFunctions.get(datasetType.name) 

1288 if lookupFunction is not None: 

1289 # PipelineTask has provided its own function to do the 

1290 # lookup. This always takes precedence. 

1291 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1292 elif ( 

1293 datasetType.isCalibration() 

1294 and datasetType.dimensions <= quantum.dataId.graph 

1295 and quantum.dataId.graph.temporal 

1296 ): 

1297 # This is a master calibration lookup, which we have to 

1298 # handle specially because the query system can't do a 

1299 # temporal join on a non-dimension-based timespan yet. 

1300 timespan = quantum.dataId.timespan 

1301 try: 

1302 prereq_ref = registry.findDataset( 

1303 parent_dataset_type, 

1304 quantum.dataId, 

1305 collections=collections, 

1306 timespan=timespan, 

1307 ) 

1308 if prereq_ref is not None: 

1309 if component is not None: 

1310 prereq_ref = prereq_ref.makeComponentRef(component) 

1311 prereq_refs = [prereq_ref] 

1312 else: 

1313 prereq_refs = [] 

1314 except (KeyError, MissingDatasetTypeError): 

1315 # This dataset type is not present in the registry, 

1316 # which just means there are no datasets here. 

1317 prereq_refs = [] 

1318 else: 

1319 # Most general case. 

1320 prereq_refs = [ 

1321 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1322 for prereq_ref in registry.queryDatasets( 

1323 parent_dataset_type, 

1324 collections=collections, 

1325 dataId=quantum.dataId, 

1326 findFirst=True, 

1327 ).expanded() 

1328 ] 

1329 

1330 for ref in prereq_refs: 

1331 if ref is not None: 

1332 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1333 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1334 

1335 # Resolve all quantum inputs and outputs. 

1336 for dataset_type, refDict in quantum.inputs.items(): 

1337 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1338 for dataset_type, refDict in quantum.outputs.items(): 

1339 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1340 

1341 # Resolve task initInputs and initOutputs. 

1342 for dataset_type, refDict in task.initInputs.items(): 

1343 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1344 for dataset_type, refDict in task.initOutputs.items(): 

1345 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1346 

1347 # Actually remove any quanta that we decided to skip above. 

1348 if dataIdsSucceeded: 

1349 if skip_collections_wildcard is not None: 

1350 _LOG.debug( 

1351 "Pruning successful %d quanta for task with label '%s' because all of their " 

1352 "outputs exist or metadata was written successfully.", 

1353 len(dataIdsSucceeded), 

1354 task.taskDef.label, 

1355 ) 

1356 for dataId in dataIdsSucceeded: 

1357 del task.quanta[dataId] 

1358 elif clobberOutputs and run_exists: 

1359 _LOG.info( 

1360 "Found %d successful quanta for task with label '%s' " 

1361 "that will need to be clobbered during execution.", 

1362 len(dataIdsSucceeded), 

1363 task.taskDef.label, 

1364 ) 

1365 if dataIdsFailed: 

1366 if clobberOutputs and run_exists: 

1367 _LOG.info( 

1368 "Found %d failed/incomplete quanta for task with label '%s' " 

1369 "that will need to be clobbered during execution.", 

1370 len(dataIdsFailed), 

1371 task.taskDef.label, 

1372 ) 

1373 

1374 # Collect initOutputs that do not belong to any task. 

1375 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1376 for task in self.tasks: 

1377 global_dataset_types -= set(task.initOutputs) 

1378 if global_dataset_types: 

1379 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1380 for dataset_type, refDict in self.globalInitOutputs.items(): 

1381 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1382 

1383 def makeQuantumGraph( 

1384 self, 

1385 registry: Registry, 

1386 metadata: Mapping[str, Any] | None = None, 

1387 datastore: Datastore | None = None, 

1388 ) -> QuantumGraph: 

1389 """Create a `QuantumGraph` from the quanta already present in 

1390 the scaffolding data structure. 

1391 

1392 Parameters 

1393 ---------- 

1394 registry : `lsst.daf.butler.Registry` 

1395 Registry for the data repository; used for all data ID queries. 

1396 metadata : `~collections.abc.Mapping` of `str` to primitives, optional 

1397 This is extra data to carry with the graph. Entries in this 

1398 mapping should be serializable to 

1399 JSON. 

1400 datastore : `~lsst.daf.butler.Datastore`, optional 

1401 If not `None`, datastore records are filled in each generated 

1402 Quantum. 

1403 

1404 Returns 

1405 ------- 

1406 graph : `QuantumGraph` 

1407 The full `QuantumGraph`. 
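
Examples
--------
A minimal, hypothetical sketch; ``scaffolding`` and ``butler`` are
illustrative names, not part of this module::

    qgraph = scaffolding.makeQuantumGraph(
        registry=butler.registry,
        metadata={"comment": "example"},
        datastore=butler.datastore,
    )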

1408 """ 

1409 

1410 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1411 """Extract all DatasetRefs from the dictionaries""" 

1412 for ref_dict in dataset_dict.values(): 

1413 for holder in ref_dict.values(): 

1414 yield holder.resolved_ref 

1415 

1416 datastore_records: Mapping[str, DatastoreRecordData] | None = None 

1417 if datastore is not None: 
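# (Descriptive note: only datasets that already exist, i.e. overall inputs,
# init-inputs, and prerequisites, can have datastore records; outputs are
# not created yet, so they are not part of this export.)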

1418 datastore_records = datastore.export_records( 

1419 itertools.chain( 

1420 _make_refs(self.inputs), 

1421 _make_refs(self.initInputs), 

1422 _make_refs(self.prerequisites), 

1423 ) 

1424 ) 

1425 

1426 graphInput: dict[TaskDef, set[Quantum]] = {} 

1427 for task in self.tasks: 

1428 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records) 

1429 graphInput[task.taskDef] = qset 

1430 

1431 taskInitInputs = { 

1432 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values() 

1433 for task in self.tasks 

1434 } 

1435 taskInitOutputs = { 

1436 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values() 

1437 for task in self.tasks 

1438 } 

1439 

1440 globalInitOutputs: list[DatasetRef] = [] 

1441 if self.globalInitOutputs is not None: 

1442 for refs_dict in self.globalInitOutputs.values(): 

1443 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values()) 

1444 

1445 graph = QuantumGraph( 

1446 graphInput, 

1447 metadata=metadata, 

1448 pruneRefs=list(self.missing.iter_resolved_refs()), 

1449 universe=self.dimensions.universe, 

1450 initInputs=taskInitInputs, 

1451 initOutputs=taskInitOutputs, 

1452 globalInitOutputs=globalInitOutputs, 

1453 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1454 ) 

1455 return graph 

1456 

1457 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1458 """Make a list of all dataset types used by a graph as defined in 

1459 the registry. 

1460 """ 

1461 chain = [ 

1462 self.initInputs, 

1463 self.initIntermediates, 

1464 self.initOutputs, 

1465 self.inputs, 

1466 self.intermediates, 

1467 self.outputs, 

1468 self.prerequisites, 

1469 ] 

1470 if self.globalInitOutputs is not None: 

1471 chain.append(self.globalInitOutputs) 

1472 

1473 # Collect names of all dataset types. 

1474 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain)) 

1475 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1476 

1477 # Check for types that do not exist in registry yet: 

1478 # - inputs must exist 

1479 # - intermediates and outputs may not exist, but there must not be 

1480 # more than one definition (e.g. differing in storage class) 

1481 # - prerequisites may not exist; treat them the same as outputs here 
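# For example (illustrative names only): if two tasks declare a dataset type
# "new_catalog" that is not registered yet, one with storage class
# "DataFrame" and the other with "ArrowAstropy", both definitions are
# collected under the same name and the check further below raises
# ValueError.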

1482 for dstype in itertools.chain(self.initInputs, self.inputs): 

1483 if dstype.name not in dataset_types: 

1484 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1485 

1486 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1487 chain = [ 

1488 self.initIntermediates, 

1489 self.initOutputs, 

1490 self.intermediates, 

1491 self.outputs, 

1492 self.prerequisites, 

1493 ] 

1494 if self.globalInitOutputs is not None: 

1495 chain.append(self.globalInitOutputs) 

1496 for dstype in itertools.chain(*chain): 

1497 if dstype.name not in dataset_types: 

1498 new_outputs[dstype.name].add(dstype) 

1499 for name, dstypes in new_outputs.items(): 

1500 if len(dstypes) > 1: 

1501 raise ValueError( 

1502 "Pipeline contains multiple definitions for a dataset type " 

1503 f"which is not defined in registry yet: {dstypes}" 

1504 ) 

1505 elif len(dstypes) == 1: 

1506 dataset_types[name] = dstypes.pop() 

1507 

1508 return dataset_types.values() 

1509 

1510 

1511# ------------------------ 

1512# Exported definitions -- 

1513# ------------------------ 

1514 

1515 

1516class GraphBuilderError(Exception): 

1517 """Base class for exceptions generated by graph builder.""" 

1518 

1519 pass 

1520 

1521 

1522class OutputExistsError(GraphBuilderError): 

1523 """Exception generated when output datasets already exist.""" 

1524 

1525 pass 

1526 

1527 

1528class PrerequisiteMissingError(GraphBuilderError): 

1529 """Exception generated when a prerequisite dataset does not exist.""" 

1530 

1531 pass 

1532 

1533 

1534class GraphBuilder: 

1535 """GraphBuilder class is responsible for building task execution graph from 

1536 a Pipeline. 

1537 

1538 Parameters 

1539 ---------- 

1540 registry : `~lsst.daf.butler.Registry` 

1541 Registry for the data repository. 

1542 skipExistingIn 

1543 Expressions representing the collections to search for existing 

1544 output datasets that should be skipped. See 

1545 :ref:`daf_butler_ordered_collection_searches`. 

1546 clobberOutputs : `bool`, optional 

1547 If `True` (default), allow quanta to be created even if partial outputs 

1548 exist; this requires the same behavior to be enabled when 

1549 executing. 

1550 datastore : `~lsst.daf.butler.Datastore`, optional 

1551 If not `None`, datastore records are filled in each generated Quantum. 
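
Examples
--------
A hypothetical construction sketch; ``butler`` is an illustrative
`~lsst.daf.butler.Butler` instance, not defined in this module::

    builder = GraphBuilder(
        butler.registry,
        skipExistingIn=None,
        clobberOutputs=True,
        datastore=butler.datastore,
    )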

1552 """ 

1553 

1554 def __init__( 

1555 self, 

1556 registry: Registry, 

1557 skipExistingIn: Any = None, 

1558 clobberOutputs: bool = True, 

1559 datastore: Datastore | None = None, 

1560 ): 

1561 self.registry = registry 

1562 self.dimensions = registry.dimensions 

1563 self.skipExistingIn = skipExistingIn 

1564 self.clobberOutputs = clobberOutputs 

1565 self.datastore = datastore 

1566 

1567 def makeGraph( 

1568 self, 

1569 pipeline: Pipeline | Iterable[TaskDef], 

1570 collections: Any, 

1571 run: str, 

1572 userQuery: str | None, 

1573 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1574 metadata: Mapping[str, Any] | None = None, 

1575 bind: Mapping[str, Any] | None = None, 

1576 dataId: DataCoordinate | None = None, 

1577 ) -> QuantumGraph: 

1578 """Create execution graph for a pipeline. 

1579 

1580 Parameters 

1581 ---------- 

1582 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1583 Pipeline definition, task names/classes and their configs. 

1584 collections 

1585 Expressions representing the collections to search for input 

1586 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1587 run : `str` 

1588 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1589 output datasets. The collection does not have to exist; it will be 

1590 created when the graph is executed. 

1591 userQuery : `str` 

1592 String expression defining a user-defined selection for the registry; 

1593 should be empty or `None` if there are no restrictions on data selection. 

1594 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1595 The query constraint variant that should be used to constrain the 

1596 query based on dataset existence; defaults to 

1597 `DatasetQueryConstraintVariant.ALL`. 

1598 metadata : `~collections.abc.Mapping` of `str` to primitives, optional 

1599 This is extra data to carry with the graph. Entries in this 

1600 mapping should be serializable to 

1601 JSON. 

1602 bind : `~collections.abc.Mapping`, optional 

1603 Mapping containing literal values that should be injected into the 

1604 ``userQuery`` expression, keyed by the identifiers they replace. 

1605 dataId : `lsst.daf.butler.DataCoordinate`, optional 

1606 Data ID that should also be included in the query constraint. 

1607 

1608 Returns 

1609 ------- 

1610 graph : `QuantumGraph` 

The constructed quantum graph with resolved dataset references. 

1611 

1612 Raises 

1613 ------ 

1614 UserExpressionError 

1615 Raised when user expression cannot be parsed. 

1616 OutputExistsError 

1617 Raised when output datasets already exist. 

1618 Exception 

1619 Other exceptions types may be raised by underlying registry 

1620 classes. 
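
Examples
--------
A hedged usage sketch: the butler, collection names, and query string are
illustrative placeholders, and loading the pipeline with
`Pipeline.from_uri` is an assumption about typical usage::

    pipeline = Pipeline.from_uri("my_pipeline.yaml")
    builder = GraphBuilder(butler.registry, datastore=butler.datastore)
    qgraph = builder.makeGraph(
        pipeline,
        collections=["some/input/collection"],
        run="u/someone/output-run",
        userQuery="instrument = 'HSC' AND visit = 12345",
    )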

1621 """ 

1622 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1623 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1624 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1625 if dataId is None: 

1626 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1627 if isinstance(pipeline, Pipeline): 

1628 dataId = pipeline.get_data_id(self.registry.dimensions).union(dataId) 

1629 with scaffolding.connectDataIds( 

1630 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1631 ) as commonDataIds: 

1632 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1633 scaffolding.resolveDatasetRefs( 

1634 self.registry, 

1635 collections, 

1636 run, 

1637 commonDataIds, 

1638 skipExistingIn=self.skipExistingIn, 

1639 clobberOutputs=self.clobberOutputs, 

1640 constrainedByAllDatasets=condition, 

1641 ) 

1642 return scaffolding.makeQuantumGraph( 

1643 registry=self.registry, metadata=metadata, datastore=self.datastore 

1644 )