Coverage for python/lsst/pipe/base/graphBuilder.py: 15%

548 statements  

coverage.py v7.2.7, created at 2023-07-12 11:14 -0700

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Module defining GraphBuilder class and related methods. 

23""" 

24 

25from __future__ import annotations 

26 

27__all__ = ["GraphBuilder"] 

28 

29# ------------------------------- 

30# Imports of standard modules -- 

31# ------------------------------- 

32import itertools 

33import logging 

34from collections import ChainMap, defaultdict 

35from collections.abc import Collection, Iterable, Iterator, Mapping 

36from contextlib import contextmanager 

37from dataclasses import dataclass 

38from typing import Any 

39 

40from lsst.daf.butler import ( 

41 CollectionType, 

42 DataCoordinate, 

43 DatasetRef, 

44 DatasetType, 

45 Datastore, 

46 DatastoreRecordData, 

47 DimensionGraph, 

48 DimensionUniverse, 

49 NamedKeyDict, 

50 NamedValueSet, 

51 Quantum, 

52 Registry, 

53) 

54from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

55from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

56from lsst.daf.butler.registry.wildcards import CollectionWildcard 

57 

58# ----------------------------- 

59# Imports for other modules -- 

60# ----------------------------- 

61from . import automatic_connection_constants as acc 

62from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

63from ._status import NoWorkFound 

64from .connections import AdjustQuantumHelper, iterConnections 

65from .graph import QuantumGraph 

66from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

67 

68# ---------------------------------- 

69# Local non-exported definitions -- 

70# ---------------------------------- 

71 

72_LOG = logging.getLogger(__name__) 

73 

74 

75@dataclass 

76class _RefHolder: 

77 r"""Placeholder for `~lsst.daf.butler.DatasetRef` representing a future 

78 resolved reference. 

79 

80 As we eliminated unresolved `~lsst.daf.butler.DatasetRef`\s we now use 

81 `None` to represent a reference that is yet to be resolved. Information 

82 about its corresponding dataset type and coordinate is stored in 

83 `_DatasetDict` mapping. 

84 """ 

85 

86 dataset_type: DatasetType 

87 """Dataset type of the dataset to be created later. I need to store it here 

88 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared 

89 between different compatible dataset types.""" 

90 

91 ref: DatasetRef | None = None 

92 """Dataset reference, initially `None`, created when all datasets are 

93 resolved. 

94 """ 

95 

96 @property 

97 def resolved_ref(self) -> DatasetRef: 

98 """Access resolved reference, should only be called after the 

99 reference is set (`~lsst.daf.butler.DatasetRef`). 

100 """ 

101 assert self.ref is not None, "Dataset reference is not set." 

102 return self.ref 

103 

104 
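# Illustrative sketch (kept as a comment; not executed) of the intended
# _RefHolder life cycle. ``calexp_type`` and ``data_id`` are placeholder
# names for a repository-defined dataset type and an expanded data ID.
#
#     holder = _RefHolder(dataset_type=calexp_type)      # starts unresolved
#     assert holder.ref is None
#     holder.ref = DatasetRef(calexp_type, data_id, run="my_run", conform=False)
#     ref = holder.resolved_ref                           # safe only once set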

105class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]): 

106 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested 

107 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that 

108 type. 

109 

110 Parameters 

111 ---------- 

112 args 

113 Positional arguments are forwarded to the `dict` constructor. 

114 universe : `~lsst.daf.butler.DimensionUniverse` 

115 Universe of all possible dimensions. 

116 """ 

117 

118 def __init__(self, *args: Any, universe: DimensionUniverse): 

119 super().__init__(*args) 

120 self.universe = universe 

121 

122 @classmethod 

123 def fromDatasetTypes( 

124 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

125 ) -> _DatasetDict: 

126 """Construct a dictionary from a flat iterable of 

127 `~lsst.daf.butler.DatasetType` keys. 

128 

129 Parameters 

130 ---------- 

131 datasetTypes : `~collections.abc.Iterable` of \ 

132 `~lsst.daf.butler.DatasetType` 

133 DatasetTypes to use as keys for the dict. Values will be empty 

134 dictionaries. 

135 universe : `~lsst.daf.butler.DimensionUniverse` 

136 Universe of all possible dimensions. 

137 

138 Returns 

139 ------- 

140 dictionary : `_DatasetDict` 

141 A new `_DatasetDict` instance. 

142 """ 

143 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

144 

145 @classmethod 

146 def fromSubset( 

147 cls, 

148 datasetTypes: Collection[DatasetType], 

149 first: _DatasetDict, 

150 *rest: _DatasetDict, 

151 ) -> _DatasetDict: 

152 """Return a new dictionary by extracting items corresponding to the 

153 given keys from one or more existing dictionaries. 

154 

155 Parameters 

156 ---------- 

157 datasetTypes : `~collections.abc.Iterable` of \ 

158 `~lsst.daf.butler.DatasetType` 

159 DatasetTypes to use as keys for the dict. Values will be obtained 

160 by lookups against ``first`` and ``rest``. 

161 first : `_DatasetDict` 

162 Another dictionary from which to extract values. 

163 rest 

164 Additional dictionaries from which to extract values. 

165 

166 Returns 

167 ------- 

168 dictionary : `_DatasetDict` 

169 A new dictionary instance. 

170 """ 

171 combined = ChainMap(first, *rest) 

172 

173 # Dataset types known to match immediately can be processed 

174 # without checks. 

175 matches = combined.keys() & set(datasetTypes) 

176 _dict = {k: combined[k] for k in matches} 

177 

178 if len(_dict) < len(datasetTypes): 

179 # Work out which ones are missing. 

180 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

181 

182 # Get the known names for comparison. 

183 combined_by_name = {k.name: k for k in combined} 

184 

185 missing = set() 

186 incompatible = {} 

187 for datasetType in missing_datasetTypes: 

188 # The dataset type is not found. It may not be listed 

189 # or it may be that it is there with the same name 

190 # but different definition. 

191 if datasetType.name in combined_by_name: 

192 # This implies some inconsistency in definitions 

193 # for connections. If there is support for storage 

194 # class conversion we can let it slide. 

195 # At this point we do not know 

196 # where the inconsistency is but trust that down 

197 # stream code will be more explicit about input 

198 # vs output incompatibilities. 

199 existing = combined_by_name[datasetType.name] 

200 convertible_to_existing = existing.is_compatible_with(datasetType) 

201 convertible_from_existing = datasetType.is_compatible_with(existing) 

202 if convertible_to_existing and convertible_from_existing: 

203 _LOG.debug( 

204 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

205 datasetType.name, 

206 datasetType.storageClass_name, 

207 existing.storageClass_name, 

208 ) 

209 _dict[datasetType] = combined[existing] 

210 elif convertible_to_existing or convertible_from_existing: 

211 # We'd need to refactor a fair amount to recognize 

212 # whether this is an error or not, so I'm not going to 

213 # bother until we need to do that for other reasons 

214 # (it won't be too long). 

215 _LOG.info( 

216 "Dataset type %s is present with multiple only partially-compatible storage " 

217 "classes %s and %s.", 

218 datasetType.name, 

219 datasetType.storageClass_name, 

220 existing.storageClass_name, 

221 ) 

222 _dict[datasetType] = combined[existing] 

223 else: 

224 incompatible[datasetType] = existing 

225 else: 

226 missing.add(datasetType) 

227 

228 if missing or incompatible: 

229 reasons = [] 

230 if missing: 

231 reasons.append( 

232 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known " 

233 f"types: [{', '.join(d.name for d in combined)}]." 

234 ) 

235 if incompatible: 

236 for x, y in incompatible.items(): 

237 reasons.append(f"{x} incompatible with {y}") 

238 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

239 

240 return cls(_dict, universe=first.universe) 

241 
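# Hedged illustration (comment only, not executed) of ``fromSubset``:
# keys are matched by DatasetType equality first, then by name with
# storage-class compatibility checks, and anything still unmatched raises
# `KeyError`. The names below are placeholders; note that the nested dicts
# in the result are the *same* objects held by the parent dictionaries, so
# later resolution is shared.
#
#     subset = _DatasetDict.fromSubset(
#         [calexp_type], parent_inputs, parent_intermediates
#     )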

242 @property 

243 def dimensions(self) -> DimensionGraph: 

244 """The union of all dimensions used by all dataset types in this 

245 dictionary, including implied dependencies (`DimensionGraph`). 

246 """ 

247 base = self.universe.empty 

248 if len(self) == 0: 

249 return base 

250 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

251 

252 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]: 

253 """Unpack nested single-element `~lsst.daf.butler.DatasetRef` dicts 

254 into a new mapping with `~lsst.daf.butler.DatasetType` keys and 

255 `~lsst.daf.butler.DatasetRef` values. 

256 

257 This method assumes that each nested dictionary contains exactly one

258 item, as is the case for all "init" datasets.

259 

260 Parameters 

261 ---------- 

262 storage_classes : `dict` [ `str`, `str` ] 

263 Mapping from dataset type name to the storage class to use for that 

264 dataset type. These are typically the storage classes declared 

265 for a particular task, which may differ from the data repository

266 definitions. 

267 

268 Returns 

269 ------- 

270 dictionary : `~lsst.daf.butler.NamedKeyDict` 

271 Dictionary mapping `~lsst.daf.butler.DatasetType` to 

272 `~lsst.daf.butler.DatasetRef`, with both 

273 `~lsst.daf.butler.DatasetType` instances and string names usable 

274 as keys. 

275 """ 

276 return NamedKeyDict( 

277 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()} 

278 ) 

279 

280 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

281 """Unpack nested multi-element `~lsst.daf.butler.DatasetRef` dicts into 

282 a new mapping with `~lsst.daf.butler.DatasetType` keys and `list` of 

283 `~lsst.daf.butler.DatasetRef` values. 

284 

285 Parameters 

286 ---------- 

287 storage_classes : `dict` [ `str`, `str` ] 

288 Mapping from dataset type name to the storage class to use for that 

289 dataset type. These are typically the storage classes declared 

290 for a particular task, which may differ from the data repository

291 definitions. 

292 

293 Returns 

294 ------- 

295 dictionary : `~lsst.daf.butler.NamedKeyDict` 

296 Dictionary mapping `~lsst.daf.butler.DatasetType` to `list` of 

297 `~lsst.daf.butler.DatasetRef`, with both 

298 `~lsst.daf.butler.DatasetType` instances and string names usable 

299 as keys. 

300 """ 

301 result = {} 

302 for dataset_type, holders in self.items(): 

303 if ( 

304 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name) 

305 ) != dataset_type.storageClass_name: 

306 dataset_type = dataset_type.overrideStorageClass(override) 

307 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()] 

308 else: 

309 refs = [holder.resolved_ref for holder in holders.values()] 

310 result[dataset_type] = refs 

311 return NamedKeyDict(result) 

312 
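# Hedged example (comment only, not executed) of the storage-class override
# behavior implemented by ``unpackMultiRefs`` above. It assumes every holder
# already carries a resolved ref; ``"deepCoadd"`` and ``"ExposureF"`` are
# placeholder names, not claims about any particular pipeline.
#
#     refs_by_type = dataset_dict.unpackMultiRefs({"deepCoadd": "ExposureF"})
#     # Any dataset type whose declared storage class differs from the
#     # repository definition is returned with ``overrideStorageClass``
#     # applied to both the key DatasetType and each DatasetRef value.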

313 def extract( 

314 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

315 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]: 

316 """Iterate over the contained `~lsst.daf.butler.DatasetRef` instances 

317 that match the given `~lsst.daf.butler.DatasetType` and data IDs. 

318 

319 Parameters 

320 ---------- 

321 datasetType : `~lsst.daf.butler.DatasetType` 

322 Dataset type to match. 

323 dataIds : `~collections.abc.Iterable` \ 

324 [ `~lsst.daf.butler.DataCoordinate` ] 

325 Data IDs to match. 

326 

327 Returns 

328 ------- 

329 refs : `~collections.abc.Iterator` [ `~lsst.daf.butler.DatasetRef` ] 

330 DatasetRef instances for which ``ref.datasetType == datasetType`` 

331 and ``ref.dataId`` is in ``dataIds``. 

332 """ 

333 refs = self[datasetType] 

334 return ((dataId, refs[dataId].ref) for dataId in dataIds) 

335 

336 def isdisjoint(self, other: _DatasetDict) -> bool: 

337 """Test whether ``self`` and ``other`` have any datasets in common. 

338 

339 Datasets are considered in common if they have the same *parent* 

340 dataset type name and data ID; storage classes and components are not 

341 considered. 

342 """ 

343 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()} 

344 for k, v in other.items(): 

345 parent_name, _ = k.nameAndComponent() 

346 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()): 

347 return False 

348 return True 

349 
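# Quick illustrative note (comment only): two _DatasetDict instances are
# "disjoint" only if no (parent dataset type name, data ID) pair appears in
# both; component vs. composite and storage-class differences are ignored.
#
#     should_prune = not scaffolding.missing.isdisjoint(quantum_inputs)  # placeholders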

350 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

351 """Iterate over all DatasetRef instances held by this data structure, 

352 assuming that each `_RefHolder` already carries a resolved ref.

353 """ 

354 for holders_by_data_id in self.values(): 

355 for holder in holders_by_data_id.values(): 

356 yield holder.resolved_ref 

357 

358 

359class _QuantumScaffolding: 

360 """Helper class aggregating information about a `Quantum`, used when 

361 constructing a `QuantumGraph`. 

362 

363 See `_PipelineScaffolding` for a top-down description of the full 

364 scaffolding data structure. 

365 

366 Parameters 

367 ---------- 

368 task : _TaskScaffolding 

369 Back-reference to the helper object for the `PipelineTask` this quantum 

370 represents an execution of. 

371 dataId : `~lsst.daf.butler.DataCoordinate` 

372 Data ID for this quantum. 

373 """ 

374 

375 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

376 self.task = task 

377 self.dataId = dataId 

378 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

379 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

380 self.prerequisites = _DatasetDict.fromDatasetTypes( 

381 task.prerequisites.keys(), universe=dataId.universe 

382 ) 

383 

384 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

385 

386 def __repr__(self) -> str: 

387 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

388 

389 task: _TaskScaffolding 

390 """Back-reference to the helper object for the `PipelineTask` this quantum 

391 represents an execution of. 

392 """ 

393 

394 dataId: DataCoordinate 

395 """Data ID for this quantum. 

396 """ 

397 

398 inputs: _DatasetDict 

399 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` inputs to 

400 this quantum. 

401 

402 This is initialized to map each `~lsst.daf.butler.DatasetType` to an empty 

403 dictionary at construction. Those nested dictionaries are populated 

404 (with data IDs as keys) with unresolved `~lsst.daf.butler.DatasetRef` 

405 instances in `_PipelineScaffolding.connectDataIds`. 

406 """ 

407 

408 outputs: _DatasetDict 

409 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` outputs this 

410 quantum. 

411 """ 

412 

413 prerequisites: _DatasetDict 

414 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` prerequisite 

415 inputs to this quantum. 

416 """ 

417 

418 def makeQuantum(self, datastore_records: Mapping[str, DatastoreRecordData] | None = None) -> Quantum: 

419 """Transform the scaffolding object into a true `Quantum` instance. 

420 

421 Parameters 

422 ---------- 

423 datastore_records : `~collections.abc.Mapping` [ `str`, \ 

424 `~lsst.daf.butler.DatastoreRecordData` ], optional 

425 If not `None` then fill datastore records in each generated Quantum 

426 using the records from this structure. 

427 

428 Returns 

429 ------- 

430 quantum : `Quantum` 

431 An actual `Quantum` instance. 

432 """ 

433 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes) 

434 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes)) 

435 # Give the task's Connections class an opportunity to remove some 

436 # inputs, or complain if they are unacceptable. 

437 # This will raise if one of the check conditions is not met, which is 

438 # the intended behavior. 

439 # If it raises NoWorkFound, there is a bug in the QG algorithm

440 # or the adjustQuantum is incorrectly trying to make a prerequisite 

441 # input behave like a regular input; adjustQuantum should only raise 

442 # NoWorkFound if a regular input is missing, and it shouldn't be 

443 # possible for us to have generated ``self`` if that's true. 

444 helper = AdjustQuantumHelper( 

445 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes) 

446 ) 

447 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

448 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes) 

449 quantum_records: Mapping[str, DatastoreRecordData] | None = None 

450 if datastore_records is not None: 

451 quantum_records = {} 

452 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

453 input_refs += list(initInputs.values()) 

454 input_ids = set(ref.id for ref in input_refs) 

455 for datastore_name, records in datastore_records.items(): 

456 matching_records = records.subset(input_ids) 

457 if matching_records is not None: 

458 quantum_records[datastore_name] = matching_records 

459 # ignore the types because quantum really can take a sequence of inputs 

460 return Quantum( 

461 taskName=self.task.taskDef.taskName, 

462 taskClass=self.task.taskDef.taskClass, 

463 dataId=self.dataId, 

464 initInputs=initInputs, 

465 inputs=helper.inputs, 

466 outputs=helper.outputs, 

467 datastore_records=quantum_records, 

468 ) 

469 

470 

471@dataclass 

472class _TaskScaffolding: 

473 """Helper class aggregating information about a `PipelineTask`, used when 

474 constructing a `QuantumGraph`. 

475 

476 See `_PipelineScaffolding` for a top-down description of the full 

477 scaffolding data structure. 

478 

479 Parameters 

480 ---------- 

481 taskDef : `TaskDef` 

482 Data structure that identifies the task class and its config. 

483 parent : `_PipelineScaffolding` 

484 The parent data structure that will hold the instance being 

485 constructed. 

486 datasetTypes : `TaskDatasetTypes` 

487 Data structure that categorizes the dataset types used by this task. 

488 """ 

489 

490 def __init__( 

491 self, 

492 taskDef: TaskDef, 

493 parent: _PipelineScaffolding, 

494 datasetTypes: TaskDatasetTypes, 

495 ): 

496 universe = parent.dimensions.universe 

497 self.taskDef = taskDef 

498 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

499 assert self.dimensions.issubset(parent.dimensions) 

500 # Initialize _DatasetDicts as subsets of the one or two 

501 # corresponding dicts in the parent _PipelineScaffolding. 

502 self.initInputs = _DatasetDict.fromSubset( 

503 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

504 ) 

505 self.initOutputs = _DatasetDict.fromSubset( 

506 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

507 ) 

508 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

509 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

510 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

511 self.dataIds: set[DataCoordinate] = set() 

512 self.quanta = {} 

513 self.storage_classes = { 

514 connection.name: connection.storageClass 

515 for connection in self.taskDef.connections.allConnections.values() 

516 } 

517 self.storage_classes[ 

518 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

519 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS 

520 self.storage_classes[ 

521 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

522 ] = acc.LOG_OUTPUT_STORAGE_CLASS 

523 self.storage_classes[ 

524 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

525 ] = acc.METADATA_OUTPUT_STORAGE_CLASS 

526 

527 def __repr__(self) -> str: 

528 # Default dataclass-injected __repr__ gets caught in an infinite loop 

529 # because of back-references. 

530 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

531 

532 taskDef: TaskDef 

533 """Data structure that identifies the task class and its config 

534 (`TaskDef`). 

535 """ 

536 

537 dimensions: DimensionGraph 

538 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

539 """ 

540 

541 initInputs: _DatasetDict 

542 """Dictionary containing information about datasets used to construct this 

543 task (`_DatasetDict`). 

544 """ 

545 

546 initOutputs: _DatasetDict 

547 """Dictionary containing information about datasets produced as a 

548 side-effect of constructing this task (`_DatasetDict`). 

549 """ 

550 

551 inputs: _DatasetDict 

552 """Dictionary containing information about datasets used as regular, 

553 graph-constraining inputs to this task (`_DatasetDict`). 

554 """ 

555 

556 outputs: _DatasetDict 

557 """Dictionary containing information about datasets produced by this task 

558 (`_DatasetDict`). 

559 """ 

560 

561 prerequisites: _DatasetDict 

562 """Dictionary containing information about input datasets that must be 

563 present in the repository before any Pipeline containing this task is run 

564 (`_DatasetDict`). 

565 """ 

566 

567 quanta: dict[DataCoordinate, _QuantumScaffolding] 

568 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

569 this task with that data ID. 

570 """ 

571 

572 storage_classes: dict[str, str] 

573 """Mapping from dataset type name to storage class declared by this task. 

574 """ 

575 

576 def makeQuantumSet( 

577 self, 

578 missing: _DatasetDict, 

579 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

580 ) -> set[Quantum]: 

581 """Create a `set` of `Quantum` from the information in ``self``. 

582 

583 Parameters 

584 ---------- 

585 missing : `_DatasetDict` 

586 Input datasets that have not been found. 

587 datastore_records : `dict` 

588 Record from the datastore to export with quanta. 

589 

590 Returns 

591 ------- 

592 nodes : `set` of `Quantum` 

593 The `Quantum` elements corresponding to this task. 

594 """ 

595 outputs = set() 

596 for q in self.quanta.values(): 

597 try: 

598 tmpQuanta = q.makeQuantum(datastore_records) 

599 outputs.add(tmpQuanta) 

600 except (NoWorkFound, FileNotFoundError) as exc: 

601 if not missing.isdisjoint(q.inputs): 

602 # This is a node that is known to be pruned later and 

603 # should be left in even though some follow up queries 

604 # fail. This allows the pruning to start from this quantum 

605 # with known issues, and prune other nodes it touches. 

606 inputs = q.inputs.unpackMultiRefs(self.storage_classes) 

607 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes)) 

608 tmpQuantum = Quantum( 

609 taskName=q.task.taskDef.taskName, 

610 taskClass=q.task.taskDef.taskClass, 

611 dataId=q.dataId, 

612 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes), 

613 inputs=inputs, 

614 outputs=q.outputs.unpackMultiRefs(self.storage_classes), 

615 ) 

616 outputs.add(tmpQuantum) 

617 else: 

618 raise exc 

619 return outputs 

620 

621 

622class _DatasetIdMaker: 

623 """Helper class which generates random dataset UUIDs for unresolved 

624 datasets. 

625 """ 

626 

627 def __init__(self, run: str): 

628 self.run = run 

629 # Cache of dataset refs generated so far. 

630 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

631 

632 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef: 

633 # For components we need their parent dataset ID. 

634 if dataset_type.isComponent(): 

635 parent_type = dataset_type.makeCompositeDatasetType() 

636 # Parent should be resolved if this is an existing input, or it 

637 # should be in the cache already if it is an intermediate. 

638 key = parent_type, data_id 

639 if key not in self.resolved: 

640 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}") 

641 parent_ref = self.resolved[key] 

642 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False) 

643 

644 key = dataset_type, data_id 

645 if (resolved := self.resolved.get(key)) is None: 

646 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False) 

647 self.resolved[key] = resolved 

648 return resolved 

649 

650 def resolveDict( 

651 self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder], is_output: bool 

652 ) -> None: 

653 """Resolve all unresolved references in the provided dictionary.""" 

654 for data_id, holder in refs.items(): 

655 if holder.ref is None or (is_output and holder.ref.run != self.run): 

656 holder.ref = self.resolveRef(holder.dataset_type, data_id) 

657 

658 
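# Rough usage sketch (comment only, not executed) for _DatasetIdMaker: once
# the data ID query has populated the _DatasetDict structures, each holder
# that is still unresolved (or that points at a different run, for outputs)
# receives a resolved DatasetRef in the output run. ``scaffolding`` is a
# placeholder for a populated _PipelineScaffolding instance.
#
#     id_maker = _DatasetIdMaker(run="u/someone/output_run")
#     for dataset_type, holders in scaffolding.outputs.items():
#         id_maker.resolveDict(dataset_type, holders, is_output=True)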

659@dataclass 

660class _PipelineScaffolding: 

661 """A helper data structure that organizes the information involved in 

662 constructing a `QuantumGraph` for a `Pipeline`. 

663 

664 Parameters 

665 ---------- 

666 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

667 Sequence of tasks from which a graph is to be constructed. Must 

668 have nested task classes already imported. 

669 universe : `~lsst.daf.butler.DimensionUniverse` 

670 Universe of all possible dimensions. 

671 

672 Notes 

673 ----- 

674 The scaffolding data structure contains nested data structures for both 

675 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

676 data structures are shared between the pipeline-level structure (which 

677 aggregates all datasets and categorizes them from the perspective of the 

678 complete pipeline) and the individual tasks that use them as inputs and 

679 outputs. 

680 

681 `QuantumGraph` construction proceeds in four steps, with each corresponding 

682 to a different `_PipelineScaffolding` method: 

683 

684 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

685 the DatasetTypes used by the pipeline (delegating to 

686 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

687 nested `_TaskScaffolding` and `_DatasetDict` objects. 

688 

689 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

690 returns related tuples of all dimensions used to identify any regular 

691 input, output, and intermediate datasets (not prerequisites). We then 

692 iterate over these tuples of related dimensions, identifying the subsets 

693 that correspond to distinct data IDs for each task and dataset type, 

694 and then create `_QuantumScaffolding` objects. 

695 

696 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

697 dataset data IDs previously identified, transforming unresolved 

698 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

699 up prerequisite datasets for all quanta. 

700 

701 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

702 per-task `_QuantumScaffolding` objects. 
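
    A rough sketch of how a caller is expected to drive these steps (the
    registry, collection, and query arguments are placeholders)::

        scaffolding = _PipelineScaffolding(pipeline, registry=registry)
        with scaffolding.connectDataIds(
            registry, collections, userQuery, externalDataId
        ) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
        qgraph = scaffolding.makeQuantumGraph(registry)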

703 """ 

704 

705 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry): 

706 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

707 self.tasks = [] 

708 # Aggregate and categorize the DatasetTypes in the Pipeline. 

709 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

710 # Construct dictionaries that map those DatasetTypes to structures 

711 # that will (later) hold additional information about them. 

712 for attr in ( 

713 "initInputs", 

714 "initIntermediates", 

715 "initOutputs", 

716 "inputs", 

717 "intermediates", 

718 "outputs", 

719 "prerequisites", 

720 ): 

721 setattr( 

722 self, 

723 attr, 

724 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

725 ) 

726 self.missing = _DatasetDict(universe=registry.dimensions) 

727 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

728 # Aggregate all dimensions for all non-init, non-prerequisite 

729 # DatasetTypes. These are the ones we'll include in the big join 

730 # query. 

731 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

732 # Construct scaffolding nodes for each Task, and add backreferences 

733 # to the Task from each DatasetScaffolding node. 

734 # Note that there's only one scaffolding node for each DatasetType, 

735 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

736 # reference it. 

737 if isinstance(pipeline, Pipeline): 

738 pipeline = pipeline.toExpandedPipeline() 

739 self.tasks = [ 

740 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

741 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

742 ] 

743 

744 def __repr__(self) -> str: 

745 # Default dataclass-injected __repr__ gets caught in an infinite loop 

746 # because of back-references. 

747 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

748 

749 tasks: list[_TaskScaffolding] 

750 """Scaffolding data structures for each task in the pipeline 

751 (`list` of `_TaskScaffolding`). 

752 """ 

753 

754 initInputs: _DatasetDict 

755 """Datasets consumed but not produced when constructing the tasks in this 

756 pipeline (`_DatasetDict`). 

757 """ 

758 

759 initIntermediates: _DatasetDict 

760 """Datasets that are both consumed and produced when constructing the tasks 

761 in this pipeline (`_DatasetDict`). 

762 """ 

763 

764 initOutputs: _DatasetDict 

765 """Datasets produced but not consumed when constructing the tasks in this 

766 pipeline (`_DatasetDict`). 

767 """ 

768 

769 inputs: _DatasetDict 

770 """Datasets that are consumed but not produced when running this pipeline 

771 (`_DatasetDict`). 

772 """ 

773 

774 intermediates: _DatasetDict 

775 """Datasets that are both produced and consumed when running this pipeline 

776 (`_DatasetDict`). 

777 """ 

778 

779 outputs: _DatasetDict 

780 """Datasets produced but not consumed when when running this pipeline 

781 (`_DatasetDict`). 

782 """ 

783 

784 prerequisites: _DatasetDict 

785 """Datasets that are consumed when running this pipeline and looked up 

786 per-Quantum when generating the graph (`_DatasetDict`). 

787 """ 

788 

789 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

790 """Datasets that should be used as constraints in the initial query, 

791 according to tasks (`~lsst.daf.butler.NamedValueSet`). 

792 """ 

793 

794 dimensions: DimensionGraph 

795 """All dimensions used by any regular input, intermediate, or output 

796 (not prerequisite) dataset; the set of dimensions used in the "Big Join

797 Query" (`~lsst.daf.butler.DimensionGraph`). 

798 

799 This is required to be a superset of all task quantum dimensions. 

800 """ 

801 

802 missing: _DatasetDict 

803 """Datasets whose existence was originally predicted but were not 

804 actually found. 

805 

806 Quanta that require these datasets as inputs will be pruned (recursively) 

807 when actually constructing a `QuantumGraph` object. 

808 

809 These are currently populated only when the "initial dataset query 

810 constraint" does not include all overall-input dataset types, and hence the 

811 initial data ID query can include data IDs that it should not. 

812 """ 

813 

814 globalInitOutputs: _DatasetDict | None = None 

815 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

816 """ 

817 

818 @contextmanager 

819 def connectDataIds( 

820 self, 

821 registry: Registry, 

822 collections: Any, 

823 userQuery: str | None, 

824 externalDataId: DataCoordinate, 

825 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

826 bind: Mapping[str, Any] | None = None, 

827 ) -> Iterator[DataCoordinateQueryResults]: 

828 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

829 

830 This method populates `_TaskScaffolding.dataIds` and 

831 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

832 

833 Parameters 

834 ---------- 

835 registry : `lsst.daf.butler.Registry` 

836 Registry for the data repository; used for all data ID queries. 

837 collections 

838 Expressions representing the collections to search for input 

839 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

840 userQuery : `str` or `None` 

841 User-provided expression to limit the data IDs processed. 

842 externalDataId : `~lsst.daf.butler.DataCoordinate` 

843 Externally-provided data ID that should be used to restrict the 

844 results, just as if these constraints had been included via ``AND`` 

845 in ``userQuery``. This includes (at least) any instrument named 

846 in the pipeline definition. 

847 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

848 The query constraint variant that should be used to constrain the

849 query based on dataset existence; defaults to

850 `DatasetQueryConstraintVariant.ALL`. 

851 bind : `~collections.abc.Mapping`, optional 

852 Mapping containing literal values that should be injected into the 

853 ``userQuery`` expression, keyed by the identifiers they replace. 

854 

855 Returns 

856 ------- 

857 commonDataIds : \ 

858 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

859 An interface to a database temporary table containing all data IDs 

860 that will appear in this `QuantumGraph`. Returned inside a 

861 context manager, which will drop the temporary table at the end of 

862 the `with` block in which this method is called. 
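
    For example (a sketch; ``scaffolding`` is this object and the other
    arguments are placeholders), the result must be consumed inside the
    ``with`` block because the backing temporary table is dropped on exit::

        with scaffolding.connectDataIds(registry, collections, userQuery, dataId) as commonDataIds:
            scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)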

863 """ 

864 _LOG.debug("Building query for data IDs.") 

865 # Initialization datasets always have empty data IDs. 

866 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

867 for datasetType, refs in itertools.chain( 

868 self.initInputs.items(), 

869 self.initIntermediates.items(), 

870 self.initOutputs.items(), 

871 ): 

872 refs[emptyDataId] = _RefHolder(datasetType) 

873 # Run one big query for the data IDs for task dimensions and regular 

874 # inputs and outputs. We limit the query to only dimensions that are 

875 # associated with the input dataset types, but don't (yet) try to 

876 # obtain the dataset_ids for those inputs. 

877 _LOG.debug( 

878 "Submitting data ID query over dimensions %s and materializing results.", 

879 list(self.dimensions.names), 

880 ) 

881 queryArgs: dict[str, Any] = { 

882 "dimensions": self.dimensions, 

883 "where": userQuery, 

884 "dataId": externalDataId, 

885 "bind": bind, 

886 } 

887 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

888 _LOG.debug( 

889 "Constraining graph query using default of %s.", 

890 list(self.defaultDatasetQueryConstraints.names), 

891 ) 

892 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

893 queryArgs["collections"] = collections 

894 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

895 _LOG.debug("Not using dataset existence to constrain query.") 

896 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

897 constraint = set(datasetQueryConstraint) 

898 inputs = {k.name: k for k in self.inputs.keys()} 

899 if remainder := constraint.difference(inputs.keys()): 

900 raise ValueError( 

901 f"{remainder} dataset type(s) specified as a graph constraint, but" 

902 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

903 ) 

904 _LOG.debug(f"Constraining graph query using {constraint}") 

905 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

906 queryArgs["collections"] = collections 

907 else: 

908 raise ValueError( 

909 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

910 ) 

911 

912 if "datasets" in queryArgs: 

913 for i, dataset_type in enumerate(queryArgs["datasets"]): 

914 if dataset_type.isComponent(): 

915 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

916 

917 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

918 _LOG.debug("Expanding data IDs.") 

919 commonDataIds = commonDataIds.expanded() 

920 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

921 # Iterate over query results, populating data IDs for datasets and 

922 # quanta and then connecting them to each other. 

923 n = -1 

924 for n, commonDataId in enumerate(commonDataIds): 

925 # Create DatasetRefs for all DatasetTypes from this result row, 

926 # noting that we might have created some already. 

927 # We remember both those that already existed and those that we 

928 # create now. 

929 refsForRow = {} 

930 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {} 

931 for datasetType, refs in itertools.chain( 

932 self.inputs.items(), 

933 self.intermediates.items(), 

934 self.outputs.items(), 

935 ): 

936 datasetDataId: DataCoordinate | None 

937 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

938 datasetDataId = commonDataId.subset(datasetType.dimensions) 

939 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

940 ref_holder = refs.get(datasetDataId) 

941 if ref_holder is None: 

942 ref_holder = _RefHolder(datasetType) 

943 refs[datasetDataId] = ref_holder 

944 refsForRow[datasetType.name] = ref_holder 

945 # Create _QuantumScaffolding objects for all tasks from this 

946 # result row, noting that we might have created some already. 

947 for task in self.tasks: 

948 quantumDataId = commonDataId.subset(task.dimensions) 

949 quantum = task.quanta.get(quantumDataId) 

950 if quantum is None: 

951 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

952 task.quanta[quantumDataId] = quantum 

953 # Whether this is a new quantum or an existing one, we can 

954 # now associate the DatasetRefs for this row with it. The 

955 # fact that a Quantum data ID and a dataset data ID both 

956 # came from the same result row is what tells us they 

957 # should be associated. 

958 # Many of these associations will be duplicates (because

959 # another query row that differed from this one only in 

960 # irrelevant dimensions already added them), and we use 

961 # sets to skip. 

962 for datasetType in task.inputs: 

963 dataId = dataIdCacheForRow[datasetType.dimensions] 

964 ref_holder = refsForRow[datasetType.name] 

965 quantum.inputs[datasetType.name][dataId] = ref_holder 

966 for datasetType in task.outputs: 

967 dataId = dataIdCacheForRow[datasetType.dimensions] 

968 ref_holder = refsForRow[datasetType.name] 

969 quantum.outputs[datasetType.name][dataId] = ref_holder 

970 if n < 0: 

971 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

972 emptiness_explained = False 

973 for message in commonDataIds.explain_no_results(): 

974 _LOG.critical(message) 

975 emptiness_explained = True 

976 if not emptiness_explained: 

977 _LOG.critical( 

978 "To reproduce this query for debugging purposes, run " 

979 "Registry.queryDataIds with these arguments:" 

980 ) 

981 # We could just repr() the queryArgs dict to get something 

982 # the user could make sense of, but it's friendlier to 

983 # put these args in an easier-to-construct equivalent form 

984 # so they can read it more easily and copy and paste into 

985 # a Python terminal. 

986 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

987 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

988 if queryArgs["where"]: 

989 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

990 if "datasets" in queryArgs: 

991 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

992 if "collections" in queryArgs: 

993 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

994 _LOG.debug("Finished processing %d rows from data ID query.", n) 

995 yield commonDataIds 

996 

997 def resolveDatasetRefs( 

998 self, 

999 registry: Registry, 

1000 collections: Any, 

1001 run: str, 

1002 commonDataIds: DataCoordinateQueryResults, 

1003 *, 

1004 skipExistingIn: Any = None, 

1005 clobberOutputs: bool = True, 

1006 constrainedByAllDatasets: bool = True, 

1007 ) -> None: 

1008 """Perform follow up queries for each dataset data ID produced in 

1009 `fillDataIds`. 

1010 

1011 This method populates `_DatasetScaffolding.refs` (except for those in 

1012 `prerequisites`). 

1013 

1014 Parameters 

1015 ---------- 

1016 registry : `lsst.daf.butler.Registry` 

1017 Registry for the data repository; used for all data ID queries. 

1018 collections 

1019 Expressions representing the collections to search for input 

1020 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1021 run : `str` 

1022 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1023 output datasets, if it already exists. 

1024 commonDataIds : \ 

1025 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

1026 Result of a previous call to `connectDataIds`. 

1027 skipExistingIn 

1028 Expressions representing the collections to search for existing 

1029 output datasets that should be skipped. See 

1030 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

1031 `None` or empty string/sequence disables skipping. 

1032 clobberOutputs : `bool`, optional 

1033 If `True` (default), allow quanta to be created even if outputs exist;

1034 this requires the same behavior to be enabled when

1035 executing. If ``skipExistingIn`` is not `None`, completed quanta 

1036 (those with metadata, or all outputs if there is no metadata 

1037 dataset configured) will be skipped rather than clobbered. 

1038 constrainedByAllDatasets : `bool`, optional 

1039 Indicates if the commonDataIds were generated with a constraint on 

1040 all dataset types. 

1041 

1042 Raises 

1043 ------ 

1044 OutputExistsError 

1045 Raised if an output dataset already exists in the output run 

1046 and ``skipExistingIn`` does not include the output run, or if only

1047 some outputs are present and ``clobberOutputs`` is `False`. 

1048 """ 

1049 # Run may be provided but it does not have to exist; in that case we

1050 # use it for resolving references but do not check it for existing refs.

1051 run_exists = False 

1052 if run: 

1053 try: 

1054 run_exists = bool(registry.queryCollections(run)) 

1055 except MissingCollectionError: 

1056 # This undocumented exception is raised if the collection does not exist.

1057 pass 

1058 

1059 skip_collections_wildcard: CollectionWildcard | None = None 

1060 skipExistingInRun = False 

1061 if skipExistingIn: 

1062 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

1063 if run_exists: 

1064 # As an optimization, check the explicit list of names first.

1065 skipExistingInRun = run in skip_collections_wildcard.strings 

1066 if not skipExistingInRun: 

1067 # need to flatten it and check again 

1068 skipExistingInRun = run in registry.queryCollections( 

1069 skipExistingIn, 

1070 collectionTypes=CollectionType.RUN, 

1071 ) 

1072 

1073 idMaker = _DatasetIdMaker(run) 

1074 

1075 resolvedRefQueryResults: Iterable[DatasetRef] 

1076 

1077 # Updating constrainedByAllDatasets here is not ideal, but we have a 

1078 # few different code paths that each transfer different pieces of 

1079 # information about what dataset query constraints were applied here, 

1080 # and none of them has the complete picture until we get here. We're 

1081 # long overdue for a QG generation rewrite that will make this go away 

1082 # entirely anyway. 

1083 constrainedByAllDatasets = ( 

1084 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

1085 ) 

1086 

1087 # Look up [init] intermediate and output datasets in the output 

1088 # collection, if there is an output collection. 

1089 if run_exists or skip_collections_wildcard is not None: 

1090 for datasetType, refs in itertools.chain( 

1091 self.initIntermediates.items(), 

1092 self.initOutputs.items(), 

1093 self.intermediates.items(), 

1094 self.outputs.items(), 

1095 ): 

1096 _LOG.debug( 

1097 "Resolving %d datasets for intermediate and/or output dataset %s.", 

1098 len(refs), 

1099 datasetType.name, 

1100 ) 

1101 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

1102 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

1103 # TODO: this assert incorrectly bans component inputs; 

1104 # investigate on DM-33027. 

1105 # assert not datasetType.isComponent(), \ 

1106 # "Output datasets cannot be components." 

1107 # 

1108 # Instead we have to handle them manually to avoid a 

1109 # deprecation warning, but it is at least confusing and 

1110 # possibly a bug for components to appear here at all. 

1111 if datasetType.isComponent(): 

1112 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1113 component = datasetType.component() 

1114 else: 

1115 parent_dataset_type = datasetType 

1116 component = None 

1117 

1118 # look at RUN collection first 

1119 if run_exists: 

1120 try: 

1121 resolvedRefQueryResults = subset.findDatasets( 

1122 parent_dataset_type, collections=run, findFirst=True 

1123 ) 

1124 except MissingDatasetTypeError: 

1125 resolvedRefQueryResults = [] 

1126 for resolvedRef in resolvedRefQueryResults: 

1127 # TODO: we could easily support per-DatasetType 

1128 # skipExisting and I could imagine that being useful - 

1129 # it's probably required in order to support writing 

1130 # initOutputs before QuantumGraph generation. 

1131 assert resolvedRef.dataId in refs 

1132 if not (skipExistingInRun or isInit or clobberOutputs): 

1133 raise OutputExistsError( 

1134 f"Output dataset {datasetType.name} already exists in " 

1135 f"output RUN collection '{run}' with data ID" 

1136 f" {resolvedRef.dataId}." 

1137 ) 

1138 # To resolve all outputs we have to remember existing 

1139 # ones to avoid generating new dataset IDs for them. 

1140 refs[resolvedRef.dataId].ref = ( 

1141 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1142 ) 

1143 

1144 # Also check skipExistingIn; the case where the RUN collection is

1145 # part of it is handled above.

1146 if skip_collections_wildcard is not None: 

1147 try: 

1148 resolvedRefQueryResults = subset.findDatasets( 

1149 parent_dataset_type, 

1150 collections=skip_collections_wildcard, 

1151 findFirst=True, 

1152 ) 

1153 except MissingDatasetTypeError: 

1154 resolvedRefQueryResults = [] 

1155 for resolvedRef in resolvedRefQueryResults: 

1156 if resolvedRef.dataId not in refs: 

1157 continue 

1158 refs[resolvedRef.dataId].ref = ( 

1159 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1160 ) 

1161 

1162 # Look up input and initInput datasets in the input collection(s). We 

1163 # accumulate datasets in self.missing, if the common data IDs were not 

1164 # constrained on dataset type existence. 

1165 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1166 _LOG.debug( 

1167 "Resolving %d datasets for input dataset %s.", 

1168 len(refs), 

1169 datasetType.name, 

1170 ) 

1171 if datasetType.isComponent(): 

1172 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1173 component = datasetType.component() 

1174 else: 

1175 parent_dataset_type = datasetType 

1176 component = None 

1177 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {} 

1178 try: 

1179 resolvedRefQueryResults = commonDataIds.subset( 

1180 datasetType.dimensions, unique=True 

1181 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1182 except MissingDatasetTypeError: 

1183 resolvedRefQueryResults = [] 

1184 dataIdsNotFoundYet = set(refs.keys()) 

1185 for resolvedRef in resolvedRefQueryResults: 

1186 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1187 if resolvedRef.dataId not in refs: 

1188 continue 

1189 refs[resolvedRef.dataId].ref = ( 

1190 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1191 ) 

1192 if dataIdsNotFoundYet: 

1193 if constrainedByAllDatasets: 

1194 raise RuntimeError( 

1195 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1196 f"'{datasetType.name}' was/were present in a previous " 

1197 "query, but could not be found now. " 

1198 "This is either a logic bug in QuantumGraph generation " 

1199 "or the input collections have been modified since " 

1200 "QuantumGraph generation began." 

1201 ) 

1202 elif not datasetType.dimensions: 

1203 raise RuntimeError( 

1204 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1205 f"collections {collections}." 

1206 ) 

1207 else: 

1208 # If the common dataIds were not constrained using all the 

1209 # input dataset types, it is possible that some data ids 

1210 # found don't correspond to existing datasets. Mark these 

1211 # for later pruning from the quantum graph. 

1212 for k in dataIdsNotFoundYet: 

1213 missing_for_dataset_type[k] = refs[k] 

1214 if missing_for_dataset_type: 

1215 self.missing[datasetType] = missing_for_dataset_type 

1216 

1217 # Resolve the missing refs, just so they look like all of the others; 

1218 # in the end other code will make sure they never appear in the QG. 

1219 for dataset_type, refDict in self.missing.items(): 

1220 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1221 

1222 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1223 # replacing the unresolved refs there, and then look up prerequisites. 

1224 for task in self.tasks: 

1225 _LOG.debug( 

1226 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1227 len(task.quanta), 

1228 task.taskDef.label, 

1229 ) 

1230 # The way iterConnections is designed makes it impossible to 

1231 # annotate precisely enough to satisfy MyPy here. 

1232 lookupFunctions = { 

1233 c.name: c.lookupFunction # type: ignore 

1234 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1235 if c.lookupFunction is not None # type: ignore 

1236 } 

1237 dataIdsFailed = [] 

1238 dataIdsSucceeded = [] 

1239 for quantum in task.quanta.values(): 

1240 # Process output datasets only if skipExistingIn is not None

1241 # or there is a run to look for outputs in and clobberOutputs 

1242 # is True. Note that if skipExistingIn is None, any output 

1243 # datasets that already exist would have already caused an 

1244 # exception to be raised. 

1245 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1246 resolvedRefs = [] 

1247 unresolvedDataIds = [] 

1248 haveMetadata = False 

1249 for datasetType, originalRefs in quantum.outputs.items(): 

1250 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1251 if ref is not None: 

1252 resolvedRefs.append(ref) 

1253 originalRefs[dataId].ref = ref 

1254 if datasetType.name == task.taskDef.metadataDatasetName: 

1255 haveMetadata = True 

1256 else: 

1257 unresolvedDataIds.append((datasetType, dataId)) 

1258 if resolvedRefs: 

1259 if haveMetadata or not unresolvedDataIds: 

1260 dataIdsSucceeded.append(quantum.dataId) 

1261 if skip_collections_wildcard is not None: 

1262 continue 

1263 else: 

1264 dataIdsFailed.append(quantum.dataId) 

1265 if not clobberOutputs and run_exists: 

1266 raise OutputExistsError( 

1267 f"Quantum {quantum.dataId} of task with label " 

1268 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1269 f"({resolvedRefs}) " 

1270 f"and others that don't ({unresolvedDataIds}), with no metadata output, " 

1271 "and clobbering outputs was not enabled." 

1272 ) 

1273 # Update the input DatasetRefs to the resolved ones we already 

1274 # searched for. 

1275 for datasetType, input_refs in quantum.inputs.items(): 

1276 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()): 

1277 input_refs[data_id].ref = ref 

1278 # Look up prerequisite datasets in the input collection(s). 

1279 # These may have dimensions that extend beyond those we queried 

1280 # for originally, because we want to permit those data ID 

1281 # values to differ across quanta and dataset types. 

1282 for datasetType in task.prerequisites: 

1283 if datasetType.isComponent(): 

1284 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1285 component = datasetType.component() 

1286 else: 

1287 parent_dataset_type = datasetType 

1288 component = None 

1289 lookupFunction = lookupFunctions.get(datasetType.name) 

1290 if lookupFunction is not None: 

1291 # PipelineTask has provided its own function to do the 

1292 # lookup. This always takes precedence. 

1293 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1294 elif ( 

1295 datasetType.isCalibration() 

1296 and datasetType.dimensions <= quantum.dataId.graph 

1297 and quantum.dataId.graph.temporal 

1298 ): 

1299 # This is a master calibration lookup, which we have to 

1300 # handle specially because the query system can't do a 

1301 # temporal join on a non-dimension-based timespan yet. 

1302 timespan = quantum.dataId.timespan 

1303 try: 

1304 prereq_ref = registry.findDataset( 

1305 parent_dataset_type, 

1306 quantum.dataId, 

1307 collections=collections, 

1308 timespan=timespan, 

1309 ) 

1310 if prereq_ref is not None: 

1311 if component is not None: 

1312 prereq_ref = prereq_ref.makeComponentRef(component) 

1313 prereq_refs = [prereq_ref] 

1314 else: 

1315 prereq_refs = [] 

1316 except (KeyError, MissingDatasetTypeError): 

1317 # This dataset type is not present in the registry, 

1318 # which just means there are no datasets here. 

1319 prereq_refs = [] 

1320 else: 

1321 # Most general case. 

1322 prereq_refs = [ 

1323 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1324 for prereq_ref in registry.queryDatasets( 

1325 parent_dataset_type, 

1326 collections=collections, 

1327 dataId=quantum.dataId, 

1328 findFirst=True, 

1329 ).expanded() 

1330 ] 

1331 

1332 for ref in prereq_refs: 

1333 if ref is not None: 

1334 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1335 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1336 

1337 # Resolve all quantum inputs and outputs. 

1338 for dataset_type, refDict in quantum.inputs.items(): 

1339 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1340 for dataset_type, refDict in quantum.outputs.items(): 

1341 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1342 

1343 # Resolve task initInputs and initOutputs. 

1344 for dataset_type, refDict in task.initInputs.items(): 

1345 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1346 for dataset_type, refDict in task.initOutputs.items(): 

1347 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1348 

1349 # Actually remove any quanta that we decided to skip above. 

1350 if dataIdsSucceeded: 

1351 if skip_collections_wildcard is not None: 

1352 _LOG.debug( 

1353 "Pruning successful %d quanta for task with label '%s' because all of their " 

1354 "outputs exist or metadata was written successfully.", 

1355 len(dataIdsSucceeded), 

1356 task.taskDef.label, 

1357 ) 

1358 for dataId in dataIdsSucceeded: 

1359 del task.quanta[dataId] 

1360 elif clobberOutputs and run_exists: 

1361 _LOG.info( 

1362 "Found %d successful quanta for task with label '%s' " 

1363 "that will need to be clobbered during execution.", 

1364 len(dataIdsSucceeded), 

1365 task.taskDef.label, 

1366 ) 

1367 if dataIdsFailed: 

1368 if clobberOutputs and run_exists: 

1369 _LOG.info( 

1370 "Found %d failed/incomplete quanta for task with label '%s' " 

1371 "that will need to be clobbered during execution.", 

1372 len(dataIdsFailed), 

1373 task.taskDef.label, 

1374 ) 

1375 

1376 # Collect initOutputs that do not belong to any task. 

1377 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1378 for task in self.tasks: 

1379 global_dataset_types -= set(task.initOutputs) 
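# Whatever remains is not an init-output of any single task; these are
# pipeline-level init-outputs (for example, the software-versions
# "packages" dataset, assuming the usual automatic connections are in play).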

1380 if global_dataset_types: 

1381 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1382 for dataset_type, refDict in self.globalInitOutputs.items(): 

1383 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1384 

1385 def makeQuantumGraph( 

1386 self, 

1387 registry: Registry, 

1388 metadata: Mapping[str, Any] | None = None, 

1389 datastore: Datastore | None = None, 

1390 ) -> QuantumGraph: 

1391 """Create a `QuantumGraph` from the quanta already present in 

1392 the scaffolding data structure. 

1393 

1394 Parameters 

1395 ---------- 

1396 registry : `lsst.daf.butler.Registry` 

1397 Registry for the data repository; used for all data ID queries. 

1398 metadata : `~collections.abc.Mapping` of `str` to primitives, optional 

1399 This is an optional parameter of extra data to carry with the 

1400 graph. Entries in this mapping should be serializable to

1401 JSON. 

1402 datastore : `~lsst.daf.butler.Datastore`, optional 

1403 If not `None` then fill datastore records in each generated 

1404 Quantum. 

1405 

1406 Returns 

1407 ------- 

1408 graph : `QuantumGraph` 

1409 The full `QuantumGraph`. 

1410 """ 

1411 

1412 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1413 """Extract all DatasetRefs from the dictionaries."""

1414 for ref_dict in dataset_dict.values(): 

1415 for holder in ref_dict.values(): 

1416 yield holder.resolved_ref 

1417 

1418 datastore_records: Mapping[str, DatastoreRecordData] | None = None 

1419 if datastore is not None: 

1420 datastore_records = datastore.export_records( 

1421 itertools.chain( 

1422 _make_refs(self.inputs), 

1423 _make_refs(self.initInputs), 

1424 _make_refs(self.prerequisites), 

1425 ) 

1426 ) 
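# The result maps each datastore name to the records exported for these
# refs; makeQuantumSet below uses this mapping to attach datastore records
# to the generated quanta so execution can later proceed without direct
# registry access.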

1427 

1428 graphInput: dict[TaskDef, set[Quantum]] = {} 

1429 for task in self.tasks: 

1430 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records) 

1431 graphInput[task.taskDef] = qset 

1432 

1433 taskInitInputs = { 

1434 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values() 

1435 for task in self.tasks 

1436 } 

1437 taskInitOutputs = { 

1438 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values() 

1439 for task in self.tasks 

1440 } 

1441 

1442 globalInitOutputs: list[DatasetRef] = [] 

1443 if self.globalInitOutputs is not None: 

1444 for refs_dict in self.globalInitOutputs.values(): 

1445 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values()) 

1446 

1447 graph = QuantumGraph( 

1448 graphInput, 

1449 metadata=metadata, 

1450 pruneRefs=list(self.missing.iter_resolved_refs()), 

1451 universe=self.dimensions.universe, 

1452 initInputs=taskInitInputs, 

1453 initOutputs=taskInitOutputs, 

1454 globalInitOutputs=globalInitOutputs, 

1455 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1456 ) 

1457 return graph 
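    # Minimal usage sketch (mirrors the call made by GraphBuilder.makeGraph at
    # the bottom of this module; "scaffolding" and "butler" are placeholders):
    #
    #     qgraph = scaffolding.makeQuantumGraph(
    #         registry=butler.registry,
    #         metadata={"comment": "example run"},
    #         datastore=butler.datastore,
    #     )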

1458 

1459 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1460 """Make a list of all dataset types used by a graph as defined in 

1461 registry. 

1462 """ 

1463 chain = [ 

1464 self.initInputs, 

1465 self.initIntermediates, 

1466 self.initOutputs, 

1467 self.inputs, 

1468 self.intermediates, 

1469 self.outputs, 

1470 self.prerequisites, 

1471 ] 

1472 if self.globalInitOutputs is not None: 

1473 chain.append(self.globalInitOutputs) 

1474 

1475 # Collect names of all dataset types. 

1476 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain)) 

1477 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1478 

1479 # Check for types that do not exist in registry yet: 

1480 # - inputs must exist 

1481 # - intermediates and outputs may not exist, but there must not be 

1482 # more than one definition (e.g. differing in storage class) 

1483 # - prerequisites may not exist; treat them the same as outputs here

1484 for dstype in itertools.chain(self.initInputs, self.inputs): 

1485 if dstype.name not in dataset_types: 

1486 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1487 

1488 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1489 chain = [ 

1490 self.initIntermediates, 

1491 self.initOutputs, 

1492 self.intermediates, 

1493 self.outputs, 

1494 self.prerequisites, 

1495 ] 

1496 if self.globalInitOutputs is not None: 

1497 chain.append(self.globalInitOutputs) 

1498 for dstype in itertools.chain(*chain): 

1499 if dstype.name not in dataset_types: 

1500 new_outputs[dstype.name].add(dstype) 

1501 for name, dstypes in new_outputs.items(): 

1502 if len(dstypes) > 1: 

1503 raise ValueError( 

1504 "Pipeline contains multiple definitions for a dataset type " 

1505 f"which is not defined in registry yet: {dstypes}" 

1506 ) 

1507 elif len(dstypes) == 1: 

1508 dataset_types[name] = dstypes.pop() 

1509 

1510 return dataset_types.values() 
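    # Example of the conflict guarded against above (names and storage classes
    # are hypothetical): two tasks declaring an as-yet-unregistered output
    # "deep_coadd" with storage classes "ExposureF" and "ImageF" respectively
    # would put two entries under that name and trigger the ValueError, since
    # there is no registry definition to arbitrate between them.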

1511 

1512 

1513# ------------------------ 

1514# Exported definitions -- 

1515# ------------------------ 

1516 

1517 

1518class GraphBuilderError(Exception): 

1519 """Base class for exceptions generated by graph builder.""" 

1520 

1521 pass 

1522 

1523 

1524class OutputExistsError(GraphBuilderError): 

1525 """Exception generated when output datasets already exist.""" 

1526 

1527 pass 

1528 

1529 

1530class PrerequisiteMissingError(GraphBuilderError): 

1531 """Exception generated when a prerequisite dataset does not exist.""" 

1532 

1533 pass 

1534 

1535 

1536class GraphBuilder: 

1537 """GraphBuilder class is responsible for building a task execution graph from

1538 a Pipeline. 

1539 

1540 Parameters 

1541 ---------- 

1542 registry : `~lsst.daf.butler.Registry` 

1543 Data butler instance. 

1544 skipExistingIn 

1545 Expressions representing the collections to search for existing 

1546 output datasets that should be skipped. See 

1547 :ref:`daf_butler_ordered_collection_searches`. 

1548 clobberOutputs : `bool`, optional 

1549 If `True` (default), allow quanta to be created even if partial outputs

1550 exist; this requires the same behavior to be enabled when

1551 executing. 

1552 datastore : `~lsst.daf.butler.Datastore`, optional 

1553 If not `None` then fill datastore records in each generated Quantum. 

1554 """ 

1555 

1556 def __init__( 

1557 self, 

1558 registry: Registry, 

1559 skipExistingIn: Any = None, 

1560 clobberOutputs: bool = True, 

1561 datastore: Datastore | None = None, 

1562 ): 

1563 self.registry = registry 

1564 self.dimensions = registry.dimensions 

1565 self.skipExistingIn = skipExistingIn 

1566 self.clobberOutputs = clobberOutputs 

1567 self.datastore = datastore 

1568 

1569 def makeGraph( 

1570 self, 

1571 pipeline: Pipeline | Iterable[TaskDef], 

1572 collections: Any, 

1573 run: str, 

1574 userQuery: str | None, 

1575 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1576 metadata: Mapping[str, Any] | None = None, 

1577 bind: Mapping[str, Any] | None = None, 

1578 dataId: DataCoordinate | None = None, 

1579 ) -> QuantumGraph: 

1580 """Create execution graph for a pipeline. 

1581 

1582 Parameters 

1583 ---------- 

1584 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1585 Pipeline definition, task names/classes and their configs. 

1586 collections 

1587 Expressions representing the collections to search for input 

1588 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1589 run : `str` 

1590 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1591 output datasets. The collection does not have to exist; it will be

1592 created when the graph is executed.

1593 userQuery : `str` 

1594 String which defines user-defined selection for registry; should be

1595 empty or `None` if there are no restrictions on data selection.

1596 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1597 The query constraint variant that should be used to constrain the

1598 query based on dataset existence; defaults to

1599 `DatasetQueryConstraintVariant.ALL`. 

1600 metadata : `~collections.abc.Mapping` of `str` to primitives, optional

1601 This is an optional parameter of extra data to carry with the 

1602 graph. Entries in this mapping should be serializable to

1603 JSON. 

1604 bind : `~collections.abc.Mapping`, optional 

1605 Mapping containing literal values that should be injected into the 

1606 ``userQuery`` expression, keyed by the identifiers they replace. 

1607 dataId : `lsst.daf.butler.DataCoordinate`, optional 

1608 Data ID that should also be included in the query constraint. 

1609 

1610 Returns 

1611 ------- 

1612 graph : `QuantumGraph` 

1613 The generated execution graph for the pipeline.

1614 Raises 

1615 ------ 

1616 UserExpressionError 

1617 Raised when user expression cannot be parsed. 

1618 OutputExistsError 

1619 Raised when output datasets already exist. 

1620 Exception 

1621 Other exception types may be raised by underlying registry

1622 classes. 

1623 """ 

1624 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1625 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1626 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1627 if dataId is None: 

1628 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1629 if isinstance(pipeline, Pipeline): 

1630 dataId = pipeline.get_data_id(self.registry.dimensions).union(dataId) 

1631 with scaffolding.connectDataIds( 

1632 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1633 ) as commonDataIds: 

1634 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1635 scaffolding.resolveDatasetRefs( 

1636 self.registry, 

1637 collections, 

1638 run, 

1639 commonDataIds, 

1640 skipExistingIn=self.skipExistingIn, 

1641 clobberOutputs=self.clobberOutputs, 

1642 constrainedByAllDatasets=condition, 

1643 ) 

1644 return scaffolding.makeQuantumGraph( 

1645 registry=self.registry, metadata=metadata, datastore=self.datastore 

1646 )
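# Illustrative end-to-end sketch (not part of this module). It assumes an
# existing data repository; the repository path, pipeline file, collection
# and run names below are placeholders:
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import GraphBuilder, Pipeline
#
#     butler = Butler("/repo/example", writeable=False)
#     pipeline = Pipeline.from_uri("example_pipeline.yaml")
#     builder = GraphBuilder(butler.registry, clobberOutputs=True)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["EXAMPLE/defaults"],
#         run="u/someone/example-run",
#         userQuery="instrument = 'HSC' AND visit = my_visit",
#         bind={"my_visit": 12345},
#     )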