Coverage for python/lsst/pipe/base/graphBuilder.py: 16%

546 statements  

coverage.py v7.2.7, created at 2023-06-16 09:02 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Module defining GraphBuilder class and related methods. 

24""" 

25 

26__all__ = ["GraphBuilder"] 

27 

28# ------------------------------- 

29# Imports of standard modules -- 

30# ------------------------------- 

31import itertools 

32import logging 

33from collections import ChainMap, defaultdict 

34from collections.abc import Collection, Iterable, Iterator, Mapping 

35from contextlib import contextmanager 

36from dataclasses import dataclass 

37from typing import Any 

38 

39from lsst.daf.butler import ( 

40 CollectionType, 

41 DataCoordinate, 

42 DatasetRef, 

43 DatasetType, 

44 Datastore, 

45 DatastoreRecordData, 

46 DimensionGraph, 

47 DimensionUniverse, 

48 NamedKeyDict, 

49 NamedValueSet, 

50 Quantum, 

51 Registry, 

52) 

53from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

54from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

55from lsst.daf.butler.registry.wildcards import CollectionWildcard 

56 

57# ----------------------------- 

58# Imports for other modules -- 

59# ----------------------------- 

60from . import automatic_connection_constants as acc 

61from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

62from ._status import NoWorkFound 

63from .connections import AdjustQuantumHelper, iterConnections 

64from .graph import QuantumGraph 

65from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

66 

67# ---------------------------------- 

68# Local non-exported definitions -- 

69# ---------------------------------- 

70 

71_LOG = logging.getLogger(__name__) 

72 

73 

74@dataclass 

75class _RefHolder: 

76 r"""Placeholder for `~lsst.daf.butler.DatasetRef` representing a future 

77 resolved reference. 

78 

79 As we have eliminated unresolved `~lsst.daf.butler.DatasetRef`\s, we now use 

80 `None` to represent a reference that is yet to be resolved. Information 

81 about its corresponding dataset type and data coordinate is stored in the 

82 `_DatasetDict` mapping. 

83 """ 

84 

85 dataset_type: DatasetType 

86 """Dataset type of the dataset to be created later. It is stored here rather 

87 than inferred from `_DatasetDict` because a `_RefHolder` can be shared 

88 between different compatible dataset types."""

89 

90 ref: DatasetRef | None = None 

91 """Dataset reference, initially `None`, created when all datasets are 

92 resolved. 

93 """ 

94 

95 @property 

96 def resolved_ref(self) -> DatasetRef: 

97 """Access the resolved reference; this should only be accessed after the 

98 reference is set (`~lsst.daf.butler.DatasetRef`). 

99 """ 

100 assert self.ref is not None, "Dataset reference is not set." 

101 return self.ref 

102 
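# A minimal illustrative sketch (hypothetical helper, not used elsewhere in
# this module) of the intended _RefHolder life cycle: a holder starts with
# only a dataset type, and its ``ref`` is filled in later, e.g. by
# _DatasetIdMaker or by the follow-up queries in
# _PipelineScaffolding.resolveDatasetRefs.
def _example_ref_holder_lifecycle(dataset_type: DatasetType, resolved: DatasetRef) -> DatasetRef:
    holder = _RefHolder(dataset_type)  # unresolved: holder.ref is None
    holder.ref = resolved  # resolution happens later
    return holder.resolved_ref  # only safe to access once the ref is set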

103 

104class _DatasetDict(NamedKeyDict[DatasetType, dict[DataCoordinate, _RefHolder]]): 

105 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested 

106 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that 

107 type. 

108 

109 Parameters 

110 ---------- 

111 args 

112 Positional arguments are forwarded to the `dict` constructor. 

113 universe : `~lsst.daf.butler.DimensionUniverse` 

114 Universe of all possible dimensions. 

115 """ 

116 

117 def __init__(self, *args: Any, universe: DimensionUniverse): 

118 super().__init__(*args) 

119 self.universe = universe 

120 

121 @classmethod 

122 def fromDatasetTypes( 

123 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

124 ) -> _DatasetDict: 

125 """Construct a dictionary from a flat iterable of 

126 `~lsst.daf.butler.DatasetType` keys. 

127 

128 Parameters 

129 ---------- 

130 datasetTypes : `~collections.abc.Iterable` of \ 

131 `~lsst.daf.butler.DatasetType` 

132 DatasetTypes to use as keys for the dict. Values will be empty 

133 dictionaries. 

134 universe : `~lsst.daf.butler.DimensionUniverse` 

135 Universe of all possible dimensions. 

136 

137 Returns 

138 ------- 

139 dictionary : `_DatasetDict` 

140 A new `_DatasetDict` instance. 

141 """ 

142 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

143 

144 @classmethod 

145 def fromSubset( 

146 cls, 

147 datasetTypes: Collection[DatasetType], 

148 first: _DatasetDict, 

149 *rest: _DatasetDict, 

150 ) -> _DatasetDict: 

151 """Return a new dictionary by extracting items corresponding to the 

152 given keys from one or more existing dictionaries. 

153 

154 Parameters 

155 ---------- 

156 datasetTypes : `~collections.abc.Iterable` of \ 

157 `~lsst.daf.butler.DatasetType` 

158 DatasetTypes to use as keys for the dict. Values will be obtained 

159 by lookups against ``first`` and ``rest``. 

160 first : `_DatasetDict` 

161 Another dictionary from which to extract values. 

162 rest 

163 Additional dictionaries from which to extract values. 

164 

165 Returns 

166 ------- 

167 dictionary : `_DatasetDict` 

168 A new dictionary instance. 

169 """ 

170 combined = ChainMap(first, *rest) 

171 

172 # Dataset types known to match immediately can be processed 

173 # without checks. 

174 matches = combined.keys() & set(datasetTypes) 

175 _dict = {k: combined[k] for k in matches} 

176 

177 if len(_dict) < len(datasetTypes): 

178 # Work out which ones are missing. 

179 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

180 

181 # Get the known names for comparison. 

182 combined_by_name = {k.name: k for k in combined} 

183 

184 missing = set() 

185 incompatible = {} 

186 for datasetType in missing_datasetTypes: 

187 # The dataset type is not found. It may not be listed 

188 # or it may be that it is there with the same name 

189 # but different definition. 

190 if datasetType.name in combined_by_name: 

191 # This implies some inconsistency in definitions 

192 # for connections. If there is support for storage 

193 # class conversion we can let it slide. 

194 # At this point we do not know 

195 # where the inconsistency is, but trust that 

196 # downstream code will be more explicit about input 

197 # vs output incompatibilities. 

198 existing = combined_by_name[datasetType.name] 

199 convertible_to_existing = existing.is_compatible_with(datasetType) 

200 convertible_from_existing = datasetType.is_compatible_with(existing) 

201 if convertible_to_existing and convertible_from_existing: 

202 _LOG.debug( 

203 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

204 datasetType.name, 

205 datasetType.storageClass_name, 

206 existing.storageClass_name, 

207 ) 

208 _dict[datasetType] = combined[existing] 

209 elif convertible_to_existing or convertible_from_existing: 

210 # We'd need to refactor a fair amount to recognize 

211 # whether this is an error or not, so I'm not going to 

212 # bother until we need to do that for other reasons 

213 # (it won't be too long). 

214 _LOG.info( 

215 "Dataset type %s is present with multiple only partially-compatible storage " 

216 "classes %s and %s.", 

217 datasetType.name, 

218 datasetType.storageClass_name, 

219 existing.storageClass_name, 

220 ) 

221 _dict[datasetType] = combined[existing] 

222 else: 

223 incompatible[datasetType] = existing 

224 else: 

225 missing.add(datasetType) 

226 

227 if missing or incompatible: 

228 reasons = [] 

229 if missing: 

230 reasons.append( 

231 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known " 

232 f"types: [{', '.join(d.name for d in combined)}]." 

233 ) 

234 if incompatible: 

235 for x, y in incompatible.items(): 

236 reasons.append(f"{x} incompatible with {y}") 

237 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

238 

239 return cls(_dict, universe=first.universe) 

240 

241 @property 

242 def dimensions(self) -> DimensionGraph: 

243 """The union of all dimensions used by all dataset types in this 

244 dictionary, including implied dependencies (`DimensionGraph`). 

245 """ 

246 base = self.universe.empty 

247 if len(self) == 0: 

248 return base 

249 return base.union(*[datasetType.dimensions for datasetType in self.keys()]) 

250 

251 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]: 

252 """Unpack nested single-element `~lsst.daf.butler.DatasetRef` dicts 

253 into a new mapping with `~lsst.daf.butler.DatasetType` keys and 

254 `~lsst.daf.butler.DatasetRef` values. 

255 

256 This method assumes that each nested dictionary contains exactly one item, 

257 as is the case for all "init" datasets. 

258 

259 Parameters 

260 ---------- 

261 storage_classes : `dict` [ `str`, `str` ] 

262 Mapping from dataset type name to the storage class to use for that 

263 dataset type. These are typically the storage classes declared 

264 for a particular task, which may differ from the data repository 

265 definitions. 

266 

267 Returns 

268 ------- 

269 dictionary : `~lsst.daf.butler.NamedKeyDict` 

270 Dictionary mapping `~lsst.daf.butler.DatasetType` to 

271 `~lsst.daf.butler.DatasetRef`, with both 

272 `~lsst.daf.butler.DatasetType` instances and string names usable 

273 as keys. 

274 """ 

275 return NamedKeyDict( 

276 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()} 

277 ) 

278 

279 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

280 """Unpack nested multi-element `~lsst.daf.butler.DatasetRef` dicts into 

281 a new mapping with `~lsst.daf.butler.DatasetType` keys and `list` of 

282 `~lsst.daf.butler.DatasetRef` values. 

283 

284 Parameters 

285 ---------- 

286 storage_classes : `dict` [ `str`, `str` ] 

287 Mapping from dataset type name to the storage class to use for that 

288 dataset type. These are typically the storage classes declared 

289 for a particular task, which may differ from the data repository 

290 definitions. 

291 

292 Returns 

293 ------- 

294 dictionary : `~lsst.daf.butler.NamedKeyDict` 

295 Dictionary mapping `~lsst.daf.butler.DatasetType` to `list` of 

296 `~lsst.daf.butler.DatasetRef`, with both 

297 `~lsst.daf.butler.DatasetType` instances and string names usable 

298 as keys. 

299 """ 

300 result = {} 

301 for dataset_type, holders in self.items(): 

302 if ( 

303 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name) 

304 ) != dataset_type.storageClass_name: 

305 dataset_type = dataset_type.overrideStorageClass(override) 

306 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()] 

307 else: 

308 refs = [holder.resolved_ref for holder in holders.values()] 

309 result[dataset_type] = refs 

310 return NamedKeyDict(result) 

311 

312 def extract( 

313 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

314 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]: 

315 """Iterate over the contained `~lsst.daf.butler.DatasetRef` instances 

316 that match the given `~lsst.daf.butler.DatasetType` and data IDs. 

317 

318 Parameters 

319 ---------- 

320 datasetType : `~lsst.daf.butler.DatasetType` 

321 Dataset type to match. 

322 dataIds : `~collections.abc.Iterable` \ 

323 [ `~lsst.daf.butler.DataCoordinate` ] 

324 Data IDs to match. 

325 

326 Returns 

327 ------- 

328 refs : `~collections.abc.Iterator` [ `~lsst.daf.butler.DatasetRef` ] 

329 DatasetRef instances for which ``ref.datasetType == datasetType`` 

330 and ``ref.dataId`` is in ``dataIds``. 

331 """ 

332 refs = self[datasetType] 

333 return ((dataId, refs[dataId].ref) for dataId in dataIds) 

334 

335 def isdisjoint(self, other: _DatasetDict) -> bool: 

336 """Test whether ``self`` and ``other`` have any datasets in common. 

337 

338 Datasets are considered in common if they have the same *parent* 

339 dataset type name and data ID; storage classes and components are not 

340 considered. 

341 """ 

342 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()} 

343 for k, v in other.items(): 

344 parent_name, _ = k.nameAndComponent() 

345 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()): 

346 return False 

347 return True 

348 

349 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

350 """Iterate over all DatasetRef instances held by this data structure, 

351 assuming that each `_RefHolder` already carries a resolved ref. 

352 """ 

353 for holders_by_data_id in self.values(): 

354 for holder in holders_by_data_id.values(): 

355 yield holder.resolved_ref 

356 
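# A simplified, self-contained sketch of the key-matching step performed by
# _DatasetDict.fromSubset, with plain strings standing in for DatasetType
# keys: exact matches are pulled from the chained parent dictionaries and
# anything left over is reported as missing.  The storage-class compatibility
# fallback used above is deliberately omitted here.
def _example_subset_matching(wanted: Collection[str], *parents: dict[str, dict]) -> dict[str, dict]:
    combined: ChainMap[str, dict] = ChainMap(*parents)
    extracted = {key: combined[key] for key in combined.keys() & set(wanted)}
    if missing := set(wanted) - extracted.keys():
        raise KeyError(f"Dataset types not present in list of known types: {sorted(missing)}")
    return extracted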

357 

358class _QuantumScaffolding: 

359 """Helper class aggregating information about a `Quantum`, used when 

360 constructing a `QuantumGraph`. 

361 

362 See `_PipelineScaffolding` for a top-down description of the full 

363 scaffolding data structure. 

364 

365 Parameters 

366 ---------- 

367 task : _TaskScaffolding 

368 Back-reference to the helper object for the `PipelineTask` this quantum 

369 represents an execution of. 

370 dataId : `~lsst.daf.butler.DataCoordinate` 

371 Data ID for this quantum. 

372 """ 

373 

374 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

375 self.task = task 

376 self.dataId = dataId 

377 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

378 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

379 self.prerequisites = _DatasetDict.fromDatasetTypes( 

380 task.prerequisites.keys(), universe=dataId.universe 

381 ) 

382 

383 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

384 

385 def __repr__(self) -> str: 

386 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

387 

388 task: _TaskScaffolding 

389 """Back-reference to the helper object for the `PipelineTask` this quantum 

390 represents an execution of. 

391 """ 

392 

393 dataId: DataCoordinate 

394 """Data ID for this quantum. 

395 """ 

396 

397 inputs: _DatasetDict 

398 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` inputs to 

399 this quantum. 

400 

401 This is initialized to map each `~lsst.daf.butler.DatasetType` to an empty 

402 dictionary at construction. Those nested dictionaries are populated 

403 (with data IDs as keys) with unresolved `~lsst.daf.butler.DatasetRef` 

404 instances in `_PipelineScaffolding.connectDataIds`. 

405 """ 

406 

407 outputs: _DatasetDict 

408 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` outputs of 

409 this quantum. 

410 """ 

411 

412 prerequisites: _DatasetDict 

413 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` prerequisite 

414 inputs to this quantum. 

415 """ 

416 

417 def makeQuantum(self, datastore_records: Mapping[str, DatastoreRecordData] | None = None) -> Quantum: 

418 """Transform the scaffolding object into a true `Quantum` instance. 

419 

420 Parameters 

421 ---------- 

422 datastore_records : `~collections.abc.Mapping` [ `str`, \ 

423 `~lsst.daf.butler.DatastoreRecordData` ], optional 

424 If not `None` then fill datastore records in each generated Quantum 

425 using the records from this structure. 

426 

427 Returns 

428 ------- 

429 quantum : `Quantum` 

430 An actual `Quantum` instance. 

431 """ 

432 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes) 

433 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes)) 

434 # Give the task's Connections class an opportunity to remove some 

435 # inputs, or complain if they are unacceptable. 

436 # This will raise if one of the check conditions is not met, which is 

437 # the intended behavior. 

438 # If it raises NoWorkFound, there is a bug in the QG algorithm 

439 # or adjustQuantum is incorrectly trying to make a prerequisite 

440 # input behave like a regular input; adjustQuantum should only raise 

441 # NoWorkFound if a regular input is missing, and it shouldn't be 

442 # possible for us to have generated ``self`` if that's true. 

443 helper = AdjustQuantumHelper( 

444 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes) 

445 ) 

446 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

447 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes) 

448 quantum_records: Mapping[str, DatastoreRecordData] | None = None 

449 if datastore_records is not None: 

450 quantum_records = {} 

451 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

452 input_refs += list(initInputs.values()) 

453 input_ids = set(ref.id for ref in input_refs) 

454 for datastore_name, records in datastore_records.items(): 

455 matching_records = records.subset(input_ids) 

456 if matching_records is not None: 

457 quantum_records[datastore_name] = matching_records 

458 return Quantum( 

459 taskName=self.task.taskDef.taskName, 

460 taskClass=self.task.taskDef.taskClass, 

461 dataId=self.dataId, 

462 initInputs=initInputs, 

463 inputs=helper.inputs, 

464 outputs=helper.outputs, 

465 datastore_records=quantum_records, 

466 ) 

467 
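# A minimal sketch of the datastore-record filtering done in
# _QuantumScaffolding.makeQuantum: for each datastore, keep only the records
# that refer to the quantum's input dataset IDs.  It assumes the same
# ``DatastoreRecordData.subset`` behaviour used above (``None`` means no
# matching records).
def _example_filter_datastore_records(
    datastore_records: Mapping[str, DatastoreRecordData],
    input_refs: Iterable[DatasetRef],
) -> dict[str, DatastoreRecordData]:
    input_ids = {ref.id for ref in input_refs}
    quantum_records = {}
    for datastore_name, records in datastore_records.items():
        if (matching_records := records.subset(input_ids)) is not None:
            quantum_records[datastore_name] = matching_records
    return quantum_records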

468 

469@dataclass 

470class _TaskScaffolding: 

471 """Helper class aggregating information about a `PipelineTask`, used when 

472 constructing a `QuantumGraph`. 

473 

474 See `_PipelineScaffolding` for a top-down description of the full 

475 scaffolding data structure. 

476 

477 Parameters 

478 ---------- 

479 taskDef : `TaskDef` 

480 Data structure that identifies the task class and its config. 

481 parent : `_PipelineScaffolding` 

482 The parent data structure that will hold the instance being 

483 constructed. 

484 datasetTypes : `TaskDatasetTypes` 

485 Data structure that categorizes the dataset types used by this task. 

486 """ 

487 

488 def __init__( 

489 self, 

490 taskDef: TaskDef, 

491 parent: _PipelineScaffolding, 

492 datasetTypes: TaskDatasetTypes, 

493 ): 

494 universe = parent.dimensions.universe 

495 self.taskDef = taskDef 

496 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

497 assert self.dimensions.issubset(parent.dimensions) 

498 # Initialize _DatasetDicts as subsets of the one or two 

499 # corresponding dicts in the parent _PipelineScaffolding. 

500 self.initInputs = _DatasetDict.fromSubset( 

501 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

502 ) 

503 self.initOutputs = _DatasetDict.fromSubset( 

504 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

505 ) 

506 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

507 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

508 self.prerequisites = _DatasetDict.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

509 self.dataIds: set[DataCoordinate] = set() 

510 self.quanta = {} 

511 self.storage_classes = { 

512 connection.name: connection.storageClass 

513 for connection in self.taskDef.connections.allConnections.values() 

514 } 

515 self.storage_classes[ 

516 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

517 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS 

518 self.storage_classes[ 

519 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

520 ] = acc.LOG_OUTPUT_STORAGE_CLASS 

521 self.storage_classes[ 

522 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

523 ] = acc.METADATA_OUTPUT_STORAGE_CLASS 

524 

525 def __repr__(self) -> str: 

526 # Default dataclass-injected __repr__ gets caught in an infinite loop 

527 # because of back-references. 

528 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

529 

530 taskDef: TaskDef 

531 """Data structure that identifies the task class and its config 

532 (`TaskDef`). 

533 """ 

534 

535 dimensions: DimensionGraph 

536 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

537 """ 

538 

539 initInputs: _DatasetDict 

540 """Dictionary containing information about datasets used to construct this 

541 task (`_DatasetDict`). 

542 """ 

543 

544 initOutputs: _DatasetDict 

545 """Dictionary containing information about datasets produced as a 

546 side-effect of constructing this task (`_DatasetDict`). 

547 """ 

548 

549 inputs: _DatasetDict 

550 """Dictionary containing information about datasets used as regular, 

551 graph-constraining inputs to this task (`_DatasetDict`). 

552 """ 

553 

554 outputs: _DatasetDict 

555 """Dictionary containing information about datasets produced by this task 

556 (`_DatasetDict`). 

557 """ 

558 

559 prerequisites: _DatasetDict 

560 """Dictionary containing information about input datasets that must be 

561 present in the repository before any Pipeline containing this task is run 

562 (`_DatasetDict`). 

563 """ 

564 

565 quanta: dict[DataCoordinate, _QuantumScaffolding] 

566 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

567 this task with that data ID. 

568 """ 

569 

570 storage_classes: dict[str, str] 

571 """Mapping from dataset type name to storage class declared by this task. 

572 """ 

573 

574 def makeQuantumSet( 

575 self, 

576 missing: _DatasetDict, 

577 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

578 ) -> set[Quantum]: 

579 """Create a `set` of `Quantum` from the information in ``self``. 

580 

581 Parameters 

582 ---------- 

583 missing : `_DatasetDict` 

584 Input datasets that have not been found. 

585 datastore_records : `dict` 

586 Record from the datastore to export with quanta. 

587 

588 Returns 

589 ------- 

590 nodes : `set` of `Quantum` 

591 The `Quantum` elements corresponding to this task. 

592 """ 

593 outputs = set() 

594 for q in self.quanta.values(): 

595 try: 

596 tmpQuanta = q.makeQuantum(datastore_records) 

597 outputs.add(tmpQuanta) 

598 except (NoWorkFound, FileNotFoundError) as exc: 

599 if not missing.isdisjoint(q.inputs): 

600 # This is a node that is known to be pruned later and 

601 # should be left in even though some follow up queries 

602 # fail. This allows the pruning to start from this quantum 

603 # with known issues, and prune other nodes it touches. 

604 inputs = q.inputs.unpackMultiRefs(self.storage_classes) 

605 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes)) 

606 tmpQuantum = Quantum( 

607 taskName=q.task.taskDef.taskName, 

608 taskClass=q.task.taskDef.taskClass, 

609 dataId=q.dataId, 

610 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes), 

611 inputs=inputs, 

612 outputs=q.outputs.unpackMultiRefs(self.storage_classes), 

613 ) 

614 outputs.add(tmpQuantum) 

615 else: 

616 raise exc 

617 return outputs 

618 
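# A small sketch of how a per-task storage-class mapping like
# _TaskScaffolding.storage_classes is assembled: storage classes declared by
# the task's connections come first, then the automatic config, log, and
# metadata init-output dataset types.  ``label`` and ``connections`` (pairs of
# dataset type name and storage class name) are hypothetical inputs.
def _example_task_storage_classes(label: str, connections: Iterable[tuple[str, str]]) -> dict[str, str]:
    storage_classes = dict(connections)
    storage_classes[
        acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=label)
    ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS
    storage_classes[acc.LOG_OUTPUT_TEMPLATE.format(label=label)] = acc.LOG_OUTPUT_STORAGE_CLASS
    storage_classes[acc.METADATA_OUTPUT_TEMPLATE.format(label=label)] = acc.METADATA_OUTPUT_STORAGE_CLASS
    return storage_classes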

619 

620class _DatasetIdMaker: 

621 """Helper class which generates random dataset UUIDs for unresolved 

622 datasets. 

623 """ 

624 

625 def __init__(self, run: str): 

626 self.run = run 

627 # Cache of dataset refs generated so far. 

628 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

629 

630 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef: 

631 # For components we need their parent dataset ID. 

632 if dataset_type.isComponent(): 

633 parent_type = dataset_type.makeCompositeDatasetType() 

634 # Parent should be resolved if this is an existing input, or it 

635 # should be in the cache already if it is an intermediate. 

636 key = parent_type, data_id 

637 if key not in self.resolved: 

638 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}") 

639 parent_ref = self.resolved[key] 

640 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False) 

641 

642 key = dataset_type, data_id 

643 if (resolved := self.resolved.get(key)) is None: 

644 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False) 

645 self.resolved[key] = resolved 

646 return resolved 

647 

648 def resolveDict(self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder]) -> None: 

649 """Resolve all unresolved references in the provided dictionary.""" 

650 for data_id, holder in refs.items(): 

651 if holder.ref is None: 

652 holder.ref = self.resolveRef(holder.dataset_type, data_id) 

653 
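# A brief usage sketch (hypothetical helper) for _DatasetIdMaker: once data
# IDs have been collected into ``{data_id: _RefHolder}`` dictionaries, a
# single maker resolves every holder that does not yet carry a ref, reusing
# its cache so the same (dataset type, data ID) pair always maps to the same
# DatasetRef in the output run.
def _example_resolve_predicted_outputs(
    run: str, dataset_type: DatasetType, holders: dict[DataCoordinate, _RefHolder]
) -> list[DatasetRef]:
    maker = _DatasetIdMaker(run)
    maker.resolveDict(dataset_type, holders)
    return [holder.resolved_ref for holder in holders.values()]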

654 
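# A condensed sketch (hypothetical driver, assuming a ready-made Registry, a
# collections expression, an output run name, and a user query) of the
# four-step flow described in the _PipelineScaffolding notes below; the real
# entry point is GraphBuilder.makeGraph, which also handles the external data
# ID and datastore records.
def _example_build_quantum_graph(
    pipeline: Pipeline, registry: Registry, collections: Any, run: str, userQuery: str | None
) -> QuantumGraph:
    scaffolding = _PipelineScaffolding(pipeline, registry=registry)
    empty_data_id = DataCoordinate.makeEmpty(registry.dimensions)
    with scaffolding.connectDataIds(registry, collections, userQuery, empty_data_id) as commonDataIds:
        scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
    return scaffolding.makeQuantumGraph(registry)
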

655@dataclass 

656class _PipelineScaffolding: 

657 """A helper data structure that organizes the information involved in 

658 constructing a `QuantumGraph` for a `Pipeline`. 

659 

660 Parameters 

661 ---------- 

662 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

663 Sequence of tasks from which a graph is to be constructed. Must 

664 have nested task classes already imported. 

665 universe : `~lsst.daf.butler.DimensionUniverse` 

666 Universe of all possible dimensions. 

667 

668 Notes 

669 ----- 

670 The scaffolding data structure contains nested data structures for both 

671 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

672 data structures are shared between the pipeline-level structure (which 

673 aggregates all datasets and categorizes them from the perspective of the 

674 complete pipeline) and the individual tasks that use them as inputs and 

675 outputs. 

676 

677 `QuantumGraph` construction proceeds in four steps, with each corresponding 

678 to a different `_PipelineScaffolding` method: 

679 

680 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

681 the DatasetTypes used by the pipeline (delegating to 

682 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

683 nested `_TaskScaffolding` and `_DatasetDict` objects. 

684 

685 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

686 returns related tuples of all dimensions used to identify any regular 

687 input, output, and intermediate datasets (not prerequisites). We then 

688 iterate over these tuples of related dimensions, identifying the subsets 

689 that correspond to distinct data IDs for each task and dataset type, 

690 and then create `_QuantumScaffolding` objects. 

691 

692 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

693 dataset data IDs previously identified, transforming unresolved 

694 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

695 up prerequisite datasets for all quanta. 

696 

697 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

698 per-task `_QuantumScaffolding` objects. 

699 """ 

700 

701 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry): 

702 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

703 self.tasks = [] 

704 # Aggregate and categorize the DatasetTypes in the Pipeline. 

705 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

706 # Construct dictionaries that map those DatasetTypes to structures 

707 # that will (later) hold additional information about them. 

708 for attr in ( 

709 "initInputs", 

710 "initIntermediates", 

711 "initOutputs", 

712 "inputs", 

713 "intermediates", 

714 "outputs", 

715 "prerequisites", 

716 ): 

717 setattr( 

718 self, 

719 attr, 

720 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

721 ) 

722 self.missing = _DatasetDict(universe=registry.dimensions) 

723 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

724 # Aggregate all dimensions for all non-init, non-prerequisite 

725 # DatasetTypes. These are the ones we'll include in the big join 

726 # query. 

727 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

728 # Construct scaffolding nodes for each Task, and add backreferences 

729 # to the Task from each DatasetScaffolding node. 

730 # Note that there's only one scaffolding node for each DatasetType, 

731 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

732 # reference it. 

733 if isinstance(pipeline, Pipeline): 

734 pipeline = pipeline.toExpandedPipeline() 

735 self.tasks = [ 

736 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

737 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values()) 

738 ] 

739 

740 def __repr__(self) -> str: 

741 # Default dataclass-injected __repr__ gets caught in an infinite loop 

742 # because of back-references. 

743 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

744 

745 tasks: list[_TaskScaffolding] 

746 """Scaffolding data structures for each task in the pipeline 

747 (`list` of `_TaskScaffolding`). 

748 """ 

749 

750 initInputs: _DatasetDict 

751 """Datasets consumed but not produced when constructing the tasks in this 

752 pipeline (`_DatasetDict`). 

753 """ 

754 

755 initIntermediates: _DatasetDict 

756 """Datasets that are both consumed and produced when constructing the tasks 

757 in this pipeline (`_DatasetDict`). 

758 """ 

759 

760 initOutputs: _DatasetDict 

761 """Datasets produced but not consumed when constructing the tasks in this 

762 pipeline (`_DatasetDict`). 

763 """ 

764 

765 inputs: _DatasetDict 

766 """Datasets that are consumed but not produced when running this pipeline 

767 (`_DatasetDict`). 

768 """ 

769 

770 intermediates: _DatasetDict 

771 """Datasets that are both produced and consumed when running this pipeline 

772 (`_DatasetDict`). 

773 """ 

774 

775 outputs: _DatasetDict 

776 """Datasets produced but not consumed when running this pipeline 

777 (`_DatasetDict`). 

778 """ 

779 

780 prerequisites: _DatasetDict 

781 """Datasets that are consumed when running this pipeline and looked up 

782 per-Quantum when generating the graph (`_DatasetDict`). 

783 """ 

784 

785 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

786 """Datasets that should be used as constraints in the initial query, 

787 according to tasks (`~lsst.daf.butler.NamedValueSet`). 

788 """ 

789 

790 dimensions: DimensionGraph 

791 """All dimensions used by any regular input, intermediate, or output 

792 (not prerequisite) dataset; the set of dimensions used in the "Big Join 

793 Query" (`~lsst.daf.butler.DimensionGraph`). 

794 

795 This is required to be a superset of all task quantum dimensions. 

796 """ 

797 

798 missing: _DatasetDict 

799 """Datasets whose existence was originally predicted but that were not 

800 actually found. 

801 

802 Quanta that require these datasets as inputs will be pruned (recursively) 

803 when actually constructing a `QuantumGraph` object. 

804 

805 These are currently populated only when the "initial dataset query 

806 constraint" does not include all overall-input dataset types, and hence the 

807 initial data ID query can include data IDs that it should not. 

808 """ 

809 

810 globalInitOutputs: _DatasetDict | None = None 

811 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`). 

812 """ 

813 

814 @contextmanager 

815 def connectDataIds( 

816 self, 

817 registry: Registry, 

818 collections: Any, 

819 userQuery: str | None, 

820 externalDataId: DataCoordinate, 

821 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

822 bind: Mapping[str, Any] | None = None, 

823 ) -> Iterator[DataCoordinateQueryResults]: 

824 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

825 

826 This method populates `_TaskScaffolding.dataIds` and 

827 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

828 

829 Parameters 

830 ---------- 

831 registry : `lsst.daf.butler.Registry` 

832 Registry for the data repository; used for all data ID queries. 

833 collections 

834 Expressions representing the collections to search for input 

835 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

836 userQuery : `str` or `None` 

837 User-provided expression to limit the data IDs processed. 

838 externalDataId : `~lsst.daf.butler.DataCoordinate` 

839 Externally-provided data ID that should be used to restrict the 

840 results, just as if these constraints had been included via ``AND`` 

841 in ``userQuery``. This includes (at least) any instrument named 

842 in the pipeline definition. 

843 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

844 The query constraint variant that should be used to constrain the 

845 query based on dataset existence; defaults to 

846 `DatasetQueryConstraintVariant.ALL`. 

847 bind : `~collections.abc.Mapping`, optional 

848 Mapping containing literal values that should be injected into the 

849 ``userQuery`` expression, keyed by the identifiers they replace. 

850 

851 Returns 

852 ------- 

853 commonDataIds : \ 

854 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

855 An interface to a database temporary table containing all data IDs 

856 that will appear in this `QuantumGraph`. Returned inside a 

857 context manager, which will drop the temporary table at the end of 

858 the `with` block in which this method is called. 

859 """ 

860 _LOG.debug("Building query for data IDs.") 

861 # Initialization datasets always have empty data IDs. 

862 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

863 for datasetType, refs in itertools.chain( 

864 self.initInputs.items(), 

865 self.initIntermediates.items(), 

866 self.initOutputs.items(), 

867 ): 

868 refs[emptyDataId] = _RefHolder(datasetType) 

869 # Run one big query for the data IDs for task dimensions and regular 

870 # inputs and outputs. We limit the query to only dimensions that are 

871 # associated with the input dataset types, but don't (yet) try to 

872 # obtain the dataset_ids for those inputs. 

873 _LOG.debug( 

874 "Submitting data ID query over dimensions %s and materializing results.", 

875 list(self.dimensions.names), 

876 ) 

877 queryArgs: dict[str, Any] = { 

878 "dimensions": self.dimensions, 

879 "where": userQuery, 

880 "dataId": externalDataId, 

881 "bind": bind, 

882 } 

883 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

884 _LOG.debug( 

885 "Constraining graph query using default of %s.", 

886 list(self.defaultDatasetQueryConstraints.names), 

887 ) 

888 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

889 queryArgs["collections"] = collections 

890 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

891 _LOG.debug("Not using dataset existence to constrain query.") 

892 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

893 constraint = set(datasetQueryConstraint) 

894 inputs = {k.name: k for k in self.inputs.keys()} 

895 if remainder := constraint.difference(inputs.keys()): 

896 raise ValueError( 

897 f"{remainder} dataset type(s) specified as a graph constraint, but" 

898 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

899 ) 

900 _LOG.debug("Constraining graph query using %s", constraint) 

901 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

902 queryArgs["collections"] = collections 

903 else: 

904 raise ValueError( 

905 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

906 ) 

907 
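# Dataset constraints have to be expressed in terms of parent (composite)
# dataset types; a component cannot be used directly to constrain the query.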

908 if "datasets" in queryArgs: 

909 for i, dataset_type in enumerate(queryArgs["datasets"]): 

910 if dataset_type.isComponent(): 

911 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

912 

913 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

914 _LOG.debug("Expanding data IDs.") 

915 commonDataIds = commonDataIds.expanded() 

916 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

917 # Iterate over query results, populating data IDs for datasets and 

918 # quanta and then connecting them to each other. 

919 n = -1 

920 for n, commonDataId in enumerate(commonDataIds): 

921 # Create DatasetRefs for all DatasetTypes from this result row, 

922 # noting that we might have created some already. 

923 # We remember both those that already existed and those that we 

924 # create now. 

925 refsForRow = {} 

926 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {} 

927 for datasetType, refs in itertools.chain( 

928 self.inputs.items(), 

929 self.intermediates.items(), 

930 self.outputs.items(), 

931 ): 

932 datasetDataId: DataCoordinate | None 

933 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

934 datasetDataId = commonDataId.subset(datasetType.dimensions) 

935 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

936 ref_holder = refs.get(datasetDataId) 

937 if ref_holder is None: 

938 ref_holder = _RefHolder(datasetType) 

939 refs[datasetDataId] = ref_holder 

940 refsForRow[datasetType.name] = ref_holder 

941 # Create _QuantumScaffolding objects for all tasks from this 

942 # result row, noting that we might have created some already. 

943 for task in self.tasks: 

944 quantumDataId = commonDataId.subset(task.dimensions) 

945 quantum = task.quanta.get(quantumDataId) 

946 if quantum is None: 

947 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

948 task.quanta[quantumDataId] = quantum 

949 # Whether this is a new quantum or an existing one, we can 

950 # now associate the DatasetRefs for this row with it. The 

951 # fact that a Quantum data ID and a dataset data ID both 

952 # came from the same result row is what tells us they 

953 # should be associated. 

954 # Many of these associations will be duplicates (because 

955 # another query row that differed from this one only in 

956 # irrelevant dimensions already added them), and we rely 

957 # on the dict keys to deduplicate them. 

958 for datasetType in task.inputs: 

959 dataId = dataIdCacheForRow[datasetType.dimensions] 

960 ref_holder = refsForRow[datasetType.name] 

961 quantum.inputs[datasetType.name][dataId] = ref_holder 

962 for datasetType in task.outputs: 

963 dataId = dataIdCacheForRow[datasetType.dimensions] 

964 ref_holder = refsForRow[datasetType.name] 

965 quantum.outputs[datasetType.name][dataId] = ref_holder 

966 if n < 0: 

967 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

968 emptiness_explained = False 

969 for message in commonDataIds.explain_no_results(): 

970 _LOG.critical(message) 

971 emptiness_explained = True 

972 if not emptiness_explained: 

973 _LOG.critical( 

974 "To reproduce this query for debugging purposes, run " 

975 "Registry.queryDataIds with these arguments:" 

976 ) 

977 # We could just repr() the queryArgs dict to get something 

978 # the user could make sense of, but it's friendlier to 

979 # put these args in an easier-to-construct equivalent form 

980 # so they can read it more easily and copy and paste into 

981 # a Python terminal. 

982 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

983 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

984 if queryArgs["where"]: 

985 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

986 if "datasets" in queryArgs: 

987 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

988 if "collections" in queryArgs: 

989 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

990 _LOG.debug("Finished processing %d rows from data ID query.", n) 

991 yield commonDataIds 

992 

993 def resolveDatasetRefs( 

994 self, 

995 registry: Registry, 

996 collections: Any, 

997 run: str, 

998 commonDataIds: DataCoordinateQueryResults, 

999 *, 

1000 skipExistingIn: Any = None, 

1001 clobberOutputs: bool = True, 

1002 constrainedByAllDatasets: bool = True, 

1003 ) -> None: 

1004 """Perform follow-up queries for each dataset data ID produced in 

1005 `connectDataIds`. 

1006 

1007 This method populates `_DatasetScaffolding.refs` (except for those in 

1008 `prerequisites`). 

1009 

1010 Parameters 

1011 ---------- 

1012 registry : `lsst.daf.butler.Registry` 

1013 Registry for the data repository; used for all data ID queries. 

1014 collections 

1015 Expressions representing the collections to search for input 

1016 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1017 run : `str` 

1018 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1019 output datasets, if it already exists. 

1020 commonDataIds : \ 

1021 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

1022 Result of a previous call to `connectDataIds`. 

1023 skipExistingIn 

1024 Expressions representing the collections to search for existing 

1025 output datasets that should be skipped. See 

1026 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

1027 `None` or empty string/sequence disables skipping. 

1028 clobberOutputs : `bool`, optional 

1029 If `True` (default), allow quanta to be created even if outputs exist; 

1030 this requires the same behavior to be enabled when 

1031 executing. If ``skipExistingIn`` is not `None`, completed quanta 

1032 (those with metadata, or all outputs if there is no metadata 

1033 dataset configured) will be skipped rather than clobbered. 

1034 constrainedByAllDatasets : `bool`, optional 

1035 Indicates if the commonDataIds were generated with a constraint on 

1036 all dataset types. 

1037 

1038 Raises 

1039 ------ 

1040 OutputExistsError 

1041 Raised if an output dataset already exists in the output run 

1042 and ``skipExistingIn`` does not include output run, or if only 

1043 some outputs are present and ``clobberOutputs`` is `False`. 

1044 """ 

1045 # Run may be provided but it does not have to exist; in that case we 

1046 # use it for resolving references but don't check it for existing refs. 

1047 run_exists = False 

1048 if run: 

1049 try: 

1050 run_exists = bool(registry.queryCollections(run)) 

1051 except MissingCollectionError: 

1052 # An undocumented exception is raised if the run does not exist. 

1053 pass 

1054 

1055 skip_collections_wildcard: CollectionWildcard | None = None 

1056 skipExistingInRun = False 

1057 if skipExistingIn: 

1058 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

1059 if run_exists: 

1060 # As an optimization, check the explicit list of names first. 

1061 skipExistingInRun = run in skip_collections_wildcard.strings 

1062 if not skipExistingInRun: 

1063 # need to flatten it and check again 

1064 skipExistingInRun = run in registry.queryCollections( 

1065 skipExistingIn, 

1066 collectionTypes=CollectionType.RUN, 

1067 ) 

1068 
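# Predicted (not-yet-existing) outputs get new dataset IDs generated against
# the output run by this maker; refs already found in existing collections
# below keep the dataset IDs recorded in registry, because resolveDict only
# fills holders whose ref is still None.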

1069 idMaker = _DatasetIdMaker(run) 

1070 

1071 resolvedRefQueryResults: Iterable[DatasetRef] 

1072 

1073 # Updating constrainedByAllDatasets here is not ideal, but we have a 

1074 # few different code paths that each transfer different pieces of 

1075 # information about what dataset query constraints were applied here, 

1076 # and none of them has the complete picture until we get here. We're 

1077 # long overdue for a QG generation rewrite that will make this go away 

1078 # entirely anyway. 

1079 constrainedByAllDatasets = ( 

1080 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

1081 ) 

1082 

1083 # Look up [init] intermediate and output datasets in the output 

1084 # collection, if there is an output collection. 

1085 if run_exists or skip_collections_wildcard is not None: 

1086 for datasetType, refs in itertools.chain( 

1087 self.initIntermediates.items(), 

1088 self.initOutputs.items(), 

1089 self.intermediates.items(), 

1090 self.outputs.items(), 

1091 ): 

1092 _LOG.debug( 

1093 "Resolving %d datasets for intermediate and/or output dataset %s.", 

1094 len(refs), 

1095 datasetType.name, 

1096 ) 

1097 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

1098 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

1099 # TODO: this assert incorrectly bans component inputs; 

1100 # investigate on DM-33027. 

1101 # assert not datasetType.isComponent(), \ 

1102 # "Output datasets cannot be components." 

1103 # 

1104 # Instead we have to handle them manually to avoid a 

1105 # deprecation warning, but it is at least confusing and 

1106 # possibly a bug for components to appear here at all. 

1107 if datasetType.isComponent(): 

1108 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1109 component = datasetType.component() 

1110 else: 

1111 parent_dataset_type = datasetType 

1112 component = None 

1113 

1114 # look at RUN collection first 

1115 if run_exists: 

1116 try: 

1117 resolvedRefQueryResults = subset.findDatasets( 

1118 parent_dataset_type, collections=run, findFirst=True 

1119 ) 

1120 except MissingDatasetTypeError: 

1121 resolvedRefQueryResults = [] 

1122 for resolvedRef in resolvedRefQueryResults: 

1123 # TODO: we could easily support per-DatasetType 

1124 # skipExisting and I could imagine that being useful - 

1125 # it's probably required in order to support writing 

1126 # initOutputs before QuantumGraph generation. 

1127 assert resolvedRef.dataId in refs 

1128 if not (skipExistingInRun or isInit or clobberOutputs): 

1129 raise OutputExistsError( 

1130 f"Output dataset {datasetType.name} already exists in " 

1131 f"output RUN collection '{run}' with data ID" 

1132 f" {resolvedRef.dataId}." 

1133 ) 

1134 # To resolve all outputs we have to remember existing 

1135 # ones to avoid generating new dataset IDs for them. 

1136 refs[resolvedRef.dataId].ref = ( 

1137 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1138 ) 

1139 

1140 # Also check skipExistingIn; the case where the RUN collection 

1141 # is included in it is handled above. 

1142 if skip_collections_wildcard is not None: 

1143 try: 

1144 resolvedRefQueryResults = subset.findDatasets( 

1145 parent_dataset_type, 

1146 collections=skip_collections_wildcard, 

1147 findFirst=True, 

1148 ) 

1149 except MissingDatasetTypeError: 

1150 resolvedRefQueryResults = [] 

1151 for resolvedRef in resolvedRefQueryResults: 

1152 if resolvedRef.dataId not in refs: 

1153 continue 

1154 refs[resolvedRef.dataId].ref = ( 

1155 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1156 ) 

1157 

1158 # Look up input and initInput datasets in the input collection(s). We 

1159 # accumulate datasets in self.missing if the common data IDs were not 

1160 # constrained on dataset type existence. 

1161 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1162 _LOG.debug( 

1163 "Resolving %d datasets for input dataset %s.", 

1164 len(refs), 

1165 datasetType.name, 

1166 ) 

1167 if datasetType.isComponent(): 

1168 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1169 component = datasetType.component() 

1170 else: 

1171 parent_dataset_type = datasetType 

1172 component = None 

1173 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {} 

1174 try: 

1175 resolvedRefQueryResults = commonDataIds.subset( 

1176 datasetType.dimensions, unique=True 

1177 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1178 except MissingDatasetTypeError: 

1179 resolvedRefQueryResults = [] 

1180 dataIdsNotFoundYet = set(refs.keys()) 

1181 for resolvedRef in resolvedRefQueryResults: 

1182 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1183 if resolvedRef.dataId not in refs: 

1184 continue 

1185 refs[resolvedRef.dataId].ref = ( 

1186 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1187 ) 

1188 if dataIdsNotFoundYet: 

1189 if constrainedByAllDatasets: 

1190 raise RuntimeError( 

1191 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1192 f"'{datasetType.name}' was/were present in a previous " 

1193 "query, but could not be found now. " 

1194 "This is either a logic bug in QuantumGraph generation " 

1195 "or the input collections have been modified since " 

1196 "QuantumGraph generation began." 

1197 ) 

1198 elif not datasetType.dimensions: 

1199 raise RuntimeError( 

1200 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1201 f"collections {collections}." 

1202 ) 

1203 else: 

1204 # If the common dataIds were not constrained using all the 

1205 # input dataset types, it is possible that some data ids 

1206 # found don't correspond to existing datasets. Mark these 

1207 # for later pruning from the quantum graph. 

1208 for k in dataIdsNotFoundYet: 

1209 missing_for_dataset_type[k] = refs[k] 

1210 if missing_for_dataset_type: 

1211 self.missing[datasetType] = missing_for_dataset_type 

1212 

1213 # Resolve the missing refs, just so they look like all of the others; 

1214 # in the end other code will make sure they never appear in the QG. 

1215 for dataset_type, refDict in self.missing.items(): 

1216 idMaker.resolveDict(dataset_type, refDict) 

1217 

1218 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1219 # replacing the unresolved refs there, and then look up prerequisites. 

1220 for task in self.tasks: 

1221 _LOG.debug( 

1222 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1223 len(task.quanta), 

1224 task.taskDef.label, 

1225 ) 

1226 # The way iterConnections is designed makes it impossible to 

1227 # annotate precisely enough to satisfy MyPy here. 

1228 lookupFunctions = { 

1229 c.name: c.lookupFunction # type: ignore 

1230 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1231 if c.lookupFunction is not None # type: ignore 

1232 } 

1233 dataIdsFailed = [] 

1234 dataIdsSucceeded = [] 

1235 for quantum in task.quanta.values(): 

1236 # Process output datasets only if skipExistingIn is not None 

1237 # or there is a run to look for outputs in and clobberOutputs 

1238 # is True. Note that if skipExistingIn is None, any output 

1239 # datasets that already exist would have already caused an 

1240 # exception to be raised. 

1241 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1242 resolvedRefs = [] 

1243 unresolvedDataIds = [] 

1244 haveMetadata = False 

1245 for datasetType, originalRefs in quantum.outputs.items(): 

1246 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1247 if ref is not None: 

1248 resolvedRefs.append(ref) 

1249 originalRefs[dataId].ref = ref 

1250 if datasetType.name == task.taskDef.metadataDatasetName: 

1251 haveMetadata = True 

1252 else: 

1253 unresolvedDataIds.append((datasetType, dataId)) 

1254 if resolvedRefs: 

1255 if haveMetadata or not unresolvedDataIds: 

1256 dataIdsSucceeded.append(quantum.dataId) 

1257 if skip_collections_wildcard is not None: 

1258 continue 

1259 else: 

1260 dataIdsFailed.append(quantum.dataId) 

1261 if not clobberOutputs: 

1262 raise OutputExistsError( 

1263 f"Quantum {quantum.dataId} of task with label " 

1264 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1265 f"({resolvedRefs}) " 

1266 f"and others that don't ({unresolvedDataIds}), with no metadata output, " 

1267 "and clobbering outputs was not enabled." 

1268 ) 

1269 # Update the input DatasetRefs to the resolved ones we already 

1270 # searched for. 

1271 for datasetType, input_refs in quantum.inputs.items(): 

1272 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()): 

1273 input_refs[data_id].ref = ref 

1274 # Look up prerequisite datasets in the input collection(s). 

1275 # These may have dimensions that extend beyond those we queried 

1276 # for originally, because we want to permit those data ID 

1277 # values to differ across quanta and dataset types. 

1278 for datasetType in task.prerequisites: 

1279 if datasetType.isComponent(): 

1280 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1281 component = datasetType.component() 

1282 else: 

1283 parent_dataset_type = datasetType 

1284 component = None 

1285 lookupFunction = lookupFunctions.get(datasetType.name) 

1286 if lookupFunction is not None: 

1287 # PipelineTask has provided its own function to do the 

1288 # lookup. This always takes precedence. 

1289 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1290 elif ( 

1291 datasetType.isCalibration() 

1292 and datasetType.dimensions <= quantum.dataId.graph 

1293 and quantum.dataId.graph.temporal 

1294 ): 

1295 # This is a master calibration lookup, which we have to 

1296 # handle specially because the query system can't do a 

1297 # temporal join on a non-dimension-based timespan yet. 

1298 timespan = quantum.dataId.timespan 

1299 try: 

1300 prereq_ref = registry.findDataset( 

1301 parent_dataset_type, 

1302 quantum.dataId, 

1303 collections=collections, 

1304 timespan=timespan, 

1305 ) 

1306 if prereq_ref is not None: 

1307 if component is not None: 

1308 prereq_ref = prereq_ref.makeComponentRef(component) 

1309 prereq_refs = [prereq_ref] 

1310 else: 

1311 prereq_refs = [] 

1312 except (KeyError, MissingDatasetTypeError): 

1313 # This dataset type is not present in the registry, 

1314 # which just means there are no datasets here. 

1315 prereq_refs = [] 

1316 else: 

1317 # Most general case. 

1318 prereq_refs = [ 

1319 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1320 for prereq_ref in registry.queryDatasets( 

1321 parent_dataset_type, 

1322 collections=collections, 

1323 dataId=quantum.dataId, 

1324 findFirst=True, 

1325 ).expanded() 

1326 ] 

1327 

1328 for ref in prereq_refs: 

1329 if ref is not None: 

1330 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1331 task.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1332 

1333 # Resolve all quantum inputs and outputs. 

1334 for datasetDict in (quantum.inputs, quantum.outputs): 

1335 for dataset_type, refDict in datasetDict.items(): 

1336 idMaker.resolveDict(dataset_type, refDict) 

1337 

1338 # Resolve task initInputs and initOutputs. 

1339 for datasetDict in (task.initInputs, task.initOutputs): 

1340 for dataset_type, refDict in datasetDict.items(): 

1341 idMaker.resolveDict(dataset_type, refDict) 

1342 

1343 # Actually remove any quanta that we decided to skip above. 

1344 if dataIdsSucceeded: 

1345 if skip_collections_wildcard is not None: 

1346 _LOG.debug( 

1347 "Pruning %d successful quanta for task with label '%s' because all of their " 

1348 "outputs exist or metadata was written successfully.", 

1349 len(dataIdsSucceeded), 

1350 task.taskDef.label, 

1351 ) 

1352 for dataId in dataIdsSucceeded: 

1353 del task.quanta[dataId] 

1354 elif clobberOutputs: 

1355 _LOG.info( 

1356 "Found %d successful quanta for task with label '%s' " 

1357 "that will need to be clobbered during execution.", 

1358 len(dataIdsSucceeded), 

1359 task.taskDef.label, 

1360 ) 

1361 else: 

1362 raise AssertionError("OutputExistsError should have already been raised.") 

1363 if dataIdsFailed: 

1364 if clobberOutputs: 

1365 _LOG.info( 

1366 "Found %d failed/incomplete quanta for task with label '%s' " 

1367 "that will need to be clobbered during execution.", 

1368 len(dataIdsFailed), 

1369 task.taskDef.label, 

1370 ) 

1371 else: 

1372 raise AssertionError("OutputExistsError should have already been raised.") 

1373 

1374 # Collect initOutputs that do not belong to any task. 

1375 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1376 for task in self.tasks: 

1377 global_dataset_types -= set(task.initOutputs) 

1378 if global_dataset_types: 

1379 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1380 for dataset_type, refDict in self.globalInitOutputs.items(): 

1381 idMaker.resolveDict(dataset_type, refDict) 

1382 

1383 def makeQuantumGraph( 

1384 self, 

1385 registry: Registry, 

1386 metadata: Mapping[str, Any] | None = None, 

1387 datastore: Datastore | None = None, 

1388 ) -> QuantumGraph: 

1389 """Create a `QuantumGraph` from the quanta already present in 

1390 the scaffolding data structure. 

1391 

1392 Parameters 

1393 ---------- 

1394 registry : `lsst.daf.butler.Registry` 

1395 Registry for the data repository; used for all data ID queries. 

1396 metadata : `~collections.abc.Mapping` of `str` to primitives, optional 

1397 This is an optional parameter of extra data to carry with the 

1398 graph. Entries in this mapping should be able to be serialized in 

1399 JSON. 

1400 datastore : `~lsst.daf.butler.Datastore`, optional 

1401 If not `None` then fill datastore records in each generated 

1402 Quantum. 

1403 

1404 Returns 

1405 ------- 

1406 graph : `QuantumGraph` 

1407 The full `QuantumGraph`. 

1408 """ 

1409 

1410 def _make_refs(dataset_dict: _DatasetDict) -> Iterable[DatasetRef]: 

1411 """Extract all DatasetRefs from the dictionaries""" 

1412 for ref_dict in dataset_dict.values(): 

1413 for holder in ref_dict.values(): 

1414 yield holder.resolved_ref 

1415 

1416 datastore_records: Mapping[str, DatastoreRecordData] | None = None 

1417 if datastore is not None: 

1418 datastore_records = datastore.export_records( 

1419 itertools.chain( 

1420 _make_refs(self.inputs), 

1421 _make_refs(self.initInputs), 

1422 _make_refs(self.prerequisites), 

1423 ) 

1424 ) 

1425 
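# Build the per-task sets of quanta expected by the QuantumGraph
# constructor, attaching the exported datastore records (if any) to the
# generated quanta.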

1426 graphInput: dict[TaskDef, set[Quantum]] = {} 

1427 for task in self.tasks: 

1428 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records) 

1429 graphInput[task.taskDef] = qset 

1430 
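# Unpack the single resolved ref for each task's init-input and
# init-output dataset types, using each task's storage classes.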

1431 taskInitInputs = { 

1432 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values() 

1433 for task in self.tasks 

1434 } 

1435 taskInitOutputs = { 

1436 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values() 

1437 for task in self.tasks 

1438 } 

1439 

1440 globalInitOutputs: list[DatasetRef] = [] 

1441 if self.globalInitOutputs is not None: 

1442 for refs_dict in self.globalInitOutputs.values(): 

1443 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values()) 

1444 

1445 graph = QuantumGraph( 

1446 graphInput, 

1447 metadata=metadata, 

1448 pruneRefs=list(self.missing.iter_resolved_refs()), 

1449 universe=self.dimensions.universe, 

1450 initInputs=taskInitInputs, 

1451 initOutputs=taskInitOutputs, 

1452 globalInitOutputs=globalInitOutputs, 

1453 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1454 ) 

1455 return graph 

1456 

1457 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1458 """Make a list of all dataset types used by a graph as defined in 

1459 registry. 

1460 """ 

1461 chain = [ 

1462 self.initInputs, 

1463 self.initIntermediates, 

1464 self.initOutputs, 

1465 self.inputs, 

1466 self.intermediates, 

1467 self.outputs, 

1468 self.prerequisites, 

1469 ] 

1470 if self.globalInitOutputs is not None: 

1471 chain.append(self.globalInitOutputs) 

1472 

1473 # Collect names of all dataset types. 

1474 all_names: set[str] = set(dstype.name for dstype in itertools.chain(*chain)) 

1475 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1476 

1477 # Check for types that do not exist in the registry yet: 

1478 # - inputs must exist; 

1479 # - intermediates and outputs may not exist, but there must not be 

1480 # more than one definition (e.g. differing in storage class); 

1481 # - prerequisites may not exist; treat them the same as outputs here. 

1482 for dstype in itertools.chain(self.initInputs, self.inputs): 

1483 if dstype.name not in dataset_types: 

1484 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1485 

1486 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1487 chain = [ 

1488 self.initIntermediates, 

1489 self.initOutputs, 

1490 self.intermediates, 

1491 self.outputs, 

1492 self.prerequisites, 

1493 ] 

1494 if self.globalInitOutputs is not None: 

1495 chain.append(self.globalInitOutputs) 

1496 for dstype in itertools.chain(*chain): 

1497 if dstype.name not in dataset_types: 

1498 new_outputs[dstype.name].add(dstype) 

1499 for name, dstypes in new_outputs.items(): 

1500 if len(dstypes) > 1: 

1501 raise ValueError( 

1502 "Pipeline contains multiple definitions for a dataset type " 

1503 f"which is not defined in registry yet: {dstypes}" 

1504 ) 

1505 elif len(dstypes) == 1: 

1506 dataset_types[name] = dstypes.pop() 

1507 

1508 return dataset_types.values() 

1509 
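# NOTE: The function below is an illustrative, uncalled sketch and is not
# part of this module's API; its name and its omission of component-ref
# handling are simplifications introduced here.  It restates the
# prerequisite-lookup branching performed in
# `_PipelineScaffolding.resolveDatasetRefs` above: a task-provided lookup
# function always wins, calibration datasets are found with an explicit
# timespan because the query system cannot yet do a temporal join on a
# non-dimension timespan, and everything else falls back to a general
# find-first `queryDatasets` call.
def _sketch_lookup_prerequisites(
    dataset_type: DatasetType,
    registry: Registry,
    data_id: DataCoordinate,
    collections: Any,
    lookup_function: Any = None,
) -> list[DatasetRef]:
    """Illustrative sketch of the prerequisite lookup strategies used by
    `_PipelineScaffolding.resolveDatasetRefs`; not used by this module.
    """
    if lookup_function is not None:
        # A PipelineTask-provided lookup function takes precedence.
        return list(lookup_function(dataset_type, registry, data_id, collections))
    is_calib_lookup = (
        dataset_type.isCalibration()
        and dataset_type.dimensions <= data_id.graph
        and data_id.graph.temporal
    )
    if is_calib_lookup:
        # Master calibration lookup: find a single dataset valid for the
        # data ID's own timespan.
        try:
            ref = registry.findDataset(
                dataset_type, data_id, collections=collections, timespan=data_id.timespan
            )
        except (KeyError, MissingDatasetTypeError):
            # Dataset type not present in the registry: no datasets here.
            return []
        return [ref] if ref is not None else []
    # Most general case: find-first query over the input collections.
    return list(
        registry.queryDatasets(
            dataset_type, collections=collections, dataId=data_id, findFirst=True
        ).expanded()
    )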

1510 
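# NOTE: Another illustrative, uncalled sketch; the function name is
# hypothetical and not part of this module's API.  It restates the
# consistency rule enforced by
# `_PipelineScaffolding._get_registry_dataset_types`: a dataset type that
# is not registered yet may have at most one definition in the pipeline,
# so that, e.g., definitions differing only in storage class are rejected.
def _sketch_check_unregistered_dataset_types(
    pipeline_dataset_types: Iterable[DatasetType],
    registered: Mapping[str, DatasetType],
) -> dict[str, DatasetType]:
    """Illustrative sketch of the unregistered-dataset-type consistency
    check in `_get_registry_dataset_types`; not used by this module.
    """
    new_definitions: dict[str, set[DatasetType]] = defaultdict(set)
    for dstype in pipeline_dataset_types:
        if dstype.name not in registered:
            new_definitions[dstype.name].add(dstype)
    resolved = dict(registered)
    for name, dstypes in new_definitions.items():
        if len(dstypes) > 1:
            raise ValueError(
                "Pipeline contains multiple definitions for a dataset type "
                f"which is not defined in the registry yet: {dstypes}"
            )
        # Exactly one new definition: accept it as-is.
        resolved[name] = dstypes.pop()
    return resolved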

1511# ------------------------ 

1512# Exported definitions -- 

1513# ------------------------ 

1514 

1515 

1516class GraphBuilderError(Exception): 

1517 """Base class for exceptions generated by graph builder.""" 

1518 

1519 pass 

1520 

1521 

1522class OutputExistsError(GraphBuilderError): 

1523 """Exception generated when output datasets already exist.""" 

1524 

1525 pass 

1526 

1527 

1528class PrerequisiteMissingError(GraphBuilderError): 

1529 """Exception generated when a prerequisite dataset does not exist.""" 

1530 

1531 pass 

1532 

1533 

1534class GraphBuilder: 

1535 """GraphBuilder class is responsible for building task execution graph from 

1536 a Pipeline. 

1537 

1538 Parameters 

1539 ---------- 

1540 registry : `~lsst.daf.butler.Registry` 

1541 Registry for the data repository. 

1542 skipExistingIn 

1543 Expressions representing the collections to search for existing 

1544 output datasets that should be skipped. See 

1545 :ref:`daf_butler_ordered_collection_searches`. 

1546 clobberOutputs : `bool`, optional 

1547 If `True` (default), allow quanta to be created even if partial outputs 

1548 exist; this requires the same behavior to be enabled when 

1549 executing. 

1550 datastore : `~lsst.daf.butler.Datastore`, optional 

1551 If not `None` then fill datastore records in each generated Quantum. 

1552 """ 

1553 

1554 def __init__( 

1555 self, 

1556 registry: Registry, 

1557 skipExistingIn: Any = None, 

1558 clobberOutputs: bool = True, 

1559 datastore: Datastore | None = None, 

1560 ): 

1561 self.registry = registry 

1562 self.dimensions = registry.dimensions 

1563 self.skipExistingIn = skipExistingIn 

1564 self.clobberOutputs = clobberOutputs 

1565 self.datastore = datastore 

1566 

1567 def makeGraph( 

1568 self, 

1569 pipeline: Pipeline | Iterable[TaskDef], 

1570 collections: Any, 

1571 run: str, 

1572 userQuery: str | None, 

1573 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1574 metadata: Mapping[str, Any] | None = None, 

1575 bind: Mapping[str, Any] | None = None, 

1576 dataId: DataCoordinate | None = None, 

1577 ) -> QuantumGraph: 

1578 """Create execution graph for a pipeline. 

1579 

1580 Parameters 

1581 ---------- 

1582 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1583 Pipeline definition, task names/classes and their configs. 

1584 collections 

1585 Expressions representing the collections to search for input 

1586 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1587 run : `str` 

1588 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1589 output datasets. The collection does not have to exist; it will be 

1590 created when the graph is executed. 

1591 userQuery : `str` 

1592 String that defines the user-provided selection for the registry; should be 

1593 empty or `None` if there are no restrictions on data selection. 

1594 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1595 The query constraint variant that should be used to constrain the 

1596 query based on dataset existence; defaults to 

1597 `DatasetQueryConstraintVariant.ALL`. 

1598 metadata : `~collections.abc.Mapping` of `str` to primitives, optional 

1599 This is an optional parameter of extra data to carry with the 

1600 graph. Entries in this mapping should be able to be serialized in 

1601 JSON. 

1602 bind : `~collections.abc.Mapping`, optional 

1603 Mapping containing literal values that should be injected into the 

1604 ``userQuery`` expression, keyed by the identifiers they replace. 

1605 dataId : `lsst.daf.butler.DataCoordinate`, optional 

1606 Data ID that should also be included in the query constraint. 

1607 

1608 Returns 

1609 ------- 

1610 graph : `QuantumGraph` 

1611 The constructed execution graph. 

1612 Raises 

1613 ------ 

1614 UserExpressionError 

1615 Raised when user expression cannot be parsed. 

1616 OutputExistsError 

1617 Raised when output datasets already exist. 

1618 Exception 

1619 Other exceptions types may be raised by underlying registry 

1620 classes. 

1621 """ 

1622 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1623 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1624 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1625 if dataId is None: 

1626 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1627 if isinstance(pipeline, Pipeline): 

1628 dataId = pipeline.get_data_id(self.registry.dimensions).union(dataId) 

1629 with scaffolding.connectDataIds( 

1630 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1631 ) as commonDataIds: 
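# Let resolveDatasetRefs know whether the common data ID query was
# already constrained by the existence of all input dataset types.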

1632 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1633 scaffolding.resolveDatasetRefs( 

1634 self.registry, 

1635 collections, 

1636 run, 

1637 commonDataIds, 

1638 skipExistingIn=self.skipExistingIn, 

1639 clobberOutputs=self.clobberOutputs, 

1640 constrainedByAllDatasets=condition, 

1641 ) 

1642 return scaffolding.makeQuantumGraph( 

1643 registry=self.registry, metadata=metadata, datastore=self.datastore 

1644 )
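# NOTE: Hypothetical usage sketch, defined but never called, so it has no
# effect on import.  The repository path, collection names, run name, and
# user query below are placeholders invented for illustration; only the
# `GraphBuilder` construction and the `makeGraph` call mirror the class
# above.
def _sketch_graph_builder_usage(pipeline: Pipeline) -> QuantumGraph:
    """Illustrative sketch of driving `GraphBuilder`; not used by this
    module.
    """
    from lsst.daf.butler import Butler  # deferred import; sketch only

    butler = Butler("/path/to/repo", writeable=False)  # placeholder repo
    builder = GraphBuilder(
        registry=butler.registry,
        skipExistingIn=["u/someone/previous-run"],  # placeholder collection
        clobberOutputs=True,
    )
    return builder.makeGraph(
        pipeline,
        collections=["some/input-collection"],  # placeholder input collections
        run="u/someone/new-run",  # placeholder output RUN collection
        userQuery="instrument = 'HSC' AND visit = 12345",  # placeholder query
    )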