Coverage for python/lsst/pipe/base/graphBuilder.py: 17%

597 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-08-06 02:28 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Module defining GraphBuilder class and related methods. 

23""" 

24 

25from __future__ import annotations 

26 

27__all__ = ["GraphBuilder"] 

28 

29# ------------------------------- 

30# Imports of standard modules -- 

31# ------------------------------- 

32import contextlib 

33import itertools 

34import logging 

35from collections import ChainMap, defaultdict 

36from collections.abc import Collection, Iterable, Iterator, Mapping 

37from contextlib import contextmanager 

38from dataclasses import dataclass 

39from typing import Any, TypeVar, cast 

40 

41from lsst.daf.butler import ( 

42 CollectionType, 

43 DataCoordinate, 

44 DatasetRef, 

45 DatasetType, 

46 Datastore, 

47 DatastoreRecordData, 

48 DimensionGraph, 

49 DimensionUniverse, 

50 NamedKeyDict, 

51 NamedValueSet, 

52 Quantum, 

53 Registry, 

54 SkyPixDimension, 

55) 

56from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

57from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

58from lsst.daf.butler.registry.wildcards import CollectionWildcard 

59from lsst.sphgeom import PixelizationABC, RangeSet 

60 

61# ----------------------------- 

62# Imports for other modules -- 

63# ----------------------------- 

64from . import automatic_connection_constants as acc 

65from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

66from ._status import NoWorkFound 

67from .connections import AdjustQuantumHelper, iterConnections 

68from .graph import QuantumGraph 

69from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

70 

71# ---------------------------------- 

72# Local non-exported definitions -- 

73# ---------------------------------- 

74 

75_LOG = logging.getLogger(__name__) 

76 

77 

78@dataclass 

79class _RefHolder: 

80 r"""Placeholder for `~lsst.daf.butler.DatasetRef` representing a future 

81 resolved reference. 

82 

83 As we eliminated unresolved `~lsst.daf.butler.DatasetRef`\s we now use 

84 `None` to represent a reference that is yet to be resolved. Information 

85 about its corresponding dataset type and coordinate is stored in 

86 `_DatasetDict` mapping. 

87 """ 

88 

89 dataset_type: DatasetType 

90 """Dataset type of the dataset to be created later. I need to store it here 

91 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared 

92 between different compatible dataset types.""" 

93 

94 ref: DatasetRef | None = None 

95 """Dataset reference, initially `None`, created when all datasets are 

96 resolved. 

97 """ 

98 

99 @property 

100 def resolved_ref(self) -> DatasetRef: 

101 """Access resolved reference, should only be called after the 

102 reference is set (`~lsst.daf.butler.DatasetRef`). 

103 """ 

104 assert self.ref is not None, "Dataset reference is not set." 

105 return self.ref 

106 

107 

108_Refs = TypeVar("_Refs") 

109 

110 

111class _DatasetDictBase(NamedKeyDict[DatasetType, _Refs]): 

112 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested 

113 collection of the known `~lsst.daf.butler.DatasetRef` instances of that 

114 type. 

115 

116 Parameters 

117 ---------- 

118 args 

119 Positional arguments are forwarded to the `dict` constructor. 

120 universe : `~lsst.daf.butler.DimensionUniverse` 

121 Universe of all possible dimensions. 

122 """ 

123 

124 def __init__(self, *args: Any, universe: DimensionUniverse): 

125 super().__init__(*args) 

126 self.universe = universe 

127 

128 @classmethod 

129 def _fromSubset( 

130 cls, 

131 datasetTypes: Collection[DatasetType], 

132 first: _DatasetDictBase, 

133 *rest: _DatasetDictBase, 

134 ) -> _DatasetDictBase: 

135 """Return a new dictionary by extracting items corresponding to the 

136 given keys from one or more existing dictionaries. 

137 

138 Parameters 

139 ---------- 

140 datasetTypes : `~collections.abc.Iterable` of \ 

141 `~lsst.daf.butler.DatasetType` 

142 DatasetTypes to use as keys for the dict. Values will be obtained 

143 by lookups against ``first`` and ``rest``. 

144 first : `_DatasetDictBase` 

145 Another dictionary from which to extract values. Its actual type 

146 must be idedntical to the type of sub-class used to call this 

147 method. 

148 rest 

149 Additional dictionaries from which to extract values. 

150 

151 Returns 

152 ------- 

153 dictionary : `_DatasetDictBase` 

154 A new dictionary instance. 

155 """ 

156 combined = ChainMap(first, *rest) 

157 

158 # Dataset types known to match immediately can be processed 

159 # without checks. 

160 matches = combined.keys() & set(datasetTypes) 

161 _dict = {k: combined[k] for k in matches} 

162 

163 if len(_dict) < len(datasetTypes): 

164 # Work out which ones are missing. 

165 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

166 

167 # Get the known names for comparison. 

168 combined_by_name = {k.name: k for k in combined} 

169 

170 missing = set() 

171 incompatible = {} 

172 for datasetType in missing_datasetTypes: 

173 # The dataset type is not found. It may not be listed 

174 # or it may be that it is there with the same name 

175 # but different definition. 

176 if datasetType.name in combined_by_name: 

177 # This implies some inconsistency in definitions 

178 # for connections. If there is support for storage 

179 # class conversion we can let it slide. 

180 # At this point we do not know 

181 # where the inconsistency is but trust that down 

182 # stream code will be more explicit about input 

183 # vs output incompatibilities. 

184 existing = combined_by_name[datasetType.name] 

185 convertible_to_existing = existing.is_compatible_with(datasetType) 

186 convertible_from_existing = datasetType.is_compatible_with(existing) 

187 if convertible_to_existing and convertible_from_existing: 

188 _LOG.debug( 

189 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

190 datasetType.name, 

191 datasetType.storageClass_name, 

192 existing.storageClass_name, 

193 ) 

194 _dict[datasetType] = combined[existing] 

195 elif convertible_to_existing or convertible_from_existing: 

196 # We'd need to refactor a fair amount to recognize 

197 # whether this is an error or not, so I'm not going to 

198 # bother until we need to do that for other reasons 

199 # (it won't be too long). 

200 _LOG.info( 

201 "Dataset type %s is present with multiple only partially-compatible storage " 

202 "classes %s and %s.", 

203 datasetType.name, 

204 datasetType.storageClass_name, 

205 existing.storageClass_name, 

206 ) 

207 _dict[datasetType] = combined[existing] 

208 else: 

209 incompatible[datasetType] = existing 

210 else: 

211 missing.add(datasetType) 

212 

213 if missing or incompatible: 

214 reasons = [] 

215 if missing: 

216 reasons.append( 

217 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known " 

218 f"types: [{', '.join(d.name for d in combined)}]." 

219 ) 

220 if incompatible: 

221 for x, y in incompatible.items(): 

222 reasons.append(f"{x} incompatible with {y}") 

223 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

224 

225 return cls(_dict, universe=first.universe) 

226 

227 @property 

228 def dimensions(self) -> DimensionGraph: 

229 """The union of all dimensions used by all dataset types in this 

230 dictionary, including implied dependencies (`DimensionGraph`). 

231 """ 

232 base = self.universe.empty 

233 if len(self) == 0: 

234 return base 

235 return base.union(*[datasetType.dimensions for datasetType in self]) 

236 

237 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]: 

238 """Unpack nested single-element `~lsst.daf.butler.DatasetRef` dicts 

239 into a new mapping with `~lsst.daf.butler.DatasetType` keys and 

240 `~lsst.daf.butler.DatasetRef` values. 

241 

242 This method assumes that each nest contains exactly one item, as is the 

243 case for all "init" datasets. 

244 

245 Parameters 

246 ---------- 

247 storage_classes : `dict` [ `str`, `str` ] 

248 Mapping from dataset type name to the storage class to use for that 

249 dataset type. These are typically the storage classes declared 

250 for a particular task, which may differ rom the data repository 

251 definitions. 

252 

253 Returns 

254 ------- 

255 dictionary : `~lsst.daf.butler.NamedKeyDict` 

256 Dictionary mapping `~lsst.daf.butler.DatasetType` to 

257 `~lsst.daf.butler.DatasetRef`, with both 

258 `~lsst.daf.butler.DatasetType` instances and string names usable 

259 as keys. 

260 """ 

261 return NamedKeyDict( 

262 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()} 

263 ) 

264 

265 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

266 """Unpack nested multi-element `~lsst.daf.butler.DatasetRef` dicts into 

267 a new mapping with `~lsst.daf.butler.DatasetType` keys and `list` of 

268 `~lsst.daf.butler.DatasetRef` values. 

269 

270 Parameters 

271 ---------- 

272 storage_classes : `dict` [ `str`, `str` ] 

273 Mapping from dataset type name to the storage class to use for that 

274 dataset type. These are typically the storage classes declared 

275 for a particular task, which may differ rom the data repository 

276 definitions. 

277 

278 Returns 

279 ------- 

280 dictionary : `~lsst.daf.butler.NamedKeyDict` 

281 Dictionary mapping `~lsst.daf.butler.DatasetType` to `list` of 

282 `~lsst.daf.butler.DatasetRef`, with both 

283 `~lsst.daf.butler.DatasetType` instances and string names usable 

284 as keys. 

285 """ 

286 raise NotImplementedError() 

287 

288 

289class _DatasetDict(_DatasetDictBase[dict[DataCoordinate, _RefHolder]]): 

290 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested 

291 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that 

292 type. 

293 """ 

294 

295 @classmethod 

296 def fromDatasetTypes( 

297 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

298 ) -> _DatasetDict: 

299 """Construct a dictionary from a flat iterable of 

300 `~lsst.daf.butler.DatasetType` keys. 

301 

302 Parameters 

303 ---------- 

304 datasetTypes : `~collections.abc.Iterable` of \ 

305 `~lsst.daf.butler.DatasetType` 

306 DatasetTypes to use as keys for the dict. Values will be empty 

307 dictionaries. 

308 universe : `~lsst.daf.butler.DimensionUniverse` 

309 Universe of all possible dimensions. 

310 

311 Returns 

312 ------- 

313 dictionary : `_DatasetDict` 

314 A new `_DatasetDict` instance. 

315 """ 

316 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

317 

318 @classmethod 

319 def fromSubset( 

320 cls, 

321 datasetTypes: Collection[DatasetType], 

322 first: _DatasetDict, 

323 *rest: _DatasetDict, 

324 ) -> _DatasetDict: 

325 """Return a new dictionary by extracting items corresponding to the 

326 given keys from one or more existing dictionaries. 

327 

328 Parameters 

329 ---------- 

330 datasetTypes : `~collections.abc.Iterable` of \ 

331 `~lsst.daf.butler.DatasetType` 

332 DatasetTypes to use as keys for the dict. Values will be obtained 

333 by lookups against ``first`` and ``rest``. 

334 first : `_DatasetDict` 

335 Another dictionary from which to extract values. 

336 rest 

337 Additional dictionaries from which to extract values. 

338 

339 Returns 

340 ------- 

341 dictionary : `_DatasetDict` 

342 A new dictionary instance. 

343 """ 

344 return cast(_DatasetDict, cls._fromSubset(datasetTypes, first, *rest)) 

345 

346 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

347 # Docstring inherited. 

348 result = {} 

349 for dataset_type, holders in self.items(): 

350 if ( 

351 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name) 

352 ) != dataset_type.storageClass_name: 

353 dataset_type = dataset_type.overrideStorageClass(override) 

354 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()] 

355 else: 

356 refs = [holder.resolved_ref for holder in holders.values()] 

357 result[dataset_type] = refs 

358 return NamedKeyDict(result) 

359 

360 def extract( 

361 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

362 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]: 

363 """Iterate over the contained `~lsst.daf.butler.DatasetRef` instances 

364 that match the given `~lsst.daf.butler.DatasetType` and data IDs. 

365 

366 Parameters 

367 ---------- 

368 datasetType : `~lsst.daf.butler.DatasetType` 

369 Dataset type to match. 

370 dataIds : `~collections.abc.Iterable` \ 

371 [ `~lsst.daf.butler.DataCoordinate` ] 

372 Data IDs to match. 

373 

374 Returns 

375 ------- 

376 refs : `~collections.abc.Iterator` [ `~lsst.daf.butler.DatasetRef` ] 

377 DatasetRef instances for which ``ref.datasetType == datasetType`` 

378 and ``ref.dataId`` is in ``dataIds``. 

379 """ 

380 refs = self[datasetType] 

381 return ((dataId, refs[dataId].ref) for dataId in dataIds) 

382 

383 def isdisjoint(self, other: _DatasetDict) -> bool: 

384 """Test whether ``self`` and ``other`` have any datasets in common. 

385 

386 Datasets are considered in common if they have the same *parent* 

387 dataset type name and data ID; storage classes and components are not 

388 considered. 

389 """ 

390 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()} 

391 for k, v in other.items(): 

392 parent_name, _ = k.nameAndComponent() 

393 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()): 

394 return False 

395 return True 

396 

397 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

398 """Iterate over all DatasetRef instances held by this data structure, 

399 assuming that each `_RefHolder` already carries are resolved ref. 

400 """ 

401 for holders_by_data_id in self.values(): 

402 for holder in holders_by_data_id.values(): 

403 yield holder.resolved_ref 

404 

405 

406class _DatasetDictMulti(_DatasetDictBase[defaultdict[DataCoordinate, list[_RefHolder]]]): 

407 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested 

408 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that 

409 type. Nexted dictionary can contain multiple refs for the same data ID, 

410 suitable for use with calibration datasets. 

411 """ 

412 

413 @classmethod 

414 def fromDatasetTypes( 

415 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

416 ) -> _DatasetDictMulti: 

417 """Construct a dictionary from a flat iterable of 

418 `~lsst.daf.butler.DatasetType` keys. 

419 

420 Parameters 

421 ---------- 

422 datasetTypes : `~collections.abc.Iterable` of \ 

423 `~lsst.daf.butler.DatasetType` 

424 DatasetTypes to use as keys for the dict. Values will be empty 

425 dictionaries. 

426 universe : `~lsst.daf.butler.DimensionUniverse` 

427 Universe of all possible dimensions. 

428 

429 Returns 

430 ------- 

431 dictionary : `_DatasetDictMulti` 

432 A new `_DatasetDictMulti` instance. 

433 """ 

434 return cls({datasetType: defaultdict(list) for datasetType in datasetTypes}, universe=universe) 

435 

436 @classmethod 

437 def fromSubset( 

438 cls, 

439 datasetTypes: Collection[DatasetType], 

440 first: _DatasetDictMulti, 

441 *rest: _DatasetDictMulti, 

442 ) -> _DatasetDictMulti: 

443 """Return a new dictionary by extracting items corresponding to the 

444 given keys from one or more existing dictionaries. 

445 

446 Parameters 

447 ---------- 

448 datasetTypes : `~collections.abc.Iterable` of \ 

449 `~lsst.daf.butler.DatasetType` 

450 DatasetTypes to use as keys for the dict. Values will be obtained 

451 by lookups against ``first`` and ``rest``. 

452 first : `_DatasetDictMulti` 

453 Another dictionary from which to extract values. 

454 rest 

455 Additional dictionaries from which to extract values. 

456 

457 Returns 

458 ------- 

459 dictionary : `_DatasetDictMulti` 

460 A new dictionary instance. 

461 """ 

462 return cast(_DatasetDictMulti, cls._fromSubset(datasetTypes, first, *rest)) 

463 

464 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

465 # Docstring inherited. 

466 result = {} 

467 for dataset_type, holder_map in self.items(): 

468 if ( 

469 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name) 

470 ) != dataset_type.storageClass_name: 

471 dataset_type = dataset_type.overrideStorageClass(override) 

472 refs = [] 

473 for holder_list in holder_map.values(): 

474 refs += [holder.resolved_ref.overrideStorageClass(override) for holder in holder_list] 

475 else: 

476 refs = [] 

477 for holder_list in holder_map.values(): 

478 refs += [holder.resolved_ref for holder in holder_list] 

479 result[dataset_type] = refs 

480 return NamedKeyDict(result) 

481 

482 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

483 """Iterate over all DatasetRef instances held by this data structure, 

484 assuming that each `_RefHolder` already carries are resolved ref. 

485 """ 

486 for holders_by_data_id in self.values(): 

487 for holder_list in holders_by_data_id.values(): 

488 for holder in holder_list: 

489 yield holder.resolved_ref 

490 

491 

492class _QuantumScaffolding: 

493 """Helper class aggregating information about a `Quantum`, used when 

494 constructing a `QuantumGraph`. 

495 

496 See `_PipelineScaffolding` for a top-down description of the full 

497 scaffolding data structure. 

498 

499 Parameters 

500 ---------- 

501 task : _TaskScaffolding 

502 Back-reference to the helper object for the `PipelineTask` this quantum 

503 represents an execution of. 

504 dataId : `~lsst.daf.butler.DataCoordinate` 

505 Data ID for this quantum. 

506 """ 

507 

508 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

509 self.task = task 

510 self.dataId = dataId 

511 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

512 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

513 self.prerequisites = _DatasetDict.fromDatasetTypes( 

514 task.prerequisites.keys(), universe=dataId.universe 

515 ) 

516 

517 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

518 

519 def __repr__(self) -> str: 

520 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

521 

522 task: _TaskScaffolding 

523 """Back-reference to the helper object for the `PipelineTask` this quantum 

524 represents an execution of. 

525 """ 

526 

527 dataId: DataCoordinate 

528 """Data ID for this quantum. 

529 """ 

530 

531 inputs: _DatasetDict 

532 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` inputs to 

533 this quantum. 

534 

535 This is initialized to map each `~lsst.daf.butler.DatasetType` to an empty 

536 dictionary at construction. Those nested dictionaries are populated 

537 (with data IDs as keys) with unresolved `~lsst.daf.butler.DatasetRef` 

538 instances in `_PipelineScaffolding.connectDataIds`. 

539 """ 

540 

541 outputs: _DatasetDict 

542 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` outputs this 

543 quantum. 

544 """ 

545 

546 prerequisites: _DatasetDict 

547 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` prerequisite 

548 inputs to this quantum. 

549 """ 

550 

551 def computeSpatialExtent(self, pixelization: PixelizationABC) -> RangeSet: 

552 """Return the spatial extent of this quantum's inputs and outputs in 

553 a skypix system. 

554 

555 Parameters 

556 ---------- 

557 pixelization : `lsst.sphgeom.PixelizationABC` 

558 Pixelization system. 

559 

560 Returns 

561 ------- 

562 extent : `lsst.sphgeom.RangeSet` 

563 Ranges of sky pixels that touch this quantum's inputs and outputs. 

564 """ 

565 result = RangeSet() 

566 for dataset_type, datasets in itertools.chain(self.inputs.items(), self.outputs.items()): 

567 if dataset_type.dimensions.spatial: 

568 for data_id in datasets: 

569 result |= pixelization.envelope(data_id.region) 

570 return result 

571 

572 def makeQuantum(self, datastore_records: Mapping[str, DatastoreRecordData] | None = None) -> Quantum: 

573 """Transform the scaffolding object into a true `Quantum` instance. 

574 

575 Parameters 

576 ---------- 

577 datastore_records : `~collections.abc.Mapping` [ `str`, \ 

578 `~lsst.daf.butler.DatastoreRecordData` ], optional 

579 If not `None` then fill datastore records in each generated Quantum 

580 using the records from this structure. 

581 

582 Returns 

583 ------- 

584 quantum : `Quantum` 

585 An actual `Quantum` instance. 

586 """ 

587 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes) 

588 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes)) 

589 # Give the task's Connections class an opportunity to remove some 

590 # inputs, or complain if they are unacceptable. 

591 # This will raise if one of the check conditions is not met, which is 

592 # the intended behavior. 

593 # If it raises NotWorkFound, there is a bug in the QG algorithm 

594 # or the adjustQuantum is incorrectly trying to make a prerequisite 

595 # input behave like a regular input; adjustQuantum should only raise 

596 # NoWorkFound if a regular input is missing, and it shouldn't be 

597 # possible for us to have generated ``self`` if that's true. 

598 helper = AdjustQuantumHelper( 

599 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes) 

600 ) 

601 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

602 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes) 

603 quantum_records: Mapping[str, DatastoreRecordData] | None = None 

604 if datastore_records is not None: 

605 quantum_records = {} 

606 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

607 input_refs += list(initInputs.values()) 

608 input_ids = {ref.id for ref in input_refs} 

609 for datastore_name, records in datastore_records.items(): 

610 matching_records = records.subset(input_ids) 

611 if matching_records is not None: 

612 quantum_records[datastore_name] = matching_records 

613 # ignore the types because quantum really can take a sequence of inputs 

614 return Quantum( 

615 taskName=self.task.taskDef.taskName, 

616 taskClass=self.task.taskDef.taskClass, 

617 dataId=self.dataId, 

618 initInputs=initInputs, 

619 inputs=helper.inputs, 

620 outputs=helper.outputs, 

621 datastore_records=quantum_records, 

622 ) 

623 

624 

625@dataclass 

626class _TaskScaffolding: 

627 """Helper class aggregating information about a `PipelineTask`, used when 

628 constructing a `QuantumGraph`. 

629 

630 See `_PipelineScaffolding` for a top-down description of the full 

631 scaffolding data structure. 

632 

633 Parameters 

634 ---------- 

635 taskDef : `TaskDef` 

636 Data structure that identifies the task class and its config. 

637 parent : `_PipelineScaffolding` 

638 The parent data structure that will hold the instance being 

639 constructed. 

640 datasetTypes : `TaskDatasetTypes` 

641 Data structure that categorizes the dataset types used by this task. 

642 """ 

643 

644 def __init__( 

645 self, 

646 taskDef: TaskDef, 

647 parent: _PipelineScaffolding, 

648 datasetTypes: TaskDatasetTypes, 

649 ): 

650 universe = parent.dimensions.universe 

651 self.taskDef = taskDef 

652 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

653 assert self.dimensions.issubset(parent.dimensions) 

654 # Initialize _DatasetDicts as subsets of the one or two 

655 # corresponding dicts in the parent _PipelineScaffolding. 

656 self.initInputs = _DatasetDict.fromSubset( 

657 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

658 ) 

659 self.initOutputs = _DatasetDict.fromSubset( 

660 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

661 ) 

662 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

663 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

664 self.prerequisites = _DatasetDictMulti.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

665 self.dataIds: set[DataCoordinate] = set() 

666 self.quanta = {} 

667 self.storage_classes = { 

668 connection.name: connection.storageClass 

669 for connection in self.taskDef.connections.allConnections.values() 

670 } 

671 self.storage_classes[ 

672 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

673 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS 

674 self.storage_classes[ 

675 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

676 ] = acc.LOG_OUTPUT_STORAGE_CLASS 

677 self.storage_classes[ 

678 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

679 ] = acc.METADATA_OUTPUT_STORAGE_CLASS 

680 

681 def __repr__(self) -> str: 

682 # Default dataclass-injected __repr__ gets caught in an infinite loop 

683 # because of back-references. 

684 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

685 

686 taskDef: TaskDef 

687 """Data structure that identifies the task class and its config 

688 (`TaskDef`). 

689 """ 

690 

691 dimensions: DimensionGraph 

692 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

693 """ 

694 

695 initInputs: _DatasetDict 

696 """Dictionary containing information about datasets used to construct this 

697 task (`_DatasetDict`). 

698 """ 

699 

700 initOutputs: _DatasetDict 

701 """Dictionary containing information about datasets produced as a 

702 side-effect of constructing this task (`_DatasetDict`). 

703 """ 

704 

705 inputs: _DatasetDict 

706 """Dictionary containing information about datasets used as regular, 

707 graph-constraining inputs to this task (`_DatasetDict`). 

708 """ 

709 

710 outputs: _DatasetDict 

711 """Dictionary containing information about datasets produced by this task 

712 (`_DatasetDict`). 

713 """ 

714 

715 prerequisites: _DatasetDictMulti 

716 """Dictionary containing information about input datasets that must be 

717 present in the repository before any Pipeline containing this task is run 

718 (`_DatasetDictMulti`). 

719 """ 

720 

721 quanta: dict[DataCoordinate, _QuantumScaffolding] 

722 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

723 this task with that data ID. 

724 """ 

725 

726 storage_classes: dict[str, str] 

727 """Mapping from dataset type name to storage class declared by this task. 

728 """ 

729 

730 def makeQuantumSet( 

731 self, 

732 missing: _DatasetDict, 

733 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

734 ) -> set[Quantum]: 

735 """Create a `set` of `Quantum` from the information in ``self``. 

736 

737 Parameters 

738 ---------- 

739 missing : `_DatasetDict` 

740 Input datasets that have not been found. 

741 datastore_records : `dict` 

742 Record from the datastore to export with quanta. 

743 

744 Returns 

745 ------- 

746 nodes : `set` of `Quantum` 

747 The `Quantum` elements corresponding to this task. 

748 """ 

749 outputs = set() 

750 for q in self.quanta.values(): 

751 try: 

752 tmpQuanta = q.makeQuantum(datastore_records) 

753 outputs.add(tmpQuanta) 

754 except (NoWorkFound, FileNotFoundError) as exc: 

755 if not missing.isdisjoint(q.inputs): 

756 # This is a node that is known to be pruned later and 

757 # should be left in even though some follow up queries 

758 # fail. This allows the pruning to start from this quantum 

759 # with known issues, and prune other nodes it touches. 

760 inputs = q.inputs.unpackMultiRefs(self.storage_classes) 

761 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes)) 

762 tmpQuantum = Quantum( 

763 taskName=q.task.taskDef.taskName, 

764 taskClass=q.task.taskDef.taskClass, 

765 dataId=q.dataId, 

766 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes), 

767 inputs=inputs, 

768 outputs=q.outputs.unpackMultiRefs(self.storage_classes), 

769 ) 

770 outputs.add(tmpQuantum) 

771 else: 

772 raise exc 

773 return outputs 

774 

775 

776class _DatasetIdMaker: 

777 """Helper class which generates random dataset UUIDs for unresolved 

778 datasets. 

779 """ 

780 

781 def __init__(self, run: str): 

782 self.run = run 

783 # Cache of dataset refs generated so far. 

784 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

785 

786 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef: 

787 # For components we need their parent dataset ID. 

788 if dataset_type.isComponent(): 

789 parent_type = dataset_type.makeCompositeDatasetType() 

790 # Parent should be resolved if this is an existing input, or it 

791 # should be in the cache already if it is an intermediate. 

792 key = parent_type, data_id 

793 if key not in self.resolved: 

794 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}") 

795 parent_ref = self.resolved[key] 

796 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False) 

797 

798 key = dataset_type, data_id 

799 if (resolved := self.resolved.get(key)) is None: 

800 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False) 

801 self.resolved[key] = resolved 

802 return resolved 

803 

804 def resolveDict( 

805 self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder], is_output: bool 

806 ) -> None: 

807 """Resolve all unresolved references in the provided dictionary.""" 

808 for data_id, holder in refs.items(): 

809 if holder.ref is None or (is_output and holder.ref.run != self.run): 

810 holder.ref = self.resolveRef(holder.dataset_type, data_id) 

811 

812 

813@dataclass 

814class _PipelineScaffolding: 

815 """A helper data structure that organizes the information involved in 

816 constructing a `QuantumGraph` for a `Pipeline`. 

817 

818 Parameters 

819 ---------- 

820 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

821 Sequence of tasks from which a graph is to be constructed. Must 

822 have nested task classes already imported. 

823 universe : `~lsst.daf.butler.DimensionUniverse` 

824 Universe of all possible dimensions. 

825 

826 Notes 

827 ----- 

828 The scaffolding data structure contains nested data structures for both 

829 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

830 data structures are shared between the pipeline-level structure (which 

831 aggregates all datasets and categorizes them from the perspective of the 

832 complete pipeline) and the individual tasks that use them as inputs and 

833 outputs. 

834 

835 `QuantumGraph` construction proceeds in four steps, with each corresponding 

836 to a different `_PipelineScaffolding` method: 

837 

838 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

839 the DatasetTypes used by the pipeline (delegating to 

840 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

841 nested `_TaskScaffolding` and `_DatasetDict` objects. 

842 

843 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

844 returns related tuples of all dimensions used to identify any regular 

845 input, output, and intermediate datasets (not prerequisites). We then 

846 iterate over these tuples of related dimensions, identifying the subsets 

847 that correspond to distinct data IDs for each task and dataset type, 

848 and then create `_QuantumScaffolding` objects. 

849 

850 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

851 dataset data IDs previously identified, transforming unresolved 

852 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

853 up prerequisite datasets for all quanta. 

854 

855 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

856 per-task `_QuantumScaffolding` objects. 

857 """ 

858 

859 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry): 

860 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

861 self.tasks = [] 

862 # Aggregate and categorize the DatasetTypes in the Pipeline. 

863 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

864 # Construct dictionaries that map those DatasetTypes to structures 

865 # that will (later) hold additional information about them. 

866 for attr in ( 

867 "initInputs", 

868 "initIntermediates", 

869 "initOutputs", 

870 "inputs", 

871 "intermediates", 

872 "outputs", 

873 ): 

874 setattr( 

875 self, 

876 attr, 

877 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

878 ) 

879 self.prerequisites = _DatasetDictMulti.fromDatasetTypes( 

880 datasetTypes.prerequisites, universe=registry.dimensions 

881 ) 

882 self.missing = _DatasetDict(universe=registry.dimensions) 

883 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

884 # Aggregate all dimensions for all non-init, non-prerequisite 

885 # DatasetTypes. These are the ones we'll include in the big join 

886 # query. 

887 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

888 # Construct scaffolding nodes for each Task, and add backreferences 

889 # to the Task from each DatasetScaffolding node. 

890 # Note that there's only one scaffolding node for each DatasetType, 

891 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

892 # reference it. 

893 if isinstance(pipeline, Pipeline): 

894 pipeline = pipeline.toExpandedPipeline() 

895 self.tasks = [ 

896 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

897 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values(), strict=True) 

898 ] 

899 

900 def __repr__(self) -> str: 

901 # Default dataclass-injected __repr__ gets caught in an infinite loop 

902 # because of back-references. 

903 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

904 

905 tasks: list[_TaskScaffolding] 

906 """Scaffolding data structures for each task in the pipeline 

907 (`list` of `_TaskScaffolding`). 

908 """ 

909 

910 initInputs: _DatasetDict 

911 """Datasets consumed but not produced when constructing the tasks in this 

912 pipeline (`_DatasetDict`). 

913 """ 

914 

915 initIntermediates: _DatasetDict 

916 """Datasets that are both consumed and produced when constructing the tasks 

917 in this pipeline (`_DatasetDict`). 

918 """ 

919 

920 initOutputs: _DatasetDict 

921 """Datasets produced but not consumed when constructing the tasks in this 

922 pipeline (`_DatasetDict`). 

923 """ 

924 

925 inputs: _DatasetDict 

926 """Datasets that are consumed but not produced when running this pipeline 

927 (`_DatasetDict`). 

928 """ 

929 

930 intermediates: _DatasetDict 

931 """Datasets that are both produced and consumed when running this pipeline 

932 (`_DatasetDict`). 

933 """ 

934 

935 outputs: _DatasetDict 

936 """Datasets produced but not consumed when when running this pipeline 

937 (`_DatasetDict`). 

938 """ 

939 

940 prerequisites: _DatasetDictMulti 

941 """Datasets that are consumed when running this pipeline and looked up 

942 per-Quantum when generating the graph (`_DatasetDictMulti`). 

943 """ 

944 

945 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

946 """Datasets that should be used as constraints in the initial query, 

947 according to tasks (`~lsst.daf.butler.NamedValueSet`). 

948 """ 

949 

950 dimensions: DimensionGraph 

951 """All dimensions used by any regular input, intermediate, or output 

952 (not prerequisite) dataset; the set of dimension used in the "Big Join 

953 Query" (`~lsst.daf.butler.DimensionGraph`). 

954 

955 This is required to be a superset of all task quantum dimensions. 

956 """ 

957 

958 missing: _DatasetDict 

959 """Datasets whose existence was originally predicted but were not 

960 actually found. 

961 

962 Quanta that require these datasets as inputs will be pruned (recursively) 

963 when actually constructing a `QuantumGraph` object. 

964 

965 These are currently populated only when the "initial dataset query 

966 constraint" does not include all overall-input dataset types, and hence the 

967 initial data ID query can include data IDs that it should not. 

968 """ 

969 

970 globalInitOutputs: _DatasetDict | None = None 

971 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

972 """ 

973 

974 @contextmanager 

975 def connectDataIds( 

976 self, 

977 registry: Registry, 

978 collections: Any, 

979 userQuery: str | None, 

980 externalDataId: DataCoordinate, 

981 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

982 bind: Mapping[str, Any] | None = None, 

983 ) -> Iterator[DataCoordinateQueryResults]: 

984 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

985 

986 This method populates `_TaskScaffolding.dataIds` and 

987 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

988 

989 Parameters 

990 ---------- 

991 registry : `lsst.daf.butler.Registry` 

992 Registry for the data repository; used for all data ID queries. 

993 collections 

994 Expressions representing the collections to search for input 

995 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

996 userQuery : `str` or `None` 

997 User-provided expression to limit the data IDs processed. 

998 externalDataId : `~lsst.daf.butler.DataCoordinate` 

999 Externally-provided data ID that should be used to restrict the 

1000 results, just as if these constraints had been included via ``AND`` 

1001 in ``userQuery``. This includes (at least) any instrument named 

1002 in the pipeline definition. 

1003 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1004 The query constraint variant that should be used to constraint the 

1005 query based on dataset existance, defaults to 

1006 `DatasetQueryConstraintVariant.ALL`. 

1007 bind : `~collections.abc.Mapping`, optional 

1008 Mapping containing literal values that should be injected into the 

1009 ``userQuery`` expression, keyed by the identifiers they replace. 

1010 

1011 Returns 

1012 ------- 

1013 commonDataIds : \ 

1014 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

1015 An interface to a database temporary table containing all data IDs 

1016 that will appear in this `QuantumGraph`. Returned inside a 

1017 context manager, which will drop the temporary table at the end of 

1018 the `with` block in which this method is called. 

1019 """ 

1020 _LOG.debug("Building query for data IDs.") 

1021 # Initialization datasets always have empty data IDs. 

1022 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

1023 for datasetType, refs in itertools.chain( 

1024 self.initInputs.items(), 

1025 self.initIntermediates.items(), 

1026 self.initOutputs.items(), 

1027 ): 

1028 refs[emptyDataId] = _RefHolder(datasetType) 

1029 # Run one big query for the data IDs for task dimensions and regular 

1030 # inputs and outputs. We limit the query to only dimensions that are 

1031 # associated with the input dataset types, but don't (yet) try to 

1032 # obtain the dataset_ids for those inputs. 

1033 _LOG.debug( 

1034 "Submitting data ID query over dimensions %s and materializing results.", 

1035 list(self.dimensions.names), 

1036 ) 

1037 queryArgs: dict[str, Any] = { 

1038 "dimensions": self.dimensions, 

1039 "where": userQuery, 

1040 "dataId": externalDataId, 

1041 "bind": bind, 

1042 } 

1043 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

1044 _LOG.debug( 

1045 "Constraining graph query using default of %s.", 

1046 list(self.defaultDatasetQueryConstraints.names), 

1047 ) 

1048 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

1049 queryArgs["collections"] = collections 

1050 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

1051 _LOG.debug("Not using dataset existence to constrain query.") 

1052 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

1053 constraint = set(datasetQueryConstraint) 

1054 inputs = {k.name: k for k in self.inputs} 

1055 if remainder := constraint.difference(inputs.keys()): 

1056 raise ValueError( 

1057 f"{remainder} dataset type(s) specified as a graph constraint, but" 

1058 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

1059 ) 

1060 _LOG.debug(f"Constraining graph query using {constraint}") 

1061 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

1062 queryArgs["collections"] = collections 

1063 else: 

1064 raise ValueError( 

1065 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

1066 ) 

1067 

1068 if "datasets" in queryArgs: 

1069 for i, dataset_type in enumerate(queryArgs["datasets"]): 

1070 if dataset_type.isComponent(): 

1071 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

1072 

1073 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

1074 _LOG.debug("Expanding data IDs.") 

1075 commonDataIds = commonDataIds.expanded() 

1076 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

1077 # Iterate over query results, populating data IDs for datasets and 

1078 # quanta and then connecting them to each other. 

1079 n = -1 

1080 for commonDataId in commonDataIds: 

1081 # Create DatasetRefs for all DatasetTypes from this result row, 

1082 # noting that we might have created some already. 

1083 # We remember both those that already existed and those that we 

1084 # create now. 

1085 refsForRow = {} 

1086 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {} 

1087 for datasetType, refs in itertools.chain( 

1088 self.inputs.items(), 

1089 self.intermediates.items(), 

1090 self.outputs.items(), 

1091 ): 

1092 datasetDataId: DataCoordinate | None 

1093 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

1094 datasetDataId = commonDataId.subset(datasetType.dimensions) 

1095 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

1096 ref_holder = refs.get(datasetDataId) 

1097 if ref_holder is None: 

1098 ref_holder = _RefHolder(datasetType) 

1099 refs[datasetDataId] = ref_holder 

1100 refsForRow[datasetType.name] = ref_holder 

1101 # Create _QuantumScaffolding objects for all tasks from this 

1102 # result row, noting that we might have created some already. 

1103 for task in self.tasks: 

1104 quantumDataId = commonDataId.subset(task.dimensions) 

1105 quantum = task.quanta.get(quantumDataId) 

1106 if quantum is None: 

1107 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

1108 task.quanta[quantumDataId] = quantum 

1109 # Whether this is a new quantum or an existing one, we can 

1110 # now associate the DatasetRefs for this row with it. The 

1111 # fact that a Quantum data ID and a dataset data ID both 

1112 # came from the same result row is what tells us they 

1113 # should be associated. 

1114 # Many of these associates will be duplicates (because 

1115 # another query row that differed from this one only in 

1116 # irrelevant dimensions already added them), and we use 

1117 # sets to skip. 

1118 for datasetType in task.inputs: 

1119 dataId = dataIdCacheForRow[datasetType.dimensions] 

1120 ref_holder = refsForRow[datasetType.name] 

1121 quantum.inputs[datasetType.name][dataId] = ref_holder 

1122 for datasetType in task.outputs: 

1123 dataId = dataIdCacheForRow[datasetType.dimensions] 

1124 ref_holder = refsForRow[datasetType.name] 

1125 quantum.outputs[datasetType.name][dataId] = ref_holder 

1126 if n < 0: 

1127 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

1128 emptiness_explained = False 

1129 for message in commonDataIds.explain_no_results(): 

1130 _LOG.critical(message) 

1131 emptiness_explained = True 

1132 if not emptiness_explained: 

1133 _LOG.critical( 

1134 "To reproduce this query for debugging purposes, run " 

1135 "Registry.queryDataIds with these arguments:" 

1136 ) 

1137 # We could just repr() the queryArgs dict to get something 

1138 # the user could make sense of, but it's friendlier to 

1139 # put these args in an easier-to-construct equivalent form 

1140 # so they can read it more easily and copy and paste into 

1141 # a Python terminal. 

1142 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

1143 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

1144 if queryArgs["where"]: 

1145 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

1146 if "datasets" in queryArgs: 

1147 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

1148 if "collections" in queryArgs: 

1149 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

1150 _LOG.debug("Finished processing %d rows from data ID query.", n) 

1151 yield commonDataIds 

1152 

1153 def resolveDatasetRefs( 

1154 self, 

1155 registry: Registry, 

1156 collections: Any, 

1157 run: str, 

1158 commonDataIds: DataCoordinateQueryResults, 

1159 *, 

1160 skipExistingIn: Any = None, 

1161 clobberOutputs: bool = True, 

1162 constrainedByAllDatasets: bool = True, 

1163 ) -> None: 

1164 """Perform follow up queries for each dataset data ID produced in 

1165 `fillDataIds`. 

1166 

1167 This method populates `_DatasetScaffolding.refs` (except for those in 

1168 `prerequisites`). 

1169 

1170 Parameters 

1171 ---------- 

1172 registry : `lsst.daf.butler.Registry` 

1173 Registry for the data repository; used for all data ID queries. 

1174 collections 

1175 Expressions representing the collections to search for input 

1176 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1177 run : `str` 

1178 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1179 output datasets, if it already exists. 

1180 commonDataIds : \ 

1181 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

1182 Result of a previous call to `connectDataIds`. 

1183 skipExistingIn 

1184 Expressions representing the collections to search for existing 

1185 output datasets that should be skipped. See 

1186 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

1187 `None` or empty string/sequence disables skipping. 

1188 clobberOutputs : `bool`, optional 

1189 If `True` (default), allow quanta to created even if outputs exist; 

1190 this requires the same behavior behavior to be enabled when 

1191 executing. If ``skipExistingIn`` is not `None`, completed quanta 

1192 (those with metadata, or all outputs if there is no metadata 

1193 dataset configured) will be skipped rather than clobbered. 

1194 constrainedByAllDatasets : `bool`, optional 

1195 Indicates if the commonDataIds were generated with a constraint on 

1196 all dataset types. 

1197 

1198 Raises 

1199 ------ 

1200 OutputExistsError 

1201 Raised if an output dataset already exists in the output run 

1202 and ``skipExistingIn`` does not include output run, or if only 

1203 some outputs are present and ``clobberOutputs`` is `False`. 

1204 """ 

1205 # Run may be provided but it does not have to exist, in that case we 

1206 # use it for resolving references but don't check it for existing refs. 

1207 run_exists = False 

1208 if run: 

1209 with contextlib.suppress(MissingCollectionError): 

1210 run_exists = bool(registry.queryCollections(run)) 

1211 

1212 skip_collections_wildcard: CollectionWildcard | None = None 

1213 skipExistingInRun = False 

1214 if skipExistingIn: 

1215 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

1216 if run_exists: 

1217 # as optimization check in the explicit list of names first 

1218 skipExistingInRun = run in skip_collections_wildcard.strings 

1219 if not skipExistingInRun: 

1220 # need to flatten it and check again 

1221 skipExistingInRun = run in registry.queryCollections( 

1222 skipExistingIn, 

1223 collectionTypes=CollectionType.RUN, 

1224 ) 

1225 

1226 idMaker = _DatasetIdMaker(run) 

1227 

1228 resolvedRefQueryResults: Iterable[DatasetRef] 

1229 

1230 # Updating constrainedByAllDatasets here is not ideal, but we have a 

1231 # few different code paths that each transfer different pieces of 

1232 # information about what dataset query constraints were applied here, 

1233 # and none of them has the complete picture until we get here. We're 

1234 # long overdue for a QG generation rewrite that will make this go away 

1235 # entirely anyway. 

1236 constrainedByAllDatasets = ( 

1237 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

1238 ) 

1239 

1240 # Look up [init] intermediate and output datasets in the output 

1241 # collection, if there is an output collection. 

1242 if run_exists or skip_collections_wildcard is not None: 

1243 for datasetType, refs in itertools.chain( 

1244 self.initIntermediates.items(), 

1245 self.initOutputs.items(), 

1246 self.intermediates.items(), 

1247 self.outputs.items(), 

1248 ): 

1249 _LOG.debug( 

1250 "Resolving %d datasets for intermediate and/or output dataset %s.", 

1251 len(refs), 

1252 datasetType.name, 

1253 ) 

1254 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

1255 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

1256 # TODO: this assert incorrectly bans component inputs; 

1257 # investigate on DM-33027. 

1258 # assert not datasetType.isComponent(), \ 

1259 # "Output datasets cannot be components." 

1260 # 

1261 # Instead we have to handle them manually to avoid a 

1262 # deprecation warning, but it is at least confusing and 

1263 # possibly a bug for components to appear here at all. 

1264 if datasetType.isComponent(): 

1265 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1266 component = datasetType.component() 

1267 else: 

1268 parent_dataset_type = datasetType 

1269 component = None 

1270 

1271 # look at RUN collection first 

1272 if run_exists: 

1273 try: 

1274 resolvedRefQueryResults = subset.findDatasets( 

1275 parent_dataset_type, collections=run, findFirst=True 

1276 ) 

1277 except MissingDatasetTypeError: 

1278 resolvedRefQueryResults = [] 

1279 for resolvedRef in resolvedRefQueryResults: 

1280 # TODO: we could easily support per-DatasetType 

1281 # skipExisting and I could imagine that being useful - 

1282 # it's probably required in order to support writing 

1283 # initOutputs before QuantumGraph generation. 

1284 assert resolvedRef.dataId in refs 

1285 if not (skipExistingInRun or isInit or clobberOutputs): 

1286 raise OutputExistsError( 

1287 f"Output dataset {datasetType.name} already exists in " 

1288 f"output RUN collection '{run}' with data ID" 

1289 f" {resolvedRef.dataId}." 

1290 ) 

1291 # To resolve all outputs we have to remember existing 

1292 # ones to avoid generating new dataset IDs for them. 

1293 refs[resolvedRef.dataId].ref = ( 

1294 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1295 ) 

1296 

1297 # And check skipExistingIn too, if RUN collection is in 

1298 # it is handled above 

1299 if skip_collections_wildcard is not None: 

1300 try: 

1301 resolvedRefQueryResults = subset.findDatasets( 

1302 parent_dataset_type, 

1303 collections=skip_collections_wildcard, 

1304 findFirst=True, 

1305 ) 

1306 except MissingDatasetTypeError: 

1307 resolvedRefQueryResults = [] 

1308 for resolvedRef in resolvedRefQueryResults: 

1309 if resolvedRef.dataId not in refs: 

1310 continue 

1311 refs[resolvedRef.dataId].ref = ( 

1312 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1313 ) 

1314 

1315 # Look up input and initInput datasets in the input collection(s). We 

1316 # accumulate datasets in self.missing, if the common data IDs were not 

1317 # constrained on dataset type existence. 

1318 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1319 _LOG.debug( 

1320 "Resolving %d datasets for input dataset %s.", 

1321 len(refs), 

1322 datasetType.name, 

1323 ) 

1324 if datasetType.isComponent(): 

1325 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1326 component = datasetType.component() 

1327 else: 

1328 parent_dataset_type = datasetType 

1329 component = None 

1330 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {} 

1331 try: 

1332 resolvedRefQueryResults = commonDataIds.subset( 

1333 datasetType.dimensions, unique=True 

1334 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1335 except MissingDatasetTypeError: 

1336 resolvedRefQueryResults = [] 

1337 dataIdsNotFoundYet = set(refs.keys()) 

1338 for resolvedRef in resolvedRefQueryResults: 

1339 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1340 if resolvedRef.dataId not in refs: 

1341 continue 

1342 refs[resolvedRef.dataId].ref = ( 

1343 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1344 ) 

1345 if dataIdsNotFoundYet: 

1346 if constrainedByAllDatasets: 

1347 raise RuntimeError( 

1348 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1349 f"'{datasetType.name}' was/were present in a previous " 

1350 "query, but could not be found now. " 

1351 "This is either a logic bug in QuantumGraph generation " 

1352 "or the input collections have been modified since " 

1353 "QuantumGraph generation began." 

1354 ) 

1355 elif not datasetType.dimensions: 

1356 raise RuntimeError( 

1357 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1358 f"collections {collections}." 

1359 ) 

1360 else: 

1361 # If the common dataIds were not constrained using all the 

1362 # input dataset types, it is possible that some data ids 

1363 # found don't correspond to existing datasets. Mark these 

1364 # for later pruning from the quantum graph. 

1365 for k in dataIdsNotFoundYet: 

1366 missing_for_dataset_type[k] = refs[k] 

1367 if missing_for_dataset_type: 

1368 self.missing[datasetType] = missing_for_dataset_type 

1369 

1370 # Resolve the missing refs, just so they look like all of the others; 

1371 # in the end other code will make sure they never appear in the QG. 

1372 for dataset_type, refDict in self.missing.items(): 

1373 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1374 

1375 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1376 # replacing the unresolved refs there, and then look up prerequisites. 

1377 for task in self.tasks: 

1378 _LOG.debug( 

1379 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1380 len(task.quanta), 

1381 task.taskDef.label, 

1382 ) 

1383 # The way iterConnections is designed makes it impossible to 

1384 # annotate precisely enough to satisfy MyPy here. 

1385 lookupFunctions = { 

1386 c.name: c.lookupFunction # type: ignore 

1387 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1388 if c.lookupFunction is not None # type: ignore 

1389 } 

1390 dataIdsFailed = [] 

1391 dataIdsSucceeded = [] 

1392 for quantum in task.quanta.values(): 

1393 # Process outputs datasets only if skipExistingIn is not None 

1394 # or there is a run to look for outputs in and clobberOutputs 

1395 # is True. Note that if skipExistingIn is None, any output 

1396 # datasets that already exist would have already caused an 

1397 # exception to be raised. 

1398 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1399 resolvedRefs = [] 

1400 unresolvedDataIds = [] 

1401 haveMetadata = False 

1402 for datasetType, originalRefs in quantum.outputs.items(): 

1403 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1404 if ref is not None: 

1405 resolvedRefs.append(ref) 

1406 originalRefs[dataId].ref = ref 

1407 if datasetType.name == task.taskDef.metadataDatasetName: 

1408 haveMetadata = True 

1409 else: 

1410 unresolvedDataIds.append((datasetType, dataId)) 

1411 if resolvedRefs: 

1412 if haveMetadata or not unresolvedDataIds: 

1413 dataIdsSucceeded.append(quantum.dataId) 

1414 if skip_collections_wildcard is not None: 

1415 continue 

1416 else: 

1417 dataIdsFailed.append(quantum.dataId) 

1418 if not clobberOutputs and run_exists: 

1419 raise OutputExistsError( 

1420 f"Quantum {quantum.dataId} of task with label " 

1421 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1422 f"({resolvedRefs}) " 

1423 f"and others that don't ({unresolvedDataIds}), with no metadata output, " 

1424 "and clobbering outputs was not enabled." 

1425 ) 

1426 # Update the input DatasetRefs to the resolved ones we already 

1427 # searched for. 

1428 for datasetType, input_refs in quantum.inputs.items(): 

1429 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()): 

1430 input_refs[data_id].ref = ref 

1431 # Look up prerequisite datasets in the input collection(s). 

1432 # These may have dimensions that extend beyond those we queried 

1433 # for originally, because we want to permit those data ID 

1434 # values to differ across quanta and dataset types. 

1435 for datasetType in task.prerequisites: 

1436 if datasetType.isComponent(): 

1437 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1438 component = datasetType.component() 

1439 else: 

1440 parent_dataset_type = datasetType 

1441 component = None 

1442 lookupFunction = lookupFunctions.get(datasetType.name) 

1443 if lookupFunction is not None: 

1444 # PipelineTask has provided its own function to do the 

1445 # lookup. This always takes precedence. 

1446 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1447 elif ( 

1448 datasetType.isCalibration() 

1449 and datasetType.dimensions <= quantum.dataId.graph 

1450 and quantum.dataId.graph.temporal 

1451 ): 

1452 # This is a master calibration lookup, which we have to 

1453 # handle specially because the query system can't do a 

1454 # temporal join on a non-dimension-based timespan yet. 

1455 timespan = quantum.dataId.timespan 

1456 try: 

1457 prereq_ref = registry.findDataset( 

1458 parent_dataset_type, 

1459 quantum.dataId, 

1460 collections=collections, 

1461 timespan=timespan, 

1462 ) 

1463 if prereq_ref is not None: 

1464 if component is not None: 

1465 prereq_ref = prereq_ref.makeComponentRef(component) 

1466 prereq_refs = [prereq_ref] 

1467 else: 

1468 prereq_refs = [] 

1469 except (KeyError, MissingDatasetTypeError): 

1470 # This dataset type is not present in the registry, 

1471 # which just means there are no datasets here. 

1472 prereq_refs = [] 

1473 else: 

1474 where = "" 

1475 bind: dict[str, Any] = {} 

1476 if not quantum.dataId.graph.spatial: 

1477 # This has skypix dimensions (probably a reference 

1478 # catalog), but the quantum's data is not spatial 

1479 # (it's probably a full-survey sequence point). 

1480 # Try to limit the spatial extent to the union of 

1481 # the spatial extent of the inputs and outputs. 

1482 for dimension in datasetType.dimensions: 

1483 if isinstance(dimension, SkyPixDimension): 

1484 extent = quantum.computeSpatialExtent(dimension.pixelization) 

1485 pixels: list[int] = [] 

1486 for begin, end in extent: 

1487 pixels.extend(range(begin, end)) 

1488 if not pixels: 

1489 _LOG.warning( 

1490 "Prerequisite input %r to task %r may be unbounded.", 

1491 datasetType.name, 

1492 quantum.task.taskDef.label, 

1493 ) 

1494 else: 

1495 bind["quantum_extent"] = pixels 

1496 where = f"{dimension.name} IN (quantum_extent)" 

1497 break 

1498 # Most general case. 

1499 prereq_refs = [ 

1500 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1501 for prereq_ref in registry.queryDatasets( 

1502 parent_dataset_type, 

1503 collections=collections, 

1504 dataId=quantum.dataId, 

1505 findFirst=True, 

1506 where=where, 

1507 bind=bind, 

1508 ).expanded() 

1509 ] 

1510 

1511 for ref in prereq_refs: 

1512 if ref is not None: 

1513 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1514 task.prerequisites[datasetType][ref.dataId].append(_RefHolder(datasetType, ref)) 

1515 

1516 # Resolve all quantum inputs and outputs. 

1517 for dataset_type, refDict in quantum.inputs.items(): 

1518 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1519 for dataset_type, refDict in quantum.outputs.items(): 

1520 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1521 

1522 # Resolve task initInputs and initOutputs. 

1523 for dataset_type, refDict in task.initInputs.items(): 

1524 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1525 for dataset_type, refDict in task.initOutputs.items(): 

1526 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1527 

1528 # Actually remove any quanta that we decided to skip above. 

1529 if dataIdsSucceeded: 

1530 if skip_collections_wildcard is not None: 

1531 _LOG.debug( 

1532 "Pruning successful %d quanta for task with label '%s' because all of their " 

1533 "outputs exist or metadata was written successfully.", 

1534 len(dataIdsSucceeded), 

1535 task.taskDef.label, 

1536 ) 

1537 for dataId in dataIdsSucceeded: 

1538 del task.quanta[dataId] 

1539 elif clobberOutputs and run_exists: 

1540 _LOG.info( 

1541 "Found %d successful quanta for task with label '%s' " 

1542 "that will need to be clobbered during execution.", 

1543 len(dataIdsSucceeded), 

1544 task.taskDef.label, 

1545 ) 

1546 if dataIdsFailed: 

1547 if clobberOutputs and run_exists: 

1548 _LOG.info( 

1549 "Found %d failed/incomplete quanta for task with label '%s' " 

1550 "that will need to be clobbered during execution.", 

1551 len(dataIdsFailed), 

1552 task.taskDef.label, 

1553 ) 

1554 

1555 # Collect initOutputs that do not belong to any task. 

1556 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1557 for task in self.tasks: 

1558 global_dataset_types -= set(task.initOutputs) 

1559 if global_dataset_types: 

1560 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1561 for dataset_type, refDict in self.globalInitOutputs.items(): 

1562 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1563 

1564 def makeQuantumGraph( 

1565 self, 

1566 registry: Registry, 

1567 metadata: Mapping[str, Any] | None = None, 

1568 datastore: Datastore | None = None, 

1569 ) -> QuantumGraph: 

1570 """Create a `QuantumGraph` from the quanta already present in 

1571 the scaffolding data structure. 

1572 

1573 Parameters 

1574 ---------- 

1575 registry : `lsst.daf.butler.Registry` 

1576 Registry for the data repository; used for all data ID queries. 

1577 metadata : `~collections.abc.Mapping` of `str` to primitives, optional 

1578 This is an optional parameter of extra data to carry with the 

1579 graph. Entries in this mapping should be able to be serialized in 

1580 JSON. 

1581 datastore : `~lsst.daf.butler.Datastore`, optional 

1582 If not `None` then fill datastore records in each generated 

1583 Quantum. 

1584 

1585 Returns 

1586 ------- 

1587 graph : `QuantumGraph` 

1588 The full `QuantumGraph`. 

1589 """ 

1590 datastore_records: Mapping[str, DatastoreRecordData] | None = None 

1591 if datastore is not None: 

1592 datastore_records = datastore.export_records( 

1593 itertools.chain( 

1594 self.inputs.iter_resolved_refs(), 

1595 self.initInputs.iter_resolved_refs(), 

1596 self.prerequisites.iter_resolved_refs(), 

1597 ) 

1598 ) 

1599 

1600 graphInput: dict[TaskDef, set[Quantum]] = {} 

1601 for task in self.tasks: 

1602 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records) 

1603 graphInput[task.taskDef] = qset 

1604 

1605 taskInitInputs = { 

1606 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values() 

1607 for task in self.tasks 

1608 } 

1609 taskInitOutputs = { 

1610 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values() 

1611 for task in self.tasks 

1612 } 

1613 

1614 globalInitOutputs: list[DatasetRef] = [] 

1615 if self.globalInitOutputs is not None: 

1616 for refs_dict in self.globalInitOutputs.values(): 

1617 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values()) 

1618 

1619 graph = QuantumGraph( 

1620 graphInput, 

1621 metadata=metadata, 

1622 pruneRefs=list(self.missing.iter_resolved_refs()), 

1623 universe=self.dimensions.universe, 

1624 initInputs=taskInitInputs, 

1625 initOutputs=taskInitOutputs, 

1626 globalInitOutputs=globalInitOutputs, 

1627 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1628 ) 

1629 return graph 

1630 

1631 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1632 """Make a list of all dataset types used by a graph as defined in 

1633 registry. 

1634 """ 

1635 chain: list[_DatasetDict | _DatasetDictMulti] = [ 

1636 self.initInputs, 

1637 self.initIntermediates, 

1638 self.initOutputs, 

1639 self.inputs, 

1640 self.intermediates, 

1641 self.outputs, 

1642 self.prerequisites, 

1643 ] 

1644 if self.globalInitOutputs is not None: 

1645 chain.append(self.globalInitOutputs) 

1646 

1647 # Collect names of all dataset types. 

1648 all_names: set[str] = {dstype.name for dstype in itertools.chain(*chain)} 

1649 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1650 

1651 # Check for types that do not exist in registry yet: 

1652 # - inputs must exist 

1653 # - intermediates and outputs may not exist, but there must not be 

1654 # more than one definition (e.g. differing in storage class) 

1655 # - prerequisites may not exist, treat it the same as outputs here 

1656 for dstype in itertools.chain(self.initInputs, self.inputs): 

1657 if dstype.name not in dataset_types: 

1658 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1659 

1660 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1661 chain = [ 

1662 self.initIntermediates, 

1663 self.initOutputs, 

1664 self.intermediates, 

1665 self.outputs, 

1666 self.prerequisites, 

1667 ] 

1668 if self.globalInitOutputs is not None: 

1669 chain.append(self.globalInitOutputs) 

1670 for dstype in itertools.chain(*chain): 

1671 if dstype.name not in dataset_types: 

1672 new_outputs[dstype.name].add(dstype) 

1673 for name, dstypes in new_outputs.items(): 

1674 if len(dstypes) > 1: 

1675 raise ValueError( 

1676 "Pipeline contains multiple definitions for a dataset type " 

1677 f"which is not defined in registry yet: {dstypes}" 

1678 ) 

1679 elif len(dstypes) == 1: 

1680 dataset_types[name] = dstypes.pop() 

1681 

1682 return dataset_types.values() 

1683 

1684 

1685# ------------------------ 

1686# Exported definitions -- 

1687# ------------------------ 

1688 

1689 

1690class GraphBuilderError(Exception): 

1691 """Base class for exceptions generated by graph builder.""" 

1692 

1693 pass 

1694 

1695 

1696class OutputExistsError(GraphBuilderError): 

1697 """Exception generated when output datasets already exist.""" 

1698 

1699 pass 

1700 

1701 

1702class PrerequisiteMissingError(GraphBuilderError): 

1703 """Exception generated when a prerequisite dataset does not exist.""" 

1704 

1705 pass 

1706 

1707 

1708class GraphBuilder: 

1709 """GraphBuilder class is responsible for building task execution graph from 

1710 a Pipeline. 

1711 

1712 Parameters 

1713 ---------- 

1714 registry : `~lsst.daf.butler.Registry` 

1715 Data butler instance. 

1716 skipExistingIn 

1717 Expressions representing the collections to search for existing 

1718 output datasets that should be skipped. See 

1719 :ref:`daf_butler_ordered_collection_searches`. 

1720 clobberOutputs : `bool`, optional 

1721 If `True` (default), allow quanta to created even if partial outputs 

1722 exist; this requires the same behavior behavior to be enabled when 

1723 executing. 

1724 datastore : `~lsst.daf.butler.Datastore`, optional 

1725 If not `None` then fill datastore records in each generated Quantum. 

1726 """ 

1727 

1728 def __init__( 

1729 self, 

1730 registry: Registry, 

1731 skipExistingIn: Any = None, 

1732 clobberOutputs: bool = True, 

1733 datastore: Datastore | None = None, 

1734 ): 

1735 self.registry = registry 

1736 self.dimensions = registry.dimensions 

1737 self.skipExistingIn = skipExistingIn 

1738 self.clobberOutputs = clobberOutputs 

1739 self.datastore = datastore 

1740 

1741 def makeGraph( 

1742 self, 

1743 pipeline: Pipeline | Iterable[TaskDef], 

1744 collections: Any, 

1745 run: str, 

1746 userQuery: str | None, 

1747 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1748 metadata: Mapping[str, Any] | None = None, 

1749 bind: Mapping[str, Any] | None = None, 

1750 dataId: DataCoordinate | None = None, 

1751 ) -> QuantumGraph: 

1752 """Create execution graph for a pipeline. 

1753 

1754 Parameters 

1755 ---------- 

1756 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1757 Pipeline definition, task names/classes and their configs. 

1758 collections 

1759 Expressions representing the collections to search for input 

1760 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1761 run : `str` 

1762 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1763 output datasets. Collection does not have to exist and it will be 

1764 created when graph is executed. 

1765 userQuery : `str` 

1766 String which defines user-defined selection for registry, should be 

1767 empty or `None` if there is no restrictions on data selection. 

1768 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1769 The query constraint variant that should be used to constraint the 

1770 query based on dataset existance, defaults to 

1771 `DatasetQueryConstraintVariant.ALL`. 

1772 metadata : Optional Mapping of `str` to primitives 

1773 This is an optional parameter of extra data to carry with the 

1774 graph. Entries in this mapping should be able to be serialized in 

1775 JSON. 

1776 bind : `~collections.abc.Mapping`, optional 

1777 Mapping containing literal values that should be injected into the 

1778 ``userQuery`` expression, keyed by the identifiers they replace. 

1779 dataId : `lsst.daf.butler.DataCoordinate`, optional 

1780 Data ID that should also be included in the query constraint. 

1781 

1782 Returns 

1783 ------- 

1784 graph : `QuantumGraph` 

1785 

1786 Raises 

1787 ------ 

1788 UserExpressionError 

1789 Raised when user expression cannot be parsed. 

1790 OutputExistsError 

1791 Raised when output datasets already exist. 

1792 Exception 

1793 Other exceptions types may be raised by underlying registry 

1794 classes. 

1795 """ 

1796 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1797 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1798 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1799 if dataId is None: 

1800 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1801 if isinstance(pipeline, Pipeline): 

1802 dataId = pipeline.get_data_id(self.registry.dimensions).union(dataId) 

1803 with scaffolding.connectDataIds( 

1804 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1805 ) as commonDataIds: 

1806 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1807 scaffolding.resolveDatasetRefs( 

1808 self.registry, 

1809 collections, 

1810 run, 

1811 commonDataIds, 

1812 skipExistingIn=self.skipExistingIn, 

1813 clobberOutputs=self.clobberOutputs, 

1814 constrainedByAllDatasets=condition, 

1815 ) 

1816 return scaffolding.makeQuantumGraph( 

1817 registry=self.registry, metadata=metadata, datastore=self.datastore 

1818 )