Coverage for python/lsst/pipe/base/graphBuilder.py: 17%

606 statements  

coverage.py v7.3.0, created at 2023-08-23 10:31 +0000

1# This file is part of pipe_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Module defining GraphBuilder class and related methods. 

23""" 

24 

25from __future__ import annotations 

26 

27__all__ = ["GraphBuilder"] 

28 

29# ------------------------------- 

30# Imports of standard modules -- 

31# ------------------------------- 

32import contextlib 

33import itertools 

34import logging 

35from collections import ChainMap, defaultdict 

36from collections.abc import Collection, Iterable, Iterator, Mapping 

37from contextlib import contextmanager 

38from dataclasses import dataclass 

39from typing import Any, TypeVar, cast 

40 

41from lsst.daf.butler import ( 

42 CollectionType, 

43 DataCoordinate, 

44 DatasetRef, 

45 DatasetType, 

46 Datastore, 

47 DatastoreRecordData, 

48 DimensionGraph, 

49 DimensionUniverse, 

50 NamedKeyDict, 

51 NamedValueSet, 

52 Quantum, 

53 Registry, 

54 SkyPixDimension, 

55) 

56from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError 

57from lsst.daf.butler.registry.queries import DataCoordinateQueryResults 

58from lsst.daf.butler.registry.wildcards import CollectionWildcard 

59from lsst.sphgeom import PixelizationABC, RangeSet 

60 

61# ----------------------------- 

62# Imports for other modules -- 

63# ----------------------------- 

64from . import automatic_connection_constants as acc 

65from ._datasetQueryConstraints import DatasetQueryConstraintVariant 

66from ._status import NoWorkFound 

67from .connections import AdjustQuantumHelper, iterConnections 

68from .graph import QuantumGraph 

69from .pipeline import Pipeline, PipelineDatasetTypes, TaskDatasetTypes, TaskDef 

70 

71# ---------------------------------- 

72# Local non-exported definitions -- 

73# ---------------------------------- 

74 

75_LOG = logging.getLogger(__name__) 

76 

77 

78@dataclass 

79class _RefHolder: 

80 r"""Placeholder for `~lsst.daf.butler.DatasetRef` representing a future 

81 resolved reference. 

82 

83 Since unresolved `~lsst.daf.butler.DatasetRef`\s have been eliminated, we

84 use `None` to represent a reference that is yet to be resolved. Information

85 about its corresponding dataset type and data coordinate is stored in the

86 `_DatasetDict` mapping.

87 """ 

88 

89 dataset_type: DatasetType 

90 """Dataset type of the dataset to be created later. I need to store it here 

91 instead of inferring from `_DatasetDict` because `_RefHolder` can be shared 

92 between different compatible dataset types.""" 

93 

94 ref: DatasetRef | None = None 

95 """Dataset reference, initially `None`, created when all datasets are 

96 resolved. 

97 """ 

98 

99 @property 

100 def resolved_ref(self) -> DatasetRef: 

101 """Access resolved reference, should only be called after the 

102 reference is set (`~lsst.daf.butler.DatasetRef`). 

103 """ 

104 assert self.ref is not None, "Dataset reference is not set." 

105 return self.ref 

106 

107 

108_Refs = TypeVar("_Refs") 

109 

110 

111class _DatasetDictBase(NamedKeyDict[DatasetType, _Refs]): 

112 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested 

113 collection of the known `~lsst.daf.butler.DatasetRef` instances of that 

114 type. 

115 

116 Parameters 

117 ---------- 

118 args 

119 Positional arguments are forwarded to the `dict` constructor. 

120 universe : `~lsst.daf.butler.DimensionUniverse` 

121 Universe of all possible dimensions. 

122 """ 

123 

124 def __init__(self, *args: Any, universe: DimensionUniverse): 

125 super().__init__(*args) 

126 self.universe = universe 

127 

128 @classmethod 

129 def _fromSubset( 

130 cls, 

131 datasetTypes: Collection[DatasetType], 

132 first: _DatasetDictBase, 

133 *rest: _DatasetDictBase, 

134 ) -> _DatasetDictBase: 

135 """Return a new dictionary by extracting items corresponding to the 

136 given keys from one or more existing dictionaries. 

137 

138 Parameters 

139 ---------- 

140 datasetTypes : `~collections.abc.Iterable` of \ 

141 `~lsst.daf.butler.DatasetType` 

142 DatasetTypes to use as keys for the dict. Values will be obtained 

143 by lookups against ``first`` and ``rest``. 

144 first : `_DatasetDictBase` 

145 Another dictionary from which to extract values. Its actual type 

146 must be identical to the subclass used to call this

147 method. 

148 rest 

149 Additional dictionaries from which to extract values. 

150 

151 Returns 

152 ------- 

153 dictionary : `_DatasetDictBase` 

154 A new dictionary instance. 

155 """ 

156 combined = ChainMap(first, *rest) 

157 

158 # Dataset types known to match immediately can be processed 

159 # without checks. 

160 matches = combined.keys() & set(datasetTypes) 

161 _dict = {k: combined[k] for k in matches} 

162 

163 if len(_dict) < len(datasetTypes): 

164 # Work out which ones are missing. 

165 missing_datasetTypes = set(datasetTypes) - _dict.keys() 

166 

167 # Get the known names for comparison. 

168 combined_by_name = {k.name: k for k in combined} 

169 

170 missing = set() 

171 incompatible = {} 

172 for datasetType in missing_datasetTypes: 

173 # The dataset type is not found. It may not be listed 

174 # or it may be that it is there with the same name 

175 # but different definition. 

176 if datasetType.name in combined_by_name: 

177 # This implies some inconsistency in definitions 

178 # for connections. If there is support for storage 

179 # class conversion we can let it slide. 

180 # At this point we do not know 

181 # where the inconsistency is but trust that down 

182 # stream code will be more explicit about input 

183 # vs output incompatibilities. 

184 existing = combined_by_name[datasetType.name] 

185 convertible_to_existing = existing.is_compatible_with(datasetType) 

186 convertible_from_existing = datasetType.is_compatible_with(existing) 

187 if convertible_to_existing and convertible_from_existing: 

188 _LOG.debug( 

189 "Dataset type %s has multiple fully-compatible storage classes %s and %s", 

190 datasetType.name, 

191 datasetType.storageClass_name, 

192 existing.storageClass_name, 

193 ) 

194 _dict[datasetType] = combined[existing] 

195 elif convertible_to_existing or convertible_from_existing: 

196 # We'd need to refactor a fair amount to recognize 

197 # whether this is an error or not, so I'm not going to 

198 # bother until we need to do that for other reasons 

199 # (it won't be too long). 

200 _LOG.info( 

201 "Dataset type %s is present with multiple only partially-compatible storage " 

202 "classes %s and %s.", 

203 datasetType.name, 

204 datasetType.storageClass_name, 

205 existing.storageClass_name, 

206 ) 

207 _dict[datasetType] = combined[existing] 

208 else: 

209 incompatible[datasetType] = existing 

210 else: 

211 missing.add(datasetType) 

212 

213 if missing or incompatible: 

214 reasons = [] 

215 if missing: 

216 reasons.append( 

217 f"DatasetTypes [{', '.join(d.name for d in missing)}] not present in list of known " 

218 f"types: [{', '.join(d.name for d in combined)}]." 

219 ) 

220 if incompatible: 

221 for x, y in incompatible.items(): 

222 reasons.append(f"{x} incompatible with {y}") 

223 raise KeyError("Errors matching dataset types: " + " & ".join(reasons)) 

224 

225 return cls(_dict, universe=first.universe) 

226 

227 @property 

228 def dimensions(self) -> DimensionGraph: 

229 """The union of all dimensions used by all dataset types in this 

230 dictionary, including implied dependencies (`DimensionGraph`). 

231 """ 

232 base = self.universe.empty 

233 if len(self) == 0: 

234 return base 

235 return base.union(*[datasetType.dimensions for datasetType in self]) 

236 

237 def unpackSingleRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, DatasetRef]: 

238 """Unpack nested single-element `~lsst.daf.butler.DatasetRef` dicts 

239 into a new mapping with `~lsst.daf.butler.DatasetType` keys and 

240 `~lsst.daf.butler.DatasetRef` values. 

241 

242 This method assumes that each nested dictionary contains exactly one item, as is the

243 case for all "init" datasets. 

244 

245 Parameters 

246 ---------- 

247 storage_classes : `dict` [ `str`, `str` ] 

248 Mapping from dataset type name to the storage class to use for that 

249 dataset type. These are typically the storage classes declared 

250 for a particular task, which may differ from the data repository

251 definitions. 

252 

253 Returns 

254 ------- 

255 dictionary : `~lsst.daf.butler.NamedKeyDict` 

256 Dictionary mapping `~lsst.daf.butler.DatasetType` to 

257 `~lsst.daf.butler.DatasetRef`, with both 

258 `~lsst.daf.butler.DatasetType` instances and string names usable 

259 as keys. 

260 """ 

261 return NamedKeyDict( 

262 {datasetType: refs[0] for datasetType, refs in self.unpackMultiRefs(storage_classes).items()} 

263 ) 

264 

265 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

266 """Unpack nested multi-element `~lsst.daf.butler.DatasetRef` dicts into 

267 a new mapping with `~lsst.daf.butler.DatasetType` keys and `list` of 

268 `~lsst.daf.butler.DatasetRef` values. 

269 

270 Parameters 

271 ---------- 

272 storage_classes : `dict` [ `str`, `str` ] 

273 Mapping from dataset type name to the storage class to use for that 

274 dataset type. These are typically the storage classes declared 

275 for a particular task, which may differ from the data repository

276 definitions. 

277 

278 Returns 

279 ------- 

280 dictionary : `~lsst.daf.butler.NamedKeyDict` 

281 Dictionary mapping `~lsst.daf.butler.DatasetType` to `list` of 

282 `~lsst.daf.butler.DatasetRef`, with both 

283 `~lsst.daf.butler.DatasetType` instances and string names usable 

284 as keys. 

285 """ 

286 raise NotImplementedError() 

287 

288 

289class _DatasetDict(_DatasetDictBase[dict[DataCoordinate, _RefHolder]]): 

290 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested 

291 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that 

292 type. 

293 """ 

294 

295 @classmethod 

296 def fromDatasetTypes( 

297 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

298 ) -> _DatasetDict: 

299 """Construct a dictionary from a flat iterable of 

300 `~lsst.daf.butler.DatasetType` keys. 

301 

302 Parameters 

303 ---------- 

304 datasetTypes : `~collections.abc.Iterable` of \ 

305 `~lsst.daf.butler.DatasetType` 

306 DatasetTypes to use as keys for the dict. Values will be empty 

307 dictionaries. 

308 universe : `~lsst.daf.butler.DimensionUniverse` 

309 Universe of all possible dimensions. 

310 

311 Returns 

312 ------- 

313 dictionary : `_DatasetDict` 

314 A new `_DatasetDict` instance. 

315 """ 

316 return cls({datasetType: {} for datasetType in datasetTypes}, universe=universe) 

317 

318 @classmethod 

319 def fromSubset( 

320 cls, 

321 datasetTypes: Collection[DatasetType], 

322 first: _DatasetDict, 

323 *rest: _DatasetDict, 

324 ) -> _DatasetDict: 

325 """Return a new dictionary by extracting items corresponding to the 

326 given keys from one or more existing dictionaries. 

327 

328 Parameters 

329 ---------- 

330 datasetTypes : `~collections.abc.Iterable` of \ 

331 `~lsst.daf.butler.DatasetType` 

332 DatasetTypes to use as keys for the dict. Values will be obtained 

333 by lookups against ``first`` and ``rest``. 

334 first : `_DatasetDict` 

335 Another dictionary from which to extract values. 

336 rest 

337 Additional dictionaries from which to extract values. 

338 

339 Returns 

340 ------- 

341 dictionary : `_DatasetDict` 

342 A new dictionary instance. 

343 """ 

344 return cast(_DatasetDict, cls._fromSubset(datasetTypes, first, *rest)) 

345 

346 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

347 # Docstring inherited. 

348 result = {} 

349 for dataset_type, holders in self.items(): 

350 if ( 

351 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name) 

352 ) != dataset_type.storageClass_name: 

353 dataset_type = dataset_type.overrideStorageClass(override) 

354 refs = [holder.resolved_ref.overrideStorageClass(override) for holder in holders.values()] 

355 else: 

356 refs = [holder.resolved_ref for holder in holders.values()] 

357 result[dataset_type] = refs 

358 return NamedKeyDict(result) 

359 

360 def extract( 

361 self, datasetType: DatasetType, dataIds: Iterable[DataCoordinate] 

362 ) -> Iterator[tuple[DataCoordinate, DatasetRef | None]]: 

363 """Iterate over the contained `~lsst.daf.butler.DatasetRef` instances 

364 that match the given `~lsst.daf.butler.DatasetType` and data IDs. 

365 

366 Parameters 

367 ---------- 

368 datasetType : `~lsst.daf.butler.DatasetType` 

369 Dataset type to match. 

370 dataIds : `~collections.abc.Iterable` \ 

371 [ `~lsst.daf.butler.DataCoordinate` ] 

372 Data IDs to match. 

373 

374 Returns 

375 ------- 

376 refs : `~collections.abc.Iterator` [ `~lsst.daf.butler.DatasetRef` ] 

377 DatasetRef instances for which ``ref.datasetType == datasetType`` 

378 and ``ref.dataId`` is in ``dataIds``. 

379 """ 

380 refs = self[datasetType] 

381 return ((dataId, refs[dataId].ref) for dataId in dataIds) 

382 

383 def isdisjoint(self, other: _DatasetDict) -> bool: 

384 """Test whether ``self`` and ``other`` have any datasets in common. 

385 

386 Datasets are considered in common if they have the same *parent* 

387 dataset type name and data ID; storage classes and components are not 

388 considered. 

389 """ 

390 by_parent_name = {k.nameAndComponent()[0]: v.keys() for k, v in self.items()} 

391 for k, v in other.items(): 

392 parent_name, _ = k.nameAndComponent() 

393 if not by_parent_name.get(parent_name, frozenset[DataCoordinate]()).isdisjoint(v.keys()): 

394 return False 

395 return True 

396 

397 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

398 """Iterate over all DatasetRef instances held by this data structure, 

399 assuming that each `_RefHolder` already carries a resolved ref.

400 """ 

401 for holders_by_data_id in self.values(): 

402 for holder in holders_by_data_id.values(): 

403 yield holder.resolved_ref 

404 

405 

406class _DatasetDictMulti(_DatasetDictBase[defaultdict[DataCoordinate, list[_RefHolder]]]): 

407 """A custom dictionary that maps `~lsst.daf.butler.DatasetType` to a nested 

408 dictionary of the known `~lsst.daf.butler.DatasetRef` instances of that 

409 type. The nested dictionary can contain multiple refs for the same data ID,

410 suitable for use with calibration datasets. 

411 """ 

412 

413 @classmethod 

414 def fromDatasetTypes( 

415 cls, datasetTypes: Iterable[DatasetType], *, universe: DimensionUniverse 

416 ) -> _DatasetDictMulti: 

417 """Construct a dictionary from a flat iterable of 

418 `~lsst.daf.butler.DatasetType` keys. 

419 

420 Parameters 

421 ---------- 

422 datasetTypes : `~collections.abc.Iterable` of \ 

423 `~lsst.daf.butler.DatasetType` 

424 DatasetTypes to use as keys for the dict. Values will be empty 

425 dictionaries. 

426 universe : `~lsst.daf.butler.DimensionUniverse` 

427 Universe of all possible dimensions. 

428 

429 Returns 

430 ------- 

431 dictionary : `_DatasetDictMulti` 

432 A new `_DatasetDictMulti` instance. 

433 """ 

434 return cls({datasetType: defaultdict(list) for datasetType in datasetTypes}, universe=universe) 

435 

436 @classmethod 

437 def fromSubset( 

438 cls, 

439 datasetTypes: Collection[DatasetType], 

440 first: _DatasetDictMulti, 

441 *rest: _DatasetDictMulti, 

442 ) -> _DatasetDictMulti: 

443 """Return a new dictionary by extracting items corresponding to the 

444 given keys from one or more existing dictionaries. 

445 

446 Parameters 

447 ---------- 

448 datasetTypes : `~collections.abc.Iterable` of \ 

449 `~lsst.daf.butler.DatasetType` 

450 DatasetTypes to use as keys for the dict. Values will be obtained 

451 by lookups against ``first`` and ``rest``. 

452 first : `_DatasetDictMulti` 

453 Another dictionary from which to extract values. 

454 rest 

455 Additional dictionaries from which to extract values. 

456 

457 Returns 

458 ------- 

459 dictionary : `_DatasetDictMulti` 

460 A new dictionary instance. 

461 """ 

462 return cast(_DatasetDictMulti, cls._fromSubset(datasetTypes, first, *rest)) 

463 

464 def unpackMultiRefs(self, storage_classes: dict[str, str]) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

465 # Docstring inherited. 

466 result = {} 

467 for dataset_type, holder_map in self.items(): 

468 if ( 

469 override := storage_classes.get(dataset_type.name, dataset_type.storageClass_name) 

470 ) != dataset_type.storageClass_name: 

471 dataset_type = dataset_type.overrideStorageClass(override) 

472 refs = [] 

473 for holder_list in holder_map.values(): 

474 refs += [holder.resolved_ref.overrideStorageClass(override) for holder in holder_list] 

475 else: 

476 refs = [] 

477 for holder_list in holder_map.values(): 

478 refs += [holder.resolved_ref for holder in holder_list] 

479 result[dataset_type] = refs 

480 return NamedKeyDict(result) 

481 

482 def iter_resolved_refs(self) -> Iterator[DatasetRef]: 

483 """Iterate over all DatasetRef instances held by this data structure, 

484 assuming that each `_RefHolder` already carries a resolved ref.

485 """ 

486 for holders_by_data_id in self.values(): 

487 for holder_list in holders_by_data_id.values(): 

488 for holder in holder_list: 

489 yield holder.resolved_ref 

490 

491 

492class _QuantumScaffolding: 

493 """Helper class aggregating information about a `Quantum`, used when 

494 constructing a `QuantumGraph`. 

495 

496 See `_PipelineScaffolding` for a top-down description of the full 

497 scaffolding data structure. 

498 

499 Parameters 

500 ---------- 

501 task : `_TaskScaffolding`

502 Back-reference to the helper object for the `PipelineTask` this quantum 

503 represents an execution of. 

504 dataId : `~lsst.daf.butler.DataCoordinate` 

505 Data ID for this quantum. 

506 """ 

507 

508 def __init__(self, task: _TaskScaffolding, dataId: DataCoordinate): 

509 self.task = task 

510 self.dataId = dataId 

511 self.inputs = _DatasetDict.fromDatasetTypes(task.inputs.keys(), universe=dataId.universe) 

512 self.outputs = _DatasetDict.fromDatasetTypes(task.outputs.keys(), universe=dataId.universe) 

513 self.prerequisites = _DatasetDict.fromDatasetTypes( 

514 task.prerequisites.keys(), universe=dataId.universe 

515 ) 

516 

517 __slots__ = ("task", "dataId", "inputs", "outputs", "prerequisites") 

518 

519 def __repr__(self) -> str: 

520 return f"_QuantumScaffolding(taskDef={self.task.taskDef}, dataId={self.dataId}, ...)" 

521 

522 task: _TaskScaffolding 

523 """Back-reference to the helper object for the `PipelineTask` this quantum 

524 represents an execution of. 

525 """ 

526 

527 dataId: DataCoordinate 

528 """Data ID for this quantum. 

529 """ 

530 

531 inputs: _DatasetDict 

532 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` inputs to 

533 this quantum. 

534 

535 This is initialized to map each `~lsst.daf.butler.DatasetType` to an empty 

536 dictionary at construction. Those nested dictionaries are populated 

537 (with data IDs as keys) with unresolved `~lsst.daf.butler.DatasetRef` 

538 instances in `_PipelineScaffolding.connectDataIds`. 

539 """ 

540 

541 outputs: _DatasetDict 

542 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` outputs this 

543 quantum. 

544 """ 

545 

546 prerequisites: _DatasetDict 

547 """Nested dictionary containing `~lsst.daf.butler.DatasetRef` prerequisite 

548 inputs to this quantum. 

549 """ 

550 

551 def computeSpatialExtent(self, pixelization: PixelizationABC) -> RangeSet: 

552 """Return the spatial extent of this quantum's inputs and outputs in 

553 a skypix system. 

554 

555 Parameters 

556 ---------- 

557 pixelization : `lsst.sphgeom.PixelizationABC` 

558 Pixelization system. 

559 

560 Returns 

561 ------- 

562 extent : `lsst.sphgeom.RangeSet` 

563 Ranges of sky pixels that touch this quantum's inputs and outputs. 

564 """ 

565 result = RangeSet() 

566 for dataset_type, datasets in itertools.chain(self.inputs.items(), self.outputs.items()): 

567 if dataset_type.dimensions.spatial: 

568 for data_id in datasets: 

569 result |= pixelization.envelope(data_id.region) 

570 return result 

571 
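# Editorial sketch (not part of the original module): one way to call
# computeSpatialExtent, assuming ``quantum`` is a populated _QuantumScaffolding
# and using lsst.sphgeom.HtmPixelization as one concrete PixelizationABC; the
# level (11 here) is an arbitrary choice.
#
#     from lsst.sphgeom import HtmPixelization
#     extent = quantum.computeSpatialExtent(HtmPixelization(11))
#     # ``extent`` is a RangeSet that can be intersected or compared with the
#     # extents of other quanta.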

572 def makeQuantum(self, datastore_records: Mapping[str, DatastoreRecordData] | None = None) -> Quantum: 

573 """Transform the scaffolding object into a true `Quantum` instance. 

574 

575 Parameters 

576 ---------- 

577 datastore_records : `~collections.abc.Mapping` [ `str`, \ 

578 `~lsst.daf.butler.DatastoreRecordData` ], optional 

579 If not `None` then fill datastore records in each generated Quantum 

580 using the records from this structure. 

581 

582 Returns 

583 ------- 

584 quantum : `Quantum` 

585 An actual `Quantum` instance. 

586 """ 

587 allInputs = self.inputs.unpackMultiRefs(self.task.storage_classes) 

588 allInputs.update(self.prerequisites.unpackMultiRefs(self.task.storage_classes)) 

589 # Give the task's Connections class an opportunity to remove some 

590 # inputs, or complain if they are unacceptable. 

591 # This will raise if one of the check conditions is not met, which is 

592 # the intended behavior. 

593 # If it raises NoWorkFound, there is a bug in the QG algorithm

594 # or adjustQuantum is incorrectly trying to make a prerequisite

595 # input behave like a regular input; adjustQuantum should only raise 

596 # NoWorkFound if a regular input is missing, and it shouldn't be 

597 # possible for us to have generated ``self`` if that's true. 

598 helper = AdjustQuantumHelper( 

599 inputs=allInputs, outputs=self.outputs.unpackMultiRefs(self.task.storage_classes) 

600 ) 

601 helper.adjust_in_place(self.task.taskDef.connections, self.task.taskDef.label, self.dataId) 

602 initInputs = self.task.initInputs.unpackSingleRefs(self.task.storage_classes) 

603 quantum_records: Mapping[str, DatastoreRecordData] | None = None 

604 if datastore_records is not None: 

605 quantum_records = {} 

606 input_refs = list(itertools.chain.from_iterable(helper.inputs.values())) 

607 input_refs += list(initInputs.values()) 

608 input_ids = {ref.id for ref in input_refs} 

609 for datastore_name, records in datastore_records.items(): 

610 matching_records = records.subset(input_ids) 

611 if matching_records is not None: 

612 quantum_records[datastore_name] = matching_records 

613 # ignore the types because quantum really can take a sequence of inputs 

614 return Quantum( 

615 taskName=self.task.taskDef.taskName, 

616 taskClass=self.task.taskDef.taskClass, 

617 dataId=self.dataId, 

618 initInputs=initInputs, 

619 inputs=helper.inputs, 

620 outputs=helper.outputs, 

621 datastore_records=quantum_records, 

622 ) 

623 

624 
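# Editorial sketch (not part of the original module): once references are
# resolved, each scaffolding quantum is converted into a real Quantum,
# assuming ``q`` is a populated _QuantumScaffolding:
#
#     quantum = q.makeQuantum(datastore_records=None)
#
# NoWorkFound raised by the task's adjustQuantum is handled in
# _TaskScaffolding.makeQuantumSet below.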

625@dataclass 

626class _TaskScaffolding: 

627 """Helper class aggregating information about a `PipelineTask`, used when 

628 constructing a `QuantumGraph`. 

629 

630 See `_PipelineScaffolding` for a top-down description of the full 

631 scaffolding data structure. 

632 

633 Parameters 

634 ---------- 

635 taskDef : `TaskDef` 

636 Data structure that identifies the task class and its config. 

637 parent : `_PipelineScaffolding` 

638 The parent data structure that will hold the instance being 

639 constructed. 

640 datasetTypes : `TaskDatasetTypes` 

641 Data structure that categorizes the dataset types used by this task. 

642 """ 

643 

644 def __init__( 

645 self, 

646 taskDef: TaskDef, 

647 parent: _PipelineScaffolding, 

648 datasetTypes: TaskDatasetTypes, 

649 ): 

650 universe = parent.dimensions.universe 

651 self.taskDef = taskDef 

652 self.dimensions = DimensionGraph(universe, names=taskDef.connections.dimensions) 

653 assert self.dimensions.issubset(parent.dimensions) 

654 # Initialize _DatasetDicts as subsets of the one or two 

655 # corresponding dicts in the parent _PipelineScaffolding. 

656 self.initInputs = _DatasetDict.fromSubset( 

657 datasetTypes.initInputs, parent.initInputs, parent.initIntermediates 

658 ) 

659 self.initOutputs = _DatasetDict.fromSubset( 

660 datasetTypes.initOutputs, parent.initIntermediates, parent.initOutputs 

661 ) 

662 self.inputs = _DatasetDict.fromSubset(datasetTypes.inputs, parent.inputs, parent.intermediates) 

663 self.outputs = _DatasetDict.fromSubset(datasetTypes.outputs, parent.intermediates, parent.outputs) 

664 self.prerequisites = _DatasetDictMulti.fromSubset(datasetTypes.prerequisites, parent.prerequisites) 

665 self.dataIds: set[DataCoordinate] = set() 

666 self.quanta = {} 

667 self.storage_classes = { 

668 connection.name: connection.storageClass 

669 for connection in self.taskDef.connections.allConnections.values() 

670 } 

671 self.storage_classes[ 

672 acc.CONFIG_INIT_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

673 ] = acc.CONFIG_INIT_OUTPUT_STORAGE_CLASS 

674 self.storage_classes[ 

675 acc.LOG_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

676 ] = acc.LOG_OUTPUT_STORAGE_CLASS 

677 self.storage_classes[ 

678 acc.METADATA_OUTPUT_TEMPLATE.format(label=self.taskDef.label) 

679 ] = acc.METADATA_OUTPUT_STORAGE_CLASS 

680 

681 def __repr__(self) -> str: 

682 # Default dataclass-injected __repr__ gets caught in an infinite loop 

683 # because of back-references. 

684 return f"_TaskScaffolding(taskDef={self.taskDef}, ...)" 

685 

686 taskDef: TaskDef 

687 """Data structure that identifies the task class and its config 

688 (`TaskDef`). 

689 """ 

690 

691 dimensions: DimensionGraph 

692 """The dimensions of a single `Quantum` of this task (`DimensionGraph`). 

693 """ 

694 

695 initInputs: _DatasetDict 

696 """Dictionary containing information about datasets used to construct this 

697 task (`_DatasetDict`). 

698 """ 

699 

700 initOutputs: _DatasetDict 

701 """Dictionary containing information about datasets produced as a 

702 side-effect of constructing this task (`_DatasetDict`). 

703 """ 

704 

705 inputs: _DatasetDict 

706 """Dictionary containing information about datasets used as regular, 

707 graph-constraining inputs to this task (`_DatasetDict`). 

708 """ 

709 

710 outputs: _DatasetDict 

711 """Dictionary containing information about datasets produced by this task 

712 (`_DatasetDict`). 

713 """ 

714 

715 prerequisites: _DatasetDictMulti 

716 """Dictionary containing information about input datasets that must be 

717 present in the repository before any Pipeline containing this task is run 

718 (`_DatasetDictMulti`). 

719 """ 

720 

721 quanta: dict[DataCoordinate, _QuantumScaffolding] 

722 """Dictionary mapping data ID to a scaffolding object for the Quantum of 

723 this task with that data ID. 

724 """ 

725 

726 storage_classes: dict[str, str] 

727 """Mapping from dataset type name to storage class declared by this task. 

728 """ 

729 

730 def makeQuantumSet( 

731 self, 

732 missing: _DatasetDict, 

733 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

734 ) -> set[Quantum]: 

735 """Create a `set` of `Quantum` from the information in ``self``. 

736 

737 Parameters 

738 ---------- 

739 missing : `_DatasetDict` 

740 Input datasets that have not been found. 

741 datastore_records : `dict` 

742 Records from the datastore to export with quanta.

743 

744 Returns 

745 ------- 

746 nodes : `set` of `Quantum` 

747 The `Quantum` elements corresponding to this task. 

748 """ 

749 outputs = set() 

750 for q in self.quanta.values(): 

751 try: 

752 tmpQuanta = q.makeQuantum(datastore_records) 

753 outputs.add(tmpQuanta) 

754 except (NoWorkFound, FileNotFoundError) as exc: 

755 if not missing.isdisjoint(q.inputs): 

756 # This is a node that is known to be pruned later and 

757 # should be left in even though some follow up queries 

758 # fail. This allows the pruning to start from this quantum 

759 # with known issues, and prune other nodes it touches. 

760 inputs = q.inputs.unpackMultiRefs(self.storage_classes) 

761 inputs.update(q.prerequisites.unpackMultiRefs(self.storage_classes)) 

762 tmpQuantum = Quantum( 

763 taskName=q.task.taskDef.taskName, 

764 taskClass=q.task.taskDef.taskClass, 

765 dataId=q.dataId, 

766 initInputs=q.task.initInputs.unpackSingleRefs(self.storage_classes), 

767 inputs=inputs, 

768 outputs=q.outputs.unpackMultiRefs(self.storage_classes), 

769 ) 

770 outputs.add(tmpQuantum) 

771 else: 

772 raise exc 

773 return outputs 

774 

775 

776class _DatasetIdMaker: 

777 """Helper class which generates random dataset UUIDs for unresolved 

778 datasets. 

779 """ 

780 

781 def __init__(self, run: str): 

782 self.run = run 

783 # Cache of dataset refs generated so far. 

784 self.resolved: dict[tuple[DatasetType, DataCoordinate], DatasetRef] = {} 

785 

786 def resolveRef(self, dataset_type: DatasetType, data_id: DataCoordinate) -> DatasetRef: 

787 # For components we need their parent dataset ID. 

788 if dataset_type.isComponent(): 

789 parent_type = dataset_type.makeCompositeDatasetType() 

790 # Parent should be resolved if this is an existing input, or it 

791 # should be in the cache already if it is an intermediate. 

792 key = parent_type, data_id 

793 if key not in self.resolved: 

794 raise ValueError(f"Composite dataset is missing from cache: {parent_type} {data_id}") 

795 parent_ref = self.resolved[key] 

796 return DatasetRef(dataset_type, data_id, id=parent_ref.id, run=parent_ref.run, conform=False) 

797 

798 key = dataset_type, data_id 

799 if (resolved := self.resolved.get(key)) is None: 

800 resolved = DatasetRef(dataset_type, data_id, run=self.run, conform=False) 

801 self.resolved[key] = resolved 

802 return resolved 

803 

804 def resolveDict( 

805 self, dataset_type: DatasetType, refs: dict[DataCoordinate, _RefHolder], is_output: bool 

806 ) -> None: 

807 """Resolve all unresolved references in the provided dictionary.""" 

808 for data_id, holder in refs.items(): 

809 if holder.ref is None or (is_output and holder.ref.run != self.run): 

810 holder.ref = self.resolveRef(holder.dataset_type, data_id) 

811 

812 
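# Editorial sketch (not part of the original module): _DatasetIdMaker hands
# out one resolved ref per (dataset type, data ID) pair, so an intermediate
# that appears as both an output and an input gets the same dataset ID.
# Assuming ``dataset_type`` is a non-component DatasetType and ``data_id`` a
# DataCoordinate (the run name below is made up):
#
#     maker = _DatasetIdMaker(run="u/example/run")
#     ref1 = maker.resolveRef(dataset_type, data_id)
#     ref2 = maker.resolveRef(dataset_type, data_id)
#     assert ref1 is ref2  # cached, not regenerated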

813@dataclass 

814class _PipelineScaffolding: 

815 """A helper data structure that organizes the information involved in 

816 constructing a `QuantumGraph` for a `Pipeline`. 

817 

818 Parameters 

819 ---------- 

820 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

821 Sequence of tasks from which a graph is to be constructed. Must 

822 have nested task classes already imported. 

823 universe : `~lsst.daf.butler.DimensionUniverse` 

824 Universe of all possible dimensions. 

825 

826 Notes 

827 ----- 

828 The scaffolding data structure contains nested data structures for both 

829 tasks (`_TaskScaffolding`) and datasets (`_DatasetDict`). The dataset 

830 data structures are shared between the pipeline-level structure (which 

831 aggregates all datasets and categorizes them from the perspective of the 

832 complete pipeline) and the individual tasks that use them as inputs and 

833 outputs. 

834 

835 `QuantumGraph` construction proceeds in four steps, with each corresponding 

836 to a different `_PipelineScaffolding` method: 

837 

838 1. When `_PipelineScaffolding` is constructed, we extract and categorize 

839 the DatasetTypes used by the pipeline (delegating to 

840 `PipelineDatasetTypes.fromPipeline`), then use these to construct the 

841 nested `_TaskScaffolding` and `_DatasetDict` objects. 

842 

843 2. In `connectDataIds`, we construct and run the "Big Join Query", which 

844 returns related tuples of all dimensions used to identify any regular 

845 input, output, and intermediate datasets (not prerequisites). We then 

846 iterate over these tuples of related dimensions, identifying the subsets 

847 that correspond to distinct data IDs for each task and dataset type, 

848 and then create `_QuantumScaffolding` objects. 

849 

850 3. In `resolveDatasetRefs`, we run follow-up queries against all of the 

851 dataset data IDs previously identified, transforming unresolved 

852 DatasetRefs into resolved DatasetRefs where appropriate. We then look 

853 up prerequisite datasets for all quanta. 

854 

855 4. In `makeQuantumGraph`, we construct a `QuantumGraph` from the lists of 

856 per-task `_QuantumScaffolding` objects. 

857 """ 

858 
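# Editorial sketch (not part of the original module): the four steps above as
# a caller (such as the GraphBuilder defined later in this module) might
# drive them, assuming ``pipeline``, ``registry``, ``collections``, ``run``
# and ``user_query`` are supplied by that caller:
#
#     scaffolding = _PipelineScaffolding(pipeline, registry=registry)  # step 1
#     # A real caller passes an externalDataId that includes e.g. the
#     # instrument; an empty data ID keeps the sketch minimal.
#     empty_data_id = DataCoordinate.makeEmpty(registry.dimensions)
#     with scaffolding.connectDataIds(
#         registry, collections, user_query, empty_data_id
#     ) as commonDataIds:                                               # step 2
#         scaffolding.resolveDatasetRefs(
#             registry, collections, run, commonDataIds
#         )                                                             # step 3
#     # step 4: scaffolding.makeQuantumGraph(...) then assembles the graph.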

859 def __init__(self, pipeline: Pipeline | Iterable[TaskDef], *, registry: Registry): 

860 _LOG.debug("Initializing data structures for QuantumGraph generation.") 

861 self.tasks = [] 

862 # Aggregate and categorize the DatasetTypes in the Pipeline. 

863 datasetTypes = PipelineDatasetTypes.fromPipeline(pipeline, registry=registry) 

864 # Construct dictionaries that map those DatasetTypes to structures 

865 # that will (later) hold additional information about them. 

866 for attr in ( 

867 "initInputs", 

868 "initIntermediates", 

869 "initOutputs", 

870 "inputs", 

871 "intermediates", 

872 "outputs", 

873 ): 

874 setattr( 

875 self, 

876 attr, 

877 _DatasetDict.fromDatasetTypes(getattr(datasetTypes, attr), universe=registry.dimensions), 

878 ) 

879 self.prerequisites = _DatasetDictMulti.fromDatasetTypes( 

880 datasetTypes.prerequisites, universe=registry.dimensions 

881 ) 

882 self.missing = _DatasetDict(universe=registry.dimensions) 

883 self.defaultDatasetQueryConstraints = datasetTypes.queryConstraints 

884 # Aggregate all dimensions for all non-init, non-prerequisite 

885 # DatasetTypes. These are the ones we'll include in the big join 

886 # query. 

887 self.dimensions = self.inputs.dimensions.union(self.intermediates.dimensions, self.outputs.dimensions) 

888 # Construct scaffolding nodes for each Task, and add backreferences 

889 # to the Task from each DatasetScaffolding node. 

890 # Note that there's only one scaffolding node for each DatasetType, 

891 # shared by _PipelineScaffolding and all _TaskScaffoldings that 

892 # reference it. 

893 if isinstance(pipeline, Pipeline): 

894 pipeline = pipeline.toExpandedPipeline() 

895 self.tasks = [ 

896 _TaskScaffolding(taskDef=taskDef, parent=self, datasetTypes=taskDatasetTypes) 

897 for taskDef, taskDatasetTypes in zip(pipeline, datasetTypes.byTask.values(), strict=True) 

898 ] 

899 

900 def __repr__(self) -> str: 

901 # Default dataclass-injected __repr__ gets caught in an infinite loop 

902 # because of back-references. 

903 return f"_PipelineScaffolding(tasks={self.tasks}, ...)" 

904 

905 tasks: list[_TaskScaffolding] 

906 """Scaffolding data structures for each task in the pipeline 

907 (`list` of `_TaskScaffolding`). 

908 """ 

909 

910 initInputs: _DatasetDict 

911 """Datasets consumed but not produced when constructing the tasks in this 

912 pipeline (`_DatasetDict`). 

913 """ 

914 

915 initIntermediates: _DatasetDict 

916 """Datasets that are both consumed and produced when constructing the tasks 

917 in this pipeline (`_DatasetDict`). 

918 """ 

919 

920 initOutputs: _DatasetDict 

921 """Datasets produced but not consumed when constructing the tasks in this 

922 pipeline (`_DatasetDict`). 

923 """ 

924 

925 inputs: _DatasetDict 

926 """Datasets that are consumed but not produced when running this pipeline 

927 (`_DatasetDict`). 

928 """ 

929 

930 intermediates: _DatasetDict 

931 """Datasets that are both produced and consumed when running this pipeline 

932 (`_DatasetDict`). 

933 """ 

934 

935 outputs: _DatasetDict 

936 """Datasets produced but not consumed when when running this pipeline 

937 (`_DatasetDict`). 

938 """ 

939 

940 prerequisites: _DatasetDictMulti 

941 """Datasets that are consumed when running this pipeline and looked up 

942 per-Quantum when generating the graph (`_DatasetDictMulti`). 

943 """ 

944 

945 defaultDatasetQueryConstraints: NamedValueSet[DatasetType] 

946 """Datasets that should be used as constraints in the initial query, 

947 according to tasks (`~lsst.daf.butler.NamedValueSet`). 

948 """ 

949 

950 dimensions: DimensionGraph 

951 """All dimensions used by any regular input, intermediate, or output 

952 (not prerequisite) dataset; the set of dimensions used in the "Big Join

953 Query" (`~lsst.daf.butler.DimensionGraph`). 

954 

955 This is required to be a superset of all task quantum dimensions. 

956 """ 

957 

958 missing: _DatasetDict 

959 """Datasets whose existence was originally predicted but were not 

960 actually found. 

961 

962 Quanta that require these datasets as inputs will be pruned (recursively) 

963 when actually constructing a `QuantumGraph` object. 

964 

965 These are currently populated only when the "initial dataset query 

966 constraint" does not include all overall-input dataset types, and hence the 

967 initial data ID query can include data IDs that it should not. 

968 """ 

969 

970 globalInitOutputs: _DatasetDict | None = None 

971 """Per-pipeline global output datasets (e.g. packages) (`_DatasetDict`) 

972 """ 

973 

974 @contextmanager 

975 def connectDataIds( 

976 self, 

977 registry: Registry, 

978 collections: Any, 

979 userQuery: str | None, 

980 externalDataId: DataCoordinate, 

981 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

982 bind: Mapping[str, Any] | None = None, 

983 ) -> Iterator[DataCoordinateQueryResults]: 

984 """Query for the data IDs that connect nodes in the `QuantumGraph`. 

985 

986 This method populates `_TaskScaffolding.dataIds` and 

987 `_DatasetScaffolding.dataIds` (except for those in `prerequisites`). 

988 

989 Parameters 

990 ---------- 

991 registry : `lsst.daf.butler.Registry` 

992 Registry for the data repository; used for all data ID queries. 

993 collections 

994 Expressions representing the collections to search for input 

995 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

996 userQuery : `str` or `None` 

997 User-provided expression to limit the data IDs processed. 

998 externalDataId : `~lsst.daf.butler.DataCoordinate` 

999 Externally-provided data ID that should be used to restrict the 

1000 results, just as if these constraints had been included via ``AND`` 

1001 in ``userQuery``. This includes (at least) any instrument named 

1002 in the pipeline definition. 

1003 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1004 The query constraint variant that should be used to constrain the

1005 query based on dataset existence, defaults to

1006 `DatasetQueryConstraintVariant.ALL`. 

1007 bind : `~collections.abc.Mapping`, optional 

1008 Mapping containing literal values that should be injected into the 

1009 ``userQuery`` expression, keyed by the identifiers they replace. 

1010 

1011 Returns 

1012 ------- 

1013 commonDataIds : \ 

1014 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

1015 An interface to a database temporary table containing all data IDs 

1016 that will appear in this `QuantumGraph`. Returned inside a 

1017 context manager, which will drop the temporary table at the end of 

1018 the `with` block in which this method is called. 

1019 """ 

1020 _LOG.debug("Building query for data IDs.") 

1021 # Initialization datasets always have empty data IDs. 

1022 emptyDataId = DataCoordinate.makeEmpty(registry.dimensions) 

1023 for datasetType, refs in itertools.chain( 

1024 self.initInputs.items(), 

1025 self.initIntermediates.items(), 

1026 self.initOutputs.items(), 

1027 ): 

1028 refs[emptyDataId] = _RefHolder(datasetType) 

1029 # Run one big query for the data IDs for task dimensions and regular 

1030 # inputs and outputs. We limit the query to only dimensions that are 

1031 # associated with the input dataset types, but don't (yet) try to 

1032 # obtain the dataset_ids for those inputs. 

1033 _LOG.debug( 

1034 "Submitting data ID query over dimensions %s and materializing results.", 

1035 list(self.dimensions.names), 

1036 ) 

1037 queryArgs: dict[str, Any] = { 

1038 "dimensions": self.dimensions, 

1039 "where": userQuery, 

1040 "dataId": externalDataId, 

1041 "bind": bind, 

1042 } 

1043 if datasetQueryConstraint == DatasetQueryConstraintVariant.ALL: 

1044 _LOG.debug( 

1045 "Constraining graph query using default of %s.", 

1046 list(self.defaultDatasetQueryConstraints.names), 

1047 ) 

1048 queryArgs["datasets"] = list(self.defaultDatasetQueryConstraints) 

1049 queryArgs["collections"] = collections 

1050 elif datasetQueryConstraint == DatasetQueryConstraintVariant.OFF: 

1051 _LOG.debug("Not using dataset existence to constrain query.") 

1052 elif datasetQueryConstraint == DatasetQueryConstraintVariant.LIST: 

1053 constraint = set(datasetQueryConstraint) 

1054 inputs = {k.name: k for k in self.inputs} 

1055 if remainder := constraint.difference(inputs.keys()): 

1056 raise ValueError( 

1057 f"{remainder} dataset type(s) specified as a graph constraint, but" 

1058 f" do not appear as an input to the specified pipeline: {inputs.keys()}" 

1059 ) 

1060 _LOG.debug(f"Constraining graph query using {constraint}") 

1061 queryArgs["datasets"] = [typ for name, typ in inputs.items() if name in constraint] 

1062 queryArgs["collections"] = collections 

1063 else: 

1064 raise ValueError( 

1065 f"Unable to handle type {datasetQueryConstraint} given as datasetQueryConstraint." 

1066 ) 

1067 

1068 if "datasets" in queryArgs: 

1069 for i, dataset_type in enumerate(queryArgs["datasets"]): 

1070 if dataset_type.isComponent(): 

1071 queryArgs["datasets"][i] = dataset_type.makeCompositeDatasetType() 

1072 

1073 with registry.queryDataIds(**queryArgs).materialize() as commonDataIds: 

1074 _LOG.debug("Expanding data IDs.") 

1075 commonDataIds = commonDataIds.expanded() 

1076 _LOG.debug("Iterating over query results to associate quanta with datasets.") 

1077 # Iterate over query results, populating data IDs for datasets and 

1078 # quanta and then connecting them to each other. 

1079 n = 0 # Must count in loop since this is a lazy iterable. 

1080 for commonDataId in commonDataIds: 

1081 # Create DatasetRefs for all DatasetTypes from this result row, 

1082 # noting that we might have created some already. 

1083 # We remember both those that already existed and those that we 

1084 # create now. 

1085 n += 1 

1086 refsForRow = {} 

1087 dataIdCacheForRow: dict[DimensionGraph, DataCoordinate] = {} 

1088 for datasetType, refs in itertools.chain( 

1089 self.inputs.items(), 

1090 self.intermediates.items(), 

1091 self.outputs.items(), 

1092 ): 

1093 datasetDataId: DataCoordinate | None 

1094 if (datasetDataId := dataIdCacheForRow.get(datasetType.dimensions)) is None: 

1095 datasetDataId = commonDataId.subset(datasetType.dimensions) 

1096 dataIdCacheForRow[datasetType.dimensions] = datasetDataId 

1097 ref_holder = refs.get(datasetDataId) 

1098 if ref_holder is None: 

1099 ref_holder = _RefHolder(datasetType) 

1100 refs[datasetDataId] = ref_holder 

1101 refsForRow[datasetType.name] = ref_holder 

1102 # Create _QuantumScaffolding objects for all tasks from this 

1103 # result row, noting that we might have created some already. 

1104 for task in self.tasks: 

1105 quantumDataId = commonDataId.subset(task.dimensions) 

1106 quantum = task.quanta.get(quantumDataId) 

1107 if quantum is None: 

1108 quantum = _QuantumScaffolding(task=task, dataId=quantumDataId) 

1109 task.quanta[quantumDataId] = quantum 

1110 # Whether this is a new quantum or an existing one, we can 

1111 # now associate the DatasetRefs for this row with it. The 

1112 # fact that a Quantum data ID and a dataset data ID both 

1113 # came from the same result row is what tells us they 

1114 # should be associated. 

1115 # Many of these associations will be duplicates (because

1116 # another query row that differed from this one only in 

1117 # irrelevant dimensions already added them), and we use 

1118 # sets to skip. 

1119 for datasetType in task.inputs: 

1120 dataId = dataIdCacheForRow[datasetType.dimensions] 

1121 ref_holder = refsForRow[datasetType.name] 

1122 quantum.inputs[datasetType.name][dataId] = ref_holder 

1123 for datasetType in task.outputs: 

1124 dataId = dataIdCacheForRow[datasetType.dimensions] 

1125 ref_holder = refsForRow[datasetType.name] 

1126 quantum.outputs[datasetType.name][dataId] = ref_holder 

1127 if n == 0: 

1128 _LOG.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.") 

1129 emptiness_explained = False 

1130 for message in commonDataIds.explain_no_results(): 

1131 _LOG.critical(message) 

1132 emptiness_explained = True 

1133 if not emptiness_explained: 

1134 _LOG.critical( 

1135 "To reproduce this query for debugging purposes, run " 

1136 "Registry.queryDataIds with these arguments:" 

1137 ) 

1138 # We could just repr() the queryArgs dict to get something 

1139 # the user could make sense of, but it's friendlier to 

1140 # put these args in an easier-to-construct equivalent form 

1141 # so they can read it more easily and copy and paste into 

1142 # a Python terminal. 

1143 _LOG.critical(" dimensions=%s,", list(queryArgs["dimensions"].names)) 

1144 _LOG.critical(" dataId=%s,", queryArgs["dataId"].byName()) 

1145 if queryArgs["where"]: 

1146 _LOG.critical(" where=%s,", repr(queryArgs["where"])) 

1147 if "datasets" in queryArgs: 

1148 _LOG.critical(" datasets=%s,", [t.name for t in queryArgs["datasets"]]) 

1149 if "collections" in queryArgs: 

1150 _LOG.critical(" collections=%s,", list(queryArgs["collections"])) 

1151 _LOG.debug("Finished processing %d rows from data ID query.", n) 

1152 yield commonDataIds 

1153 

1154 def resolveDatasetRefs( 

1155 self, 

1156 registry: Registry, 

1157 collections: Any, 

1158 run: str, 

1159 commonDataIds: DataCoordinateQueryResults, 

1160 *, 

1161 skipExistingIn: Any = None, 

1162 clobberOutputs: bool = True, 

1163 constrainedByAllDatasets: bool = True, 

1164 ) -> None: 

1165 """Perform follow up queries for each dataset data ID produced in 

1166 `fillDataIds`. 

1167 

1168 This method populates `_DatasetScaffolding.refs` (except for those in 

1169 `prerequisites`). 

1170 

1171 Parameters 

1172 ---------- 

1173 registry : `lsst.daf.butler.Registry` 

1174 Registry for the data repository; used for all data ID queries. 

1175 collections 

1176 Expressions representing the collections to search for input 

1177 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1178 run : `str` 

1179 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1180 output datasets, if it already exists. 

1181 commonDataIds : \ 

1182 `lsst.daf.butler.registry.queries.DataCoordinateQueryResults` 

1183 Result of a previous call to `connectDataIds`. 

1184 skipExistingIn 

1185 Expressions representing the collections to search for existing 

1186 output datasets that should be skipped. See 

1187 :ref:`daf_butler_ordered_collection_searches` for allowed types. 

1188 `None` or empty string/sequence disables skipping. 

1189 clobberOutputs : `bool`, optional 

1190 If `True` (default), allow quanta to be created even if outputs exist;

1191 this requires the same behavior to be enabled when

1192 executing. If ``skipExistingIn`` is not `None`, completed quanta 

1193 (those with metadata, or all outputs if there is no metadata 

1194 dataset configured) will be skipped rather than clobbered. 

1195 constrainedByAllDatasets : `bool`, optional 

1196 Indicates if the commonDataIds were generated with a constraint on 

1197 all dataset types. 

1198 

1199 Raises 

1200 ------ 

1201 OutputExistsError 

1202 Raised if an output dataset already exists in the output run 

1203 and ``skipExistingIn`` does not include output run, or if only 

1204 some outputs are present and ``clobberOutputs`` is `False`. 

1205 """ 

1206 # Run may be provided but it does not have to exist; in that case we

1207 # use it for resolving references but don't check it for existing refs. 

1208 run_exists = False 

1209 if run: 

1210 with contextlib.suppress(MissingCollectionError): 

1211 run_exists = bool(registry.queryCollections(run)) 

1212 

1213 skip_collections_wildcard: CollectionWildcard | None = None 

1214 skipExistingInRun = False 

1215 if skipExistingIn: 

1216 skip_collections_wildcard = CollectionWildcard.from_expression(skipExistingIn) 

1217 if run_exists: 

1218 # As an optimization, check the explicit list of names first.

1219 skipExistingInRun = run in skip_collections_wildcard.strings 

1220 if not skipExistingInRun: 

1221 # Need to flatten it and check again.

1222 skipExistingInRun = run in registry.queryCollections( 

1223 skipExistingIn, 

1224 collectionTypes=CollectionType.RUN, 

1225 ) 

1226 

1227 idMaker = _DatasetIdMaker(run) 

1228 

1229 resolvedRefQueryResults: Iterable[DatasetRef] 

1230 

1231 # Updating constrainedByAllDatasets here is not ideal, but we have a 

1232 # few different code paths that each transfer different pieces of 

1233 # information about what dataset query constraints were applied here, 

1234 # and none of them has the complete picture until we get here. We're 

1235 # long overdue for a QG generation rewrite that will make this go away 

1236 # entirely anyway. 

1237 constrainedByAllDatasets = ( 

1238 constrainedByAllDatasets and self.defaultDatasetQueryConstraints == self.inputs.keys() 

1239 ) 

1240 

1241 # Look up [init] intermediate and output datasets in the output 

1242 # collection, if there is an output collection. 

1243 if run_exists or skip_collections_wildcard is not None: 

1244 for datasetType, refs in itertools.chain( 

1245 self.initIntermediates.items(), 

1246 self.initOutputs.items(), 

1247 self.intermediates.items(), 

1248 self.outputs.items(), 

1249 ): 

1250 _LOG.debug( 

1251 "Resolving %d datasets for intermediate and/or output dataset %s.", 

1252 len(refs), 

1253 datasetType.name, 

1254 ) 

1255 isInit = datasetType in self.initIntermediates or datasetType in self.initOutputs 

1256 subset = commonDataIds.subset(datasetType.dimensions, unique=True) 

1257 # TODO: this assert incorrectly bans component inputs; 

1258 # investigate on DM-33027. 

1259 # assert not datasetType.isComponent(), \ 

1260 # "Output datasets cannot be components." 

1261 # 

1262 # Instead we have to handle them manually to avoid a 

1263 # deprecation warning, but it is at least confusing and 

1264 # possibly a bug for components to appear here at all. 

1265 if datasetType.isComponent(): 

1266 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1267 component = datasetType.component() 

1268 else: 

1269 parent_dataset_type = datasetType 

1270 component = None 

1271 

1272 # Look at the RUN collection first.

1273 if run_exists: 

1274 try: 

1275 resolvedRefQueryResults = subset.findDatasets( 

1276 parent_dataset_type, collections=run, findFirst=True 

1277 ) 

1278 except MissingDatasetTypeError: 

1279 resolvedRefQueryResults = [] 

1280 for resolvedRef in resolvedRefQueryResults: 

1281 # TODO: we could easily support per-DatasetType 

1282 # skipExisting and I could imagine that being useful - 

1283 # it's probably required in order to support writing 

1284 # initOutputs before QuantumGraph generation. 

1285 assert resolvedRef.dataId in refs 

1286 if not (skipExistingInRun or isInit or clobberOutputs): 

1287 raise OutputExistsError( 

1288 f"Output dataset {datasetType.name} already exists in " 

1289 f"output RUN collection '{run}' with data ID" 

1290 f" {resolvedRef.dataId}." 

1291 ) 

1292 # To resolve all outputs we have to remember existing 

1293 # ones to avoid generating new dataset IDs for them. 

1294 refs[resolvedRef.dataId].ref = ( 

1295 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1296 ) 

1297 

1298 # Also check skipExistingIn; the case where the RUN collection

1299 # is in it is handled above.

1300 if skip_collections_wildcard is not None: 

1301 try: 

1302 resolvedRefQueryResults = subset.findDatasets( 

1303 parent_dataset_type, 

1304 collections=skip_collections_wildcard, 

1305 findFirst=True, 

1306 ) 

1307 except MissingDatasetTypeError: 

1308 resolvedRefQueryResults = [] 

1309 for resolvedRef in resolvedRefQueryResults: 

1310 if resolvedRef.dataId not in refs: 

1311 continue 

1312 refs[resolvedRef.dataId].ref = ( 

1313 resolvedRef.makeComponentRef(component) if component is not None else resolvedRef 

1314 ) 

1315 

1316 # Look up input and initInput datasets in the input collection(s). We 

1317 # accumulate datasets in self.missing, if the common data IDs were not 

1318 # constrained on dataset type existence. 

1319 for datasetType, refs in itertools.chain(self.initInputs.items(), self.inputs.items()): 

1320 _LOG.debug( 

1321 "Resolving %d datasets for input dataset %s.", 

1322 len(refs), 

1323 datasetType.name, 

1324 ) 

1325 if datasetType.isComponent(): 

1326 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1327 component = datasetType.component() 

1328 else: 

1329 parent_dataset_type = datasetType 

1330 component = None 

1331 missing_for_dataset_type: dict[DataCoordinate, _RefHolder] = {} 

1332 try: 

1333 resolvedRefQueryResults = commonDataIds.subset( 

1334 datasetType.dimensions, unique=True 

1335 ).findDatasets(parent_dataset_type, collections=collections, findFirst=True) 

1336 except MissingDatasetTypeError: 

1337 resolvedRefQueryResults = [] 

1338 dataIdsNotFoundYet = set(refs.keys()) 

1339 for resolvedRef in resolvedRefQueryResults: 

1340 dataIdsNotFoundYet.discard(resolvedRef.dataId) 

1341 if resolvedRef.dataId not in refs: 

1342 continue 

1343 refs[resolvedRef.dataId].ref = ( 

1344 resolvedRef if component is None else resolvedRef.makeComponentRef(component) 

1345 ) 

1346 if dataIdsNotFoundYet: 

1347 if constrainedByAllDatasets: 

1348 raise RuntimeError( 

1349 f"{len(dataIdsNotFoundYet)} dataset(s) of type " 

1350 f"'{datasetType.name}' was/were present in a previous " 

1351 "query, but could not be found now. " 

1352 "This is either a logic bug in QuantumGraph generation " 

1353 "or the input collections have been modified since " 

1354 "QuantumGraph generation began." 

1355 ) 

1356 elif not datasetType.dimensions: 

1357 raise RuntimeError( 

1358 f"Dataset {datasetType.name!r} (with no dimensions) could not be found in " 

1359 f"collections {collections}." 

1360 ) 

1361 else: 

1362 # If the common dataIds were not constrained using all the 

1363 # input dataset types, it is possible that some data ids 

1364 # found don't correspond to existing datasets. Mark these 

1365 # for later pruning from the quantum graph. 

1366 for k in dataIdsNotFoundYet: 

1367 missing_for_dataset_type[k] = refs[k] 

1368 if missing_for_dataset_type: 

1369 self.missing[datasetType] = missing_for_dataset_type 

1370 

1371 # Resolve the missing refs, just so they look like all of the others; 

1372 # in the end other code will make sure they never appear in the QG. 

1373 for dataset_type, refDict in self.missing.items(): 

1374 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1375 

1376 # Copy the resolved DatasetRefs to the _QuantumScaffolding objects, 

1377 # replacing the unresolved refs there, and then look up prerequisites. 

1378 for task in self.tasks: 

1379 _LOG.debug( 

1380 "Applying resolutions and finding prerequisites for %d quanta of task with label '%s'.", 

1381 len(task.quanta), 

1382 task.taskDef.label, 

1383 ) 

1384 # The way iterConnections is designed makes it impossible to 

1385 # annotate precisely enough to satisfy MyPy here. 

1386 lookupFunctions = { 

1387 c.name: c.lookupFunction # type: ignore 

1388 for c in iterConnections(task.taskDef.connections, "prerequisiteInputs") 

1389 if c.lookupFunction is not None # type: ignore 

1390 } 
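# A hedged sketch (hypothetical connections class and names, not part of
# this module) of where entries in this mapping come from: a PipelineTask
# may attach a lookupFunction to a prerequisiteInput connection, and that
# function is invoked further below with exactly these arguments.
#
#     def _find_my_refcat(dataset_type, registry, quantum_data_id, collections):
#         # Must return an iterable of resolved DatasetRef objects.
#         return registry.queryDatasets(
#             dataset_type,
#             collections=collections,
#             dataId=quantum_data_id,
#             findFirst=True,
#         )
#
#     # In the hypothetical task's connections class:
#     # refCat = connectionTypes.PrerequisiteInput(..., lookupFunction=_find_my_refcat)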

1391 dataIdsFailed = [] 

1392 dataIdsSucceeded = [] 

1393 for quantum in task.quanta.values(): 

1394 # Process output datasets only if skipExistingIn is not None

1395 # or there is a run to look for outputs in and clobberOutputs 

1396 # is True. Note that if skipExistingIn is None, any output 

1397 # datasets that already exist would have already caused an 

1398 # exception to be raised. 

1399 if skip_collections_wildcard is not None or (run_exists and clobberOutputs): 

1400 resolvedRefs = [] 

1401 unresolvedDataIds = [] 

1402 haveMetadata = False 

1403 for datasetType, originalRefs in quantum.outputs.items(): 

1404 for dataId, ref in task.outputs.extract(datasetType, originalRefs.keys()): 

1405 if ref is not None: 

1406 resolvedRefs.append(ref) 

1407 originalRefs[dataId].ref = ref 

1408 if datasetType.name == task.taskDef.metadataDatasetName: 

1409 haveMetadata = True 

1410 else: 

1411 unresolvedDataIds.append((datasetType, dataId)) 

1412 if resolvedRefs: 

1413 if haveMetadata or not unresolvedDataIds: 

1414 dataIdsSucceeded.append(quantum.dataId) 

1415 if skip_collections_wildcard is not None: 

1416 continue 

1417 else: 

1418 dataIdsFailed.append(quantum.dataId) 

1419 if not clobberOutputs and run_exists: 

1420 raise OutputExistsError( 

1421 f"Quantum {quantum.dataId} of task with label " 

1422 f"'{quantum.task.taskDef.label}' has some outputs that exist " 

1423 f"({resolvedRefs}) " 

1424 f"and others that don't ({unresolvedDataIds}), with no metadata output, " 

1425 "and clobbering outputs was not enabled." 

1426 ) 

1427 # Update the input DatasetRefs to the resolved ones we already 

1428 # searched for. 

1429 for datasetType, input_refs in quantum.inputs.items(): 

1430 for data_id, ref in task.inputs.extract(datasetType, input_refs.keys()): 

1431 input_refs[data_id].ref = ref 

1432 # Look up prerequisite datasets in the input collection(s). 

1433 # These may have dimensions that extend beyond those we queried 

1434 # for originally, because we want to permit those data ID 

1435 # values to differ across quanta and dataset types. 

1436 for datasetType in task.prerequisites: 

1437 if datasetType.isComponent(): 

1438 parent_dataset_type = datasetType.makeCompositeDatasetType() 

1439 component = datasetType.component() 

1440 else: 

1441 parent_dataset_type = datasetType 

1442 component = None 

1443 lookupFunction = lookupFunctions.get(datasetType.name) 

1444 if lookupFunction is not None: 

1445 # PipelineTask has provided its own function to do the 

1446 # lookup. This always takes precedence. 

1447 prereq_refs = list(lookupFunction(datasetType, registry, quantum.dataId, collections)) 

1448 elif ( 

1449 datasetType.isCalibration() 

1450 and datasetType.dimensions <= quantum.dataId.graph 

1451 and quantum.dataId.graph.temporal 

1452 ): 

1453 # This is a master calibration lookup, which we have to 

1454 # handle specially because the query system can't do a 

1455 # temporal join on a non-dimension-based timespan yet. 
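# As a hedged illustration (data ID values and dataset type name are
# hypothetical): for a quantum with data ID
# {instrument: "HSC", exposure: 903334, detector: 16}, the lookup below
# behaves like
#
#     registry.findDataset(
#         "bias",
#         instrument="HSC", detector=16,
#         collections=collections,
#         timespan=quantum.dataId.timespan,
#     )
#
# i.e. it selects the single calibration whose validity range contains
# the exposure's timespan.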

1456 timespan = quantum.dataId.timespan 

1457 try: 

1458 prereq_ref = registry.findDataset( 

1459 parent_dataset_type, 

1460 quantum.dataId, 

1461 collections=collections, 

1462 timespan=timespan, 

1463 ) 

1464 if prereq_ref is not None: 

1465 if component is not None: 

1466 prereq_ref = prereq_ref.makeComponentRef(component) 

1467 prereq_refs = [prereq_ref] 

1468 else: 

1469 prereq_refs = [] 

1470 except (KeyError, MissingDatasetTypeError): 

1471 # This dataset type is not present in the registry, 

1472 # which just means there are no datasets here. 

1473 prereq_refs = [] 

1474 else: 

1475 where = "" 

1476 bind: dict[str, Any] = {} 

1477 if not quantum.dataId.graph.spatial: 

1478 # This has skypix dimensions (probably a reference 

1479 # catalog), but the quantum's data ID is not spatial

1480 # (it's probably a full-survey sequence point). 

1481 # Try to limit the spatial extent to the union of 

1482 # the spatial extent of the inputs and outputs. 

1483 for dimension in datasetType.dimensions: 

1484 if isinstance(dimension, SkyPixDimension): 

1485 extent = quantum.computeSpatialExtent(dimension.pixelization) 

1486 pixels: list[int] = [] 

1487 for begin, end in extent: 

1488 pixels.extend(range(begin, end)) 

1489 if not pixels: 

1490 _LOG.warning( 

1491 "Prerequisite input %r to task %r may be unbounded.", 

1492 datasetType.name, 

1493 quantum.task.taskDef.label, 

1494 ) 

1495 else: 

1496 bind["quantum_extent"] = pixels 

1497 where = f"{dimension.name} IN (quantum_extent)" 

1498 break 
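# Hedged worked example (hypothetical numbers): if the skypix dimension
# is htm7 and computeSpatialExtent returns a RangeSet covering the pixel
# ranges [1000, 1003) and [2500, 2501), the loop above produces
#
#     bind = {"quantum_extent": [1000, 1001, 1002, 2500]}
#     where = "htm7 IN (quantum_extent)"
#
# which the queryDatasets call below uses to restrict the prerequisite
# search to the quantum's spatial footprint.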

1499 # Most general case. 

1500 prereq_refs = [ 

1501 prereq_ref if component is None else prereq_ref.makeComponentRef(component) 

1502 for prereq_ref in registry.queryDatasets( 

1503 parent_dataset_type, 

1504 collections=collections, 

1505 dataId=quantum.dataId, 

1506 findFirst=True, 

1507 where=where, 

1508 bind=bind, 

1509 ).expanded() 

1510 ] 

1511 

1512 for ref in prereq_refs: 

1513 if ref is not None: 

1514 quantum.prerequisites[datasetType][ref.dataId] = _RefHolder(datasetType, ref) 

1515 task.prerequisites[datasetType][ref.dataId].append(_RefHolder(datasetType, ref)) 

1516 

1517 # Resolve all quantum inputs and outputs. 

1518 for dataset_type, refDict in quantum.inputs.items(): 

1519 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1520 for dataset_type, refDict in quantum.outputs.items(): 

1521 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1522 

1523 # Actually remove any quanta that we decided to skip above. 

1524 if dataIdsSucceeded: 

1525 if skip_collections_wildcard is not None: 

1526 _LOG.debug( 

1527 "Pruning successful %d quanta for task with label '%s' because all of their " 

1528 "outputs exist or metadata was written successfully.", 

1529 len(dataIdsSucceeded), 

1530 task.taskDef.label, 

1531 ) 

1532 for dataId in dataIdsSucceeded: 

1533 del task.quanta[dataId] 

1534 elif clobberOutputs and run_exists: 

1535 _LOG.info( 

1536 "Found %d successful quanta for task with label '%s' " 

1537 "that will need to be clobbered during execution.", 

1538 len(dataIdsSucceeded), 

1539 task.taskDef.label, 

1540 ) 

1541 if dataIdsFailed: 

1542 if clobberOutputs and run_exists: 

1543 _LOG.info( 

1544 "Found %d failed/incomplete quanta for task with label '%s' " 

1545 "that will need to be clobbered during execution.", 

1546 len(dataIdsFailed), 

1547 task.taskDef.label, 

1548 ) 

1549 

1550 # Resolve task initInputs and initOutputs. 

1551 for dataset_type, refDict in task.initInputs.items(): 

1552 idMaker.resolveDict(dataset_type, refDict, is_output=False) 

1553 if task.quanta: 

1554 for dataset_type, refDict in task.initOutputs.items(): 

1555 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1556 else: 

1557 # If there are no quanta for this task remaining (because they 

1558 # all succeeded before and we're skipping those now), we do not 

1559 # resolve the init outputs as outputs; instead we might want to 

1560 # find them in the skip-existing-in collections, which

1561 # means from the pipeline perspective they're initInputs, not 

1562 # initIntermediates. They will be resolved by the tasks that 

1563 # use them as inputs, or not at all. 

1564 for dataset_type in task.initOutputs: 

1565 init_datasets = self.initIntermediates.pop(dataset_type, None) 

1566 if init_datasets is not None: 

1567 self.initInputs[dataset_type] = init_datasets 

1568 self.initOutputs.pop(dataset_type, None) 

1569 # Removing the initInputs of this task from the scaffolding 

1570 # data structures is trickier, because the same initInput may 

1571 # be used by multiple tasks. 

1572 # TODO: DM-38498: handle the above problem better. 

1573 

1574 # Collect initOutputs that do not belong to any task. 

1575 global_dataset_types: set[DatasetType] = set(self.initOutputs) 

1576 for task in self.tasks: 

1577 global_dataset_types -= set(task.initOutputs) 

1578 if global_dataset_types: 

1579 self.globalInitOutputs = _DatasetDict.fromSubset(global_dataset_types, self.initOutputs) 

1580 for dataset_type, refDict in self.globalInitOutputs.items(): 

1581 idMaker.resolveDict(dataset_type, refDict, is_output=True) 

1582 

1583 def makeQuantumGraph( 

1584 self, 

1585 registry: Registry, 

1586 metadata: Mapping[str, Any] | None = None, 

1587 datastore: Datastore | None = None, 

1588 ) -> QuantumGraph: 

1589 """Create a `QuantumGraph` from the quanta already present in 

1590 the scaffolding data structure. 

1591 

1592 Parameters 

1593 ---------- 

1594 registry : `lsst.daf.butler.Registry` 

1595 Registry for the data repository; used for all data ID queries. 

1596 metadata : `~collections.abc.Mapping` of `str` to primitives, optional 

1597 This is an optional parameter of extra data to carry with the 

1598 graph. Entries in this mapping should be serializable to

1599 JSON. 

1600 datastore : `~lsst.daf.butler.Datastore`, optional 

1601 If not `None` then fill datastore records in each generated 

1602 Quantum. 

1603 

1604 Returns 

1605 ------- 

1606 graph : `QuantumGraph` 

1607 The full `QuantumGraph`. 

1608 """ 

1609 datastore_records: Mapping[str, DatastoreRecordData] | None = None 

1610 if datastore is not None: 

1611 datastore_records = datastore.export_records( 

1612 ref.makeCompositeRef() if ref.isComponent() else ref 

1613 for ref in itertools.chain( 

1614 self.inputs.iter_resolved_refs(), 

1615 self.initInputs.iter_resolved_refs(), 

1616 self.prerequisites.iter_resolved_refs(), 

1617 ) 

1618 ) 

1619 

1620 graphInput: dict[TaskDef, set[Quantum]] = {} 

1621 for task in self.tasks: 

1622 if not task.quanta: 

1623 continue 

1624 qset = task.makeQuantumSet(missing=self.missing, datastore_records=datastore_records) 

1625 graphInput[task.taskDef] = qset 

1626 

1627 taskInitInputs = { 

1628 task.taskDef: task.initInputs.unpackSingleRefs(task.storage_classes).values() 

1629 for task in self.tasks 

1630 if task.quanta 

1631 } 

1632 taskInitOutputs = { 

1633 task.taskDef: task.initOutputs.unpackSingleRefs(task.storage_classes).values() 

1634 for task in self.tasks 

1635 if task.quanta 

1636 } 

1637 

1638 globalInitOutputs: list[DatasetRef] = [] 

1639 if self.globalInitOutputs is not None: 

1640 for refs_dict in self.globalInitOutputs.values(): 

1641 globalInitOutputs.extend(holder.resolved_ref for holder in refs_dict.values()) 

1642 

1643 graph = QuantumGraph( 

1644 graphInput, 

1645 metadata=metadata, 

1646 pruneRefs=list(self.missing.iter_resolved_refs()), 

1647 universe=self.dimensions.universe, 

1648 initInputs=taskInitInputs, 

1649 initOutputs=taskInitOutputs, 

1650 globalInitOutputs=globalInitOutputs, 

1651 registryDatasetTypes=self._get_registry_dataset_types(registry), 

1652 ) 

1653 return graph 

1654 
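# Hedged usage sketch (illustrative names only; "butler" and the output
# path are assumptions): after resolveDatasetRefs has run, a caller might
# build and persist the graph as
#
#     qgraph = scaffolding.makeQuantumGraph(
#         registry=butler.registry,
#         metadata={"user": "jdoe"},
#         datastore=butler.datastore,  # optional: embeds datastore records
#     )
#     qgraph.saveUri("pipeline.qgraph")
#
# where saveUri is the QuantumGraph serialization method from this package.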

1655 def _get_registry_dataset_types(self, registry: Registry) -> Iterable[DatasetType]: 

1656 """Make a list of all dataset types used by a graph as defined in 

1657 registry. 

1658 """ 

1659 chain: list[_DatasetDict | _DatasetDictMulti] = [ 

1660 self.initInputs, 

1661 self.initIntermediates, 

1662 self.initOutputs, 

1663 self.inputs, 

1664 self.intermediates, 

1665 self.outputs, 

1666 self.prerequisites, 

1667 ] 

1668 if self.globalInitOutputs is not None: 

1669 chain.append(self.globalInitOutputs) 

1670 

1671 # Collect names of all dataset types. 

1672 all_names: set[str] = {dstype.name for dstype in itertools.chain(*chain)} 

1673 dataset_types = {ds.name: ds for ds in registry.queryDatasetTypes(all_names)} 

1674 

1675 # Check for types that do not exist in registry yet: 

1676 # - inputs must exist 

1677 # - intermediates and outputs may not exist, but there must not be 

1678 # more than one definition (e.g. differing in storage class) 

1679 # - prerequisites may not exist; treat them the same as outputs here

1680 for dstype in itertools.chain(self.initInputs, self.inputs): 

1681 if dstype.name not in dataset_types: 

1682 raise MissingDatasetTypeError(f"Registry is missing an input dataset type {dstype}") 

1683 

1684 new_outputs: dict[str, set[DatasetType]] = defaultdict(set) 

1685 chain = [ 

1686 self.initIntermediates, 

1687 self.initOutputs, 

1688 self.intermediates, 

1689 self.outputs, 

1690 self.prerequisites, 

1691 ] 

1692 if self.globalInitOutputs is not None: 

1693 chain.append(self.globalInitOutputs) 

1694 for dstype in itertools.chain(*chain): 

1695 if dstype.name not in dataset_types: 

1696 new_outputs[dstype.name].add(dstype) 

1697 for name, dstypes in new_outputs.items(): 

1698 if len(dstypes) > 1: 

1699 raise ValueError( 

1700 "Pipeline contains multiple definitions for a dataset type " 

1701 f"which is not defined in registry yet: {dstypes}" 

1702 ) 

1703 elif len(dstypes) == 1: 

1704 dataset_types[name] = dstypes.pop() 

1705 
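# Hedged illustration (hypothetical dataset types): if two tasks declared
# the same not-yet-registered output with different storage classes, e.g.
#
#     DatasetType("deepCoadd", dimensions, "ExposureF", universe=universe)
#     DatasetType("deepCoadd", dimensions, "ImageF", universe=universe)
#
# both would land in new_outputs["deepCoadd"], len(dstypes) would be 2,
# and the ValueError above would be raised.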

1706 return dataset_types.values() 

1707 

1708 

1709# ------------------------ 

1710# Exported definitions -- 

1711# ------------------------ 

1712 

1713 

1714class GraphBuilderError(Exception): 

1715 """Base class for exceptions generated by graph builder.""" 

1716 

1717 pass 

1718 

1719 

1720class OutputExistsError(GraphBuilderError): 

1721 """Exception generated when output datasets already exist.""" 

1722 

1723 pass 

1724 

1725 

1726class PrerequisiteMissingError(GraphBuilderError): 

1727 """Exception generated when a prerequisite dataset does not exist.""" 

1728 

1729 pass 

1730 

1731 

1732class GraphBuilder: 

1733 """GraphBuilder class is responsible for building task execution graph from 

1734 a Pipeline. 

1735 

1736 Parameters 

1737 ---------- 

1738 registry : `~lsst.daf.butler.Registry` 

1739 Registry for the data repository; used for all data ID queries.

1740 skipExistingIn 

1741 Expressions representing the collections to search for existing 

1742 output datasets that should be skipped. See 

1743 :ref:`daf_butler_ordered_collection_searches`. 

1744 clobberOutputs : `bool`, optional 

1745 If `True` (default), allow quanta to be created even if partial outputs

1746 exist; this requires the same behavior to be enabled when

1747 executing. 

1748 datastore : `~lsst.daf.butler.Datastore`, optional 

1749 If not `None` then fill datastore records in each generated Quantum. 

1750 """ 

1751 

1752 def __init__( 

1753 self, 

1754 registry: Registry, 

1755 skipExistingIn: Any = None, 

1756 clobberOutputs: bool = True, 

1757 datastore: Datastore | None = None, 

1758 ): 

1759 self.registry = registry 

1760 self.dimensions = registry.dimensions 

1761 self.skipExistingIn = skipExistingIn 

1762 self.clobberOutputs = clobberOutputs 

1763 self.datastore = datastore 

1764 

1765 def makeGraph( 

1766 self, 

1767 pipeline: Pipeline | Iterable[TaskDef], 

1768 collections: Any, 

1769 run: str, 

1770 userQuery: str | None, 

1771 datasetQueryConstraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

1772 metadata: Mapping[str, Any] | None = None, 

1773 bind: Mapping[str, Any] | None = None, 

1774 dataId: DataCoordinate | None = None, 

1775 ) -> QuantumGraph: 

1776 """Create execution graph for a pipeline. 

1777 

1778 Parameters 

1779 ---------- 

1780 pipeline : `Pipeline` or `~collections.abc.Iterable` [ `TaskDef` ] 

1781 Pipeline definition, task names/classes and their configs. 

1782 collections 

1783 Expressions representing the collections to search for input 

1784 datasets. See :ref:`daf_butler_ordered_collection_searches`. 

1785 run : `str` 

1786 Name of the `~lsst.daf.butler.CollectionType.RUN` collection for 

1787 output datasets. The collection does not have to exist; it will be

1788 created when the graph is executed.

1789 userQuery : `str` 

1790 String that defines a user selection for the registry query; should be

1791 empty or `None` if there are no restrictions on data selection.

1792 datasetQueryConstraint : `DatasetQueryConstraintVariant`, optional 

1793 The query constraint variant that should be used to constrain the

1794 query based on dataset existence; defaults to

1795 `DatasetQueryConstraintVariant.ALL`. 

1796 metadata : `~collections.abc.Mapping` of `str` to primitives, optional

1797 This is an optional parameter of extra data to carry with the 

1798 graph. Entries in this mapping should be serializable to

1799 JSON. 

1800 bind : `~collections.abc.Mapping`, optional 

1801 Mapping containing literal values that should be injected into the 

1802 ``userQuery`` expression, keyed by the identifiers they replace. 

1803 dataId : `lsst.daf.butler.DataCoordinate`, optional 

1804 Data ID that should also be included in the query constraint. 

1805 

1806 Returns 

1807 ------- 

1808 graph : `QuantumGraph`
The generated quantum graph.

1809 

1810 Raises 

1811 ------ 

1812 UserExpressionError 

1813 Raised when the user expression cannot be parsed.

1814 OutputExistsError 

1815 Raised when output datasets already exist. 

1816 Exception 

1817 Other exception types may be raised by underlying registry

1818 classes. 

1819 """ 

1820 scaffolding = _PipelineScaffolding(pipeline, registry=self.registry) 

1821 if not collections and (scaffolding.initInputs or scaffolding.inputs or scaffolding.prerequisites): 

1822 raise ValueError("Pipeline requires input datasets but no input collections provided.") 

1823 if dataId is None: 

1824 dataId = DataCoordinate.makeEmpty(self.registry.dimensions) 

1825 if isinstance(pipeline, Pipeline): 

1826 dataId = pipeline.get_data_id(self.registry.dimensions).union(dataId) 

1827 with scaffolding.connectDataIds( 

1828 self.registry, collections, userQuery, dataId, datasetQueryConstraint, bind 

1829 ) as commonDataIds: 

1830 condition = datasetQueryConstraint == DatasetQueryConstraintVariant.ALL 

1831 scaffolding.resolveDatasetRefs( 

1832 self.registry, 

1833 collections, 

1834 run, 

1835 commonDataIds, 

1836 skipExistingIn=self.skipExistingIn, 

1837 clobberOutputs=self.clobberOutputs, 

1838 constrainedByAllDatasets=condition, 

1839 ) 

1840 return scaffolding.makeQuantumGraph( 

1841 registry=self.registry, metadata=metadata, datastore=self.datastore 

1842 )
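# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of this module; the repository path, pipeline
# file, collection names, and query values are illustrative assumptions):
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import Pipeline
#     from lsst.pipe.base.graphBuilder import GraphBuilder
#
#     butler = Butler("/repo/main", writeable=False)
#     pipeline = Pipeline.fromFile("pipeline.yaml")
#     builder = GraphBuilder(butler.registry, clobberOutputs=True, datastore=butler.datastore)
#     qgraph = builder.makeGraph(
#         pipeline,
#         collections=["HSC/defaults"],
#         run="u/jdoe/test-run",
#         userQuery="instrument = 'HSC' AND visit = 12345",
#     )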