Coverage for python/lsst/pipe/base/quantum_graph_builder.py: 25%

375 statements  

coverage.py v7.5.0, created at 2024-04-30 02:55 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The base class for the QuantumGraph-generation algorithm and various
helper classes.
"""

from __future__ import annotations

__all__ = (
    "QuantumGraphBuilder",
    "ExistingDatasets",
    "QuantumGraphBuilderError",
    "OutputExistsError",
    "PrerequisiteMissingError",
)

import dataclasses
from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping, Sequence
from typing import TYPE_CHECKING, Any, final

from deprecated.sphinx import deprecated
from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionUniverse,
    NamedKeyDict,
    NamedKeyMapping,
    Quantum,
)
from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
from lsst.utils.logging import LsstLogAdapter, getLogger
from lsst.utils.timer import timeMethod

from . import automatic_connection_constants as acc
from ._status import NoWorkFound
from ._task_metadata import TaskMetadata
from .connections import AdjustQuantumHelper
from .graph import QuantumGraph
from .pipeline_graph import PipelineGraph, TaskNode
from .prerequisite_helpers import PrerequisiteInfo, SkyPixBoundsBuilder, TimespanBuilder
from .quantum_graph_skeleton import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphSkeleton,
    QuantumKey,
    TaskInitKey,
)

if TYPE_CHECKING:
    from .pipeline import TaskDef


class QuantumGraphBuilderError(Exception):
    """Base class for exceptions generated by QuantumGraphBuilder."""

    pass


# TODO: remove class and switch downstream inheritance to just
# QuantumGraphBuilderError on DM-40443.
@deprecated(
    "Deprecated in favor of QuantumGraphBuilderError and will be removed after v27.",
    version="v27.0",
    category=FutureWarning,
)
class GraphBuilderError(QuantumGraphBuilderError):
    """Backwards-compatibility near-alias for QuantumGraphBuilderError."""

    pass


# Inherit from the backwards-compatibility alias for backwards compatibility.
class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist."""

    pass


# Inherit from the backwards-compatibility alias for backwards compatibility.
class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist."""

    pass


class InitInputMissingError(QuantumGraphBuilderError):
    """Exception generated when an init-input dataset does not exist."""

    pass


class QuantumGraphBuilder(ABC):
    """An abstract base class for building `QuantumGraph` objects from a
    pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph. Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository. Should be read-only.
    input_collections : `~collections.abc.Sequence` [ `str` ], optional
        Collections to search for overall-input datasets. If not provided,
        ``butler.collections`` is used (and must not be empty).
    output_run : `str`, optional
        Output `~lsst.daf.butler.CollectionType.RUN` collection. If not
        provided, ``butler.run`` is used (and must not be `None`).
    skip_existing_in : `~collections.abc.Sequence` [ `str` ], optional
        Collections to search for outputs that already exist for the purpose
        of skipping quanta that have already been run.
    clobber : `bool`, optional
        Whether execution will be permitted to clobber predicted outputs that
        already exist in ``output_run`` (not including outputs of quanta that
        would be skipped because they've already been run); if `False`, such
        pre-existing outputs raise `OutputExistsError`. This never actually
        clobbers outputs; it just informs the graph-generation algorithm
        whether execution will run with clobbering enabled. This is ignored
        if ``output_run`` does not exist.

    Notes
    -----
    Constructing a `QuantumGraphBuilder` will run queries for existing
    datasets with empty data IDs (including but not limited to init inputs
    and outputs), in addition to resolving the given pipeline graph and
    testing for existence of the ``output_run`` collection.

    The `build` method splits the pipeline graph into independent subgraphs,
    then calls the abstract method `process_subgraph` on each, to allow
    concrete implementations to populate the rough graph structure (the
    `~quantum_graph_skeleton.QuantumGraphSkeleton` class) and search for
    existing datasets (further populating the builder's `existing_datasets`
    struct). The `build` method then:

    - assembles `lsst.daf.butler.Quantum` instances from all data IDs in the
      skeleton;
    - looks for existing outputs found in ``skip_existing_in`` to see if any
      quanta should be skipped;
    - calls `PipelineTaskConnections.adjustQuantum` on all quanta, adjusting
      downstream quanta appropriately when preliminary predicted outputs are
      rejected (pruning nodes that will not have the inputs they need to
      run);
    - attaches datastore records and registry dataset types to the graph.

    In addition to implementing `process_subgraph`, derived classes are
    generally expected to add new construction keyword-only arguments to
    control the data IDs of the quantum graph, while forwarding all of the
    arguments defined in the base class to `super`.
    """


    def __init__(
        self,
        pipeline_graph: PipelineGraph,
        butler: Butler,
        *,
        input_collections: Sequence[str] | None = None,
        output_run: str | None = None,
        skip_existing_in: Sequence[str] = (),
        clobber: bool = False,
    ):
        self.log = getLogger(__name__)
        self.metadata = TaskMetadata()
        self._pipeline_graph = pipeline_graph
        self.butler = butler
        if input_collections is None:
            input_collections = butler.collections
        if not input_collections:
            raise ValueError("No input collections provided.")
        self.input_collections = input_collections
        if output_run is None:
            output_run = butler.run
        if not output_run:
            raise ValueError("No output RUN collection provided.")
        self.output_run = output_run
        self.skip_existing_in = skip_existing_in
        self.empty_data_id = DataCoordinate.make_empty(butler.dimensions)
        self.clobber = clobber
        # See whether the output run already exists.
        self.output_run_exists = False
        try:
            if self.butler.registry.getCollectionType(self.output_run) is not CollectionType.RUN:
                raise RuntimeError(f"{self.output_run!r} is not a RUN collection.")
            self.output_run_exists = True
        except MissingCollectionError:
            # If the run doesn't exist, we never need to clobber. This is not
            # an error, so users can pass clobber=True on the first attempt
            # at some processing as well as on all subsequent attempts,
            # instead of being forced to make the first attempt different.
            self.clobber = False
        # We need to know whether the skip_existing_in collection sequence
        # starts with the output run collection, as an optimization to avoid
        # queries later.
        try:
            skip_existing_in_flat = self.butler.registry.queryCollections(
                self.skip_existing_in, flattenChains=True
            )
        except MissingCollectionError:
            skip_existing_in_flat = []
        if not skip_existing_in_flat:
            self.skip_existing_in = []
        if self.skip_existing_in and self.output_run_exists:
            self.skip_existing_starts_with_output_run = self.output_run == skip_existing_in_flat[0]
        else:
            self.skip_existing_starts_with_output_run = False
        self.existing_datasets = ExistingDatasets()
        try:
            packages_storage_class = butler.get_dataset_type(acc.PACKAGES_INIT_OUTPUT_NAME).storageClass_name
        except MissingDatasetTypeError:
            packages_storage_class = acc.PACKAGES_INIT_OUTPUT_STORAGE_CLASS
        self._global_init_output_types = {
            acc.PACKAGES_INIT_OUTPUT_NAME: DatasetType(
                acc.PACKAGES_INIT_OUTPUT_NAME,
                self.universe.empty,
                packages_storage_class,
            )
        }
        with self.butler.registry.caching_context():
            self._pipeline_graph.resolve(self.butler.registry)
            self._find_empty_dimension_datasets()
        self.prerequisite_info = {
            task_node.label: PrerequisiteInfo(task_node, self._pipeline_graph)
            for task_node in pipeline_graph.tasks.values()
        }

    log: LsstLogAdapter
    """Logger to use for all quantum-graph generation messages.

    General and per-task status messages should be logged at `~logging.INFO`
    level or higher, per-dataset-type status messages should be logged at
    `~lsst.utils.logging.VERBOSE` or higher, and per-data-ID status messages
    should be logged at `~logging.DEBUG` or higher.
    """

    metadata: TaskMetadata
    """Metadata to store in the QuantumGraph.

    The `TaskMetadata` class is used here primarily to enable resource-usage
    collection with the `lsst.utils.timer.timeMethod` decorator.
    """

    butler: Butler
    """Client for the data repository.

    Should be read-only.
    """

    input_collections: Sequence[str]
    """Collections to search for overall-input datasets."""

    output_run: str
    """Output `~lsst.daf.butler.CollectionType.RUN` collection."""

    skip_existing_in: Sequence[str]
    """Collections to search for outputs that already exist for the purpose
    of skipping quanta that have already been run.
    """

    clobber: bool
    """Whether execution will be permitted to clobber predicted outputs that
    already exist in ``output_run``.

    If `False`, such pre-existing outputs raise `OutputExistsError` during
    graph generation. This never actually clobbers outputs; it just informs
    the graph-generation algorithm whether execution will run with clobbering
    enabled. This is always `False` if `output_run_exists` is `False`.
    """

    empty_data_id: DataCoordinate
    """An empty data ID in the data repository's dimension universe."""

    output_run_exists: bool
    """Whether the output run already exists in the data repository."""

    skip_existing_starts_with_output_run: bool
    """Whether the `skip_existing_in` sequence begins with `output_run`.

    If this is true, any dataset found in `output_run` can be used to
    short-circuit queries against `skip_existing_in`.
    """

    existing_datasets: ExistingDatasets
    """Struct holding datasets that have already been found in the data
    repository.

    This is updated in-place as the `QuantumGraph` generation algorithm
    proceeds.
    """

    prerequisite_info: Mapping[str, PrerequisiteInfo]
    """Helper objects for finding prerequisite inputs, organized by task
    label.

    Subclasses that find prerequisites should remove the covered
    `~prerequisite_helpers.PrerequisiteFinder` objects from this attribute.
    """

    @property
    def universe(self) -> DimensionUniverse:
        """Definitions of all data dimensions."""
        return self.butler.dimensions


    @final
    @timeMethod
    def build(
        self, metadata: Mapping[str, Any] | None = None, attach_datastore_records: bool = True
    ) -> QuantumGraph:
        """Build the quantum graph.

        Parameters
        ----------
        metadata : `~collections.abc.Mapping`, optional
            Flexible metadata to add to the quantum graph.
        attach_datastore_records : `bool`, optional
            Whether to include datastore records in the graph. Required for
            `lsst.daf.butler.QuantumBackedButler` execution.

        Returns
        -------
        quantum_graph : `QuantumGraph`
            DAG describing processing to be performed.

        Notes
        -----
        External code is expected to construct a `QuantumGraphBuilder` and
        then call this method exactly once. See class documentation for
        details on what it does.
        """
        with self.butler.registry.caching_context():
            full_skeleton = QuantumGraphSkeleton(self._pipeline_graph.tasks)
            subgraphs = list(self._pipeline_graph.split_independent())
            for i, subgraph in enumerate(subgraphs):
                self.log.info(
                    "Processing pipeline subgraph %d of %d with %d task(s).",
                    i + 1,
                    len(subgraphs),
                    len(subgraph.tasks),
                )
                self.log.verbose("Subgraph tasks: [%s]", ", ".join(label for label in subgraph.tasks))
                subgraph_skeleton = self.process_subgraph(subgraph)
                full_skeleton.update(subgraph_skeleton)
            # Loop over tasks. The pipeline graph must be topologically
            # sorted, so a quantum is only processed after any quantum that
            # provides its inputs has been processed.
            for task_node in self._pipeline_graph.tasks.values():
                self._resolve_task_quanta(task_node, full_skeleton)
            # Add global init-outputs to the skeleton.
            for dataset_type in self._global_init_output_types.values():
                dataset_key = full_skeleton.add_dataset_node(
                    dataset_type.name, self.empty_data_id, is_global_init_output=True
                )
                ref = self.existing_datasets.outputs_in_the_way.get(dataset_key)
                if ref is None:
                    ref = DatasetRef(dataset_type, self.empty_data_id, run=self.output_run)
                full_skeleton[dataset_key]["ref"] = ref
            # Remove dataset nodes with no edges that are not global init
            # outputs; these are generally overall-inputs whose original
            # quanta ended up skipped or with no work to do (we can't remove
            # them along with the quanta because no quantum knows if it's the
            # only consumer).
            full_skeleton.remove_orphan_datasets()
            if attach_datastore_records:
                self._attach_datastore_records(full_skeleton)
            # TODO: initialize most metadata here instead of in ctrl_mpexec.
            if metadata is None:
                metadata = {}
            return self._construct_quantum_graph(full_skeleton, metadata)
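
    # A minimal usage sketch (illustrative only; not part of this module):
    # external code constructs a concrete builder and calls `build` exactly
    # once. The concrete class, repo path, and collection names below are
    # assumptions; at time of writing the standard concrete builder lives in
    # `all_dimensions_quantum_graph_builder`.
    #
    #     from lsst.daf.butler import Butler
    #     from lsst.pipe.base.all_dimensions_quantum_graph_builder import (
    #         AllDimensionsQuantumGraphBuilder,
    #     )
    #
    #     butler = Butler("/repo/example")  # hypothetical repo path
    #     pipeline_graph = pipeline.to_graph()  # from an lsst.pipe.base.Pipeline
    #     builder = AllDimensionsQuantumGraphBuilder(
    #         pipeline_graph,
    #         butler,
    #         input_collections=["HSC/defaults"],  # hypothetical collection
    #         output_run="u/someone/example-run",  # hypothetical run
    #     )
    #     quantum_graph = builder.build(metadata={"comment": "example"})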

    @abstractmethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        """Build the rough structure for an independent subset of the
        `QuantumGraph` and query for relevant existing datasets.

        Parameters
        ----------
        subgraph : `.pipeline_graph.PipelineGraph`
            Subset of the pipeline graph that should be processed by this
            call. This is always resolved and topologically sorted. It
            should not be modified.

        Returns
        -------
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Class representing an initial quantum graph. See
            `quantum_graph_skeleton.QuantumGraphSkeleton` docs for details.
            After this is returned, the object may be modified in-place in
            unspecified ways.

        Notes
        -----
        In addition to returning a
        `quantum_graph_skeleton.QuantumGraphSkeleton`, this method should
        populate the `existing_datasets` structure by querying for all
        relevant datasets with non-empty data IDs (those with empty data IDs
        will already be present). In particular:

        - `~ExistingDatasets.inputs` must always be populated with all
          overall-input datasets (but not prerequisites), by querying
          `input_collections`;
        - `~ExistingDatasets.outputs_for_skip` must be populated with any
          intermediate or output datasets present in `skip_existing_in` (it
          can be ignored if `skip_existing_in` is empty);
        - `~ExistingDatasets.outputs_in_the_way` must be populated with any
          intermediate or output datasets present in `output_run`, if
          `output_run_exists` (it can be ignored if `output_run_exists` is
          `False`). Note that the presence of such datasets is not
          automatically an error, even if `clobber` is `False`, as these may
          belong to quanta that will be skipped.
        - `~ExistingDatasets.inputs` must be populated with all
          prerequisite-input datasets that were included in the skeleton, by
          querying `input_collections` (not all prerequisite inputs need to
          be included in the skeleton, but the base class can only use
          per-quantum queries to find them, and that can be slow when there
          are many quanta).

        Dataset types should never be components and should always use the
        "common" storage class definition in `pipeline_graph.DatasetTypeNode`
        (which is the data repository definition when the dataset type is
        registered).
        """
        raise NotImplementedError()
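
    # A minimal sketch of what an implementation might do (illustrative only;
    # `_query_data_ids` is a hypothetical helper, and real implementations
    # typically use one large data-ID query rather than per-task loops). The
    # skeleton calls mirror those used elsewhere in this module:
    #
    #     def process_subgraph(self, subgraph):
    #         skeleton = QuantumGraphSkeleton(subgraph.tasks)
    #         for task_node in subgraph.tasks.values():
    #             for data_id in self._query_data_ids(task_node):  # hypothetical
    #                 quantum_key = skeleton.add_quantum_node(task_node.label, data_id)
    #                 for edge in task_node.iter_all_inputs():
    #                     dataset_key = skeleton.add_dataset_node(
    #                         edge.parent_dataset_type_name, data_id  # data ID subsetting elided
    #                     )
    #                     skeleton.add_input_edge(quantum_key, dataset_key)
    #         # ... also populate self.existing_datasets.inputs (and
    #         # outputs_for_skip / outputs_in_the_way) with refs found in the
    #         # butler; see the ExistingDatasets sketch at the end of this
    #         # module.
    #         return skeleton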

    @final
    @timeMethod
    def _resolve_task_quanta(self, task_node: TaskNode, skeleton: QuantumGraphSkeleton) -> None:
        """Process the quanta for one task in a skeleton graph to skip those
        that have already completed and adjust those that request it.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.

        Notes
        -----
        This method modifies ``skeleton`` in-place in several ways:

        - It adds a "ref" attribute to dataset nodes, using the contents of
          `existing_datasets`. This ensures producing and consuming tasks
          start from the same `DatasetRef`.
        - It adds "inputs", "outputs", and "init_inputs" attributes to the
          quantum nodes, holding the same `~lsst.daf.butler.NamedKeyDict`
          objects needed to construct actual `Quantum` instances.
        - It removes quantum nodes that are to be skipped because their
          outputs already exist in `skip_existing_in`. It also removes their
          outputs from `ExistingDatasets.outputs_in_the_way`.
        - It adds prerequisite dataset nodes and edges that connect them to
          the quanta that consume them.
        - It removes quantum nodes whose
          `~PipelineTaskConnections.adjustQuantum` calls raise `NoWorkFound`
          or predict no outputs.
        - It removes the nodes of output datasets that are "adjusted away".
        - It removes the edges of input datasets that are "adjusted away".

        The difference between how adjusted inputs and outputs are handled
        reflects the fact that many quanta can share the same input, but only
        one produces each output. This can leave the graph with superfluous
        isolated nodes after processing is complete, but these should only be
        removed after all the quanta from all tasks have been processed.
        """
        # Extract the helper object for the prerequisite inputs of this task,
        # and tell it to prepare to construct skypix bounds and timespans for
        # each quantum (these will automatically do nothing if nothing needs
        # those bounds).
        task_prerequisite_info = self.prerequisite_info[task_node.label]
        task_prerequisite_info.update_bounds()
        # Loop over all quanta for this task, remembering the ones we've
        # gotten rid of.
        skipped_quanta = []
        no_work_quanta = []
        for quantum_key in skeleton.get_quanta(task_node.label):
            if self._skip_quantum_if_metadata_exists(task_node, quantum_key, skeleton):
                skipped_quanta.append(quantum_key)
                continue
            quantum_data_id = skeleton[quantum_key]["data_id"]
            skypix_bounds_builder = task_prerequisite_info.bounds.make_skypix_bounds_builder(quantum_data_id)
            timespan_builder = task_prerequisite_info.bounds.make_timespan_builder(quantum_data_id)
            adjusted_outputs = self._gather_quantum_outputs(
                task_node, quantum_key, skeleton, skypix_bounds_builder, timespan_builder
            )
            adjusted_inputs = self._gather_quantum_inputs(
                task_node,
                quantum_key,
                skeleton,
                task_prerequisite_info,
                skypix_bounds_builder,
                timespan_builder,
            )
            # Give the task's Connections class an opportunity to remove some
            # inputs, or complain if they are unacceptable. This will raise
            # if one of the check conditions is not met, which is the
            # intended behavior.
            helper = AdjustQuantumHelper(inputs=adjusted_inputs, outputs=adjusted_outputs)
            try:
                helper.adjust_in_place(task_node.get_connections(), task_node.label, quantum_data_id)
            except NoWorkFound as err:
                # Do not generate this quantum; it would not produce any
                # outputs. Remove it and all of the outputs it might have
                # produced from the skeleton.
                try:
                    _, connection_name, _ = err.args
                    details = f"not enough datasets for connection {connection_name}."
                except ValueError:
                    details = str(err)
                self.log.debug(
                    "No work found for quantum %s of task %s: %s",
                    quantum_key.data_id_values,
                    quantum_key.task_label,
                    details,
                )
                no_work_quanta.append(quantum_key)
                continue
            if helper.outputs_adjusted:
                if not any(adjusted_refs for adjusted_refs in helper.outputs.values()):
                    # No outputs also means we don't generate this quantum.
                    self.log.debug(
                        "No outputs predicted for quantum %s of task %s.",
                        quantum_key.data_id_values,
                        quantum_key.task_label,
                    )
                    no_work_quanta.append(quantum_key)
                    continue
                # Remove output nodes that were not retained by
                # adjustQuantum.
                skeleton.remove_dataset_nodes(
                    self._find_removed(skeleton.iter_outputs_of(quantum_key), helper.outputs)
                )
            if helper.inputs_adjusted:
                if not any(bool(adjusted_refs) for adjusted_refs in helper.inputs.values()):
                    raise QuantumGraphBuilderError(
                        f"adjustQuantum implementation for {task_node.label}@{quantum_key.data_id_values} "
                        "returned outputs but no inputs."
                    )
                # Remove input dataset edges that were not retained by
                # adjustQuantum. We can't remove the input dataset nodes
                # because some other quantum might still want them.
                skeleton.remove_input_edges(
                    quantum_key, self._find_removed(skeleton.iter_inputs_of(quantum_key), helper.inputs)
                )
            # Save the adjusted inputs and outputs to the quantum node's
            # state so we don't have to regenerate those data structures
            # from the graph.
            skeleton[quantum_key]["inputs"] = helper.inputs
            skeleton[quantum_key]["outputs"] = helper.outputs
        for no_work_quantum in no_work_quanta:
            skeleton.remove_quantum_node(no_work_quantum, remove_outputs=True)
        for skipped_quantum in skipped_quanta:
            skeleton.remove_quantum_node(skipped_quantum, remove_outputs=False)
        remaining_quanta = skeleton.get_quanta(task_node.label)
        self._resolve_task_init(task_node, skeleton, bool(skipped_quanta))
        message_terms = []
        if no_work_quanta:
            message_terms.append(f"{len(no_work_quanta)} had no work to do")
        if skipped_quanta:
            message_terms.append(f"{len(skipped_quanta)} previously succeeded")
        message_parenthetical = f" ({', '.join(message_terms)})" if message_terms else ""
        if remaining_quanta:
            self.log.info(
                "Generated %s for task %s%s.",
                _quantum_or_quanta(len(remaining_quanta)),
                task_node.label,
                message_parenthetical,
            )
        else:
            self.log.info(
                "Dropping task %s because no quanta remain%s.", task_node.label, message_parenthetical
            )
            skeleton.remove_task(task_node.label)
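
    # Illustrative shape of a quantum node's state after this method runs
    # (these are the attributes `_construct_quantum_graph` reads later):
    #
    #     skeleton[quantum_key]["inputs"]       # NamedKeyDict[DatasetType, list[DatasetRef]]
    #     skeleton[quantum_key]["outputs"]      # NamedKeyDict[DatasetType, list[DatasetRef]]
    #     skeleton[quantum_key]["init_inputs"]  # added by _resolve_task_init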

    def _skip_quantum_if_metadata_exists(
        self, task_node: TaskNode, quantum_key: QuantumKey, skeleton: QuantumGraphSkeleton
    ) -> bool:
        """Identify and drop quanta that should be skipped because their
        metadata datasets already exist.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        quantum_key : `QuantumKey`
            Identifier for this quantum in the graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.

        Returns
        -------
        skipped : `bool`
            `True` if the quantum is being skipped and has been removed from
            the graph, `False` otherwise.

        Notes
        -----
        If the metadata dataset for this quantum exists in
        `ExistingDatasets.outputs_for_skip`, the quantum will be skipped.
        This causes the quantum node to be removed from the graph. Dataset
        nodes that were previously the outputs of this quantum will have
        their "ref" attribute set from `ExistingDatasets.outputs_for_skip`,
        or will be removed if there is no such dataset there. Any output
        dataset in `ExistingDatasets.outputs_in_the_way` will be removed.
        """
        metadata_dataset_key = DatasetKey(
            task_node.metadata_output.parent_dataset_type_name, quantum_key.data_id_values
        )
        if metadata_dataset_key in self.existing_datasets.outputs_for_skip:
            # This quantum's metadata is already present in the
            # skip_existing_in collections; we'll skip it. But the presence
            # of the metadata dataset doesn't guarantee that all of the other
            # outputs we predicted are present; we have to check.
            for output_dataset_key in list(skeleton.iter_outputs_of(quantum_key)):
                if (
                    output_ref := self.existing_datasets.outputs_for_skip.get(output_dataset_key)
                ) is not None:
                    # Populate the skeleton graph's node attributes with the
                    # existing DatasetRef, just like a predicted output of a
                    # non-skipped quantum.
                    skeleton[output_dataset_key]["ref"] = output_ref
                else:
                    # Remove this dataset from the skeleton graph, because
                    # the quantum that would have produced it is being
                    # skipped and it doesn't already exist.
                    skeleton.remove_dataset_nodes([output_dataset_key])
                # If this dataset was "in the way" (i.e. already in the
                # output run), it isn't anymore.
                self.existing_datasets.outputs_in_the_way.pop(output_dataset_key, None)
            # Removing the quantum node from the graph will happen outside
            # this function.
            return True
        return False

    @final
    def _gather_quantum_outputs(
        self,
        task_node: TaskNode,
        quantum_key: QuantumKey,
        skeleton: QuantumGraphSkeleton,
        skypix_bounds_builder: SkyPixBoundsBuilder,
        timespan_builder: TimespanBuilder,
    ) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        """Collect or generate the output datasets for a preliminary quantum
        and put them in the form used by `~lsst.daf.butler.Quantum` and
        `~PipelineTaskConnections.adjustQuantum`.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        quantum_key : `QuantumKey`
            Identifier for this quantum in the graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.
        skypix_bounds_builder : `~prerequisite_helpers.SkyPixBoundsBuilder`
            An object that accumulates the appropriate spatial bounds for a
            quantum.
        timespan_builder : `~prerequisite_helpers.TimespanBuilder`
            An object that accumulates the appropriate timespan for a
            quantum.

        Returns
        -------
        outputs : `~lsst.daf.butler.NamedKeyDict` [ \
                `~lsst.daf.butler.DatasetType`, `list` [ \
                `~lsst.daf.butler.DatasetRef` ] ]
            All outputs of the task, using the storage class and components
            defined by the task's own connections.

        Notes
        -----
        This first looks for outputs already present in the `output_run` by
        looking in `ExistingDatasets.outputs_in_the_way`; if it finds
        something and `clobber` is `True`, it uses that ref (it's not ideal
        that both the original dataset and its replacement will have the same
        UUID, but we don't have space in the quantum graph for two UUIDs, and
        we need the datastore records of the original there). If `clobber`
        is `False`, `OutputExistsError` is raised. If there is no output
        already present, a new one with a random UUID is generated. In all
        cases the "ref" attribute of the dataset node in the skeleton is set.
        """
        outputs_by_type: dict[str, list[DatasetRef]] = {}
        dataset_key: DatasetKey
        for dataset_key in skeleton.iter_outputs_of(quantum_key):
            dataset_data_id = skeleton[dataset_key]["data_id"]
            dataset_type_node = self._pipeline_graph.dataset_types[dataset_key.parent_dataset_type_name]
            if (ref := self.existing_datasets.outputs_in_the_way.get(dataset_key)) is None:
                ref = DatasetRef(dataset_type_node.dataset_type, dataset_data_id, run=self.output_run)
            elif not self.clobber:
                # We intentionally raise here, before running adjustQuantum,
                # because it'd be weird if we left an old potential output of
                # a task sitting there in the output collection, just because
                # the task happened to not actually produce it.
                raise OutputExistsError(
                    f"Potential output dataset {ref} already exists in the output run "
                    f"{self.output_run}, but clobbering outputs was not expected to be necessary."
                )
            skypix_bounds_builder.handle_dataset(dataset_key.parent_dataset_type_name, dataset_data_id)
            timespan_builder.handle_dataset(dataset_key.parent_dataset_type_name, dataset_data_id)
            skeleton[dataset_key]["ref"] = ref
            outputs_by_type.setdefault(dataset_key.parent_dataset_type_name, []).append(ref)
        adapted_outputs: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
        for write_edge in task_node.iter_all_outputs():
            dataset_type_node = self._pipeline_graph.dataset_types[write_edge.parent_dataset_type_name]
            edge_dataset_type = write_edge.adapt_dataset_type(dataset_type_node.dataset_type)
            adapted_outputs[edge_dataset_type] = [
                write_edge.adapt_dataset_ref(ref)
                for ref in sorted(outputs_by_type.get(write_edge.parent_dataset_type_name, []))
            ]
        return adapted_outputs

    @final
    def _gather_quantum_inputs(
        self,
        task_node: TaskNode,
        quantum_key: QuantumKey,
        skeleton: QuantumGraphSkeleton,
        task_prerequisite_info: PrerequisiteInfo,
        skypix_bounds_builder: SkyPixBoundsBuilder,
        timespan_builder: TimespanBuilder,
    ) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        """Collect input datasets for a preliminary quantum and put them in
        the form used by `~lsst.daf.butler.Quantum` and
        `~PipelineTaskConnections.adjustQuantum`.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Node for this task in the pipeline graph.
        quantum_key : `QuantumKey`
            Identifier for this quantum in the graph.
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph, to be modified in-place.
        task_prerequisite_info : `~prerequisite_helpers.PrerequisiteInfo`
            Helper object for finding this task's prerequisite inputs.
        skypix_bounds_builder : `~prerequisite_helpers.SkyPixBoundsBuilder`
            An object that accumulates the appropriate spatial bounds for a
            quantum.
        timespan_builder : `~prerequisite_helpers.TimespanBuilder`
            An object that accumulates the appropriate timespan for a
            quantum.

        Returns
        -------
        inputs : `~lsst.daf.butler.NamedKeyDict` [ \
                `~lsst.daf.butler.DatasetType`, `list` [ \
                `~lsst.daf.butler.DatasetRef` ] ]
            All regular and prerequisite inputs to the task, using the
            storage class and components defined by the task's own
            connections.

        Notes
        -----
        On return, the dataset nodes that represent inputs to this quantum
        will either have their "ref" attribute set (using the common dataset
        type, not the task-specific one) or will have been removed from the
        graph.

        For regular inputs, usually an existing "ref" (corresponding to an
        output of another quantum) will be found and left unchanged. When
        there is no existing "ref" attribute, `ExistingDatasets.inputs` is
        searched next; if there is nothing there, the input will be removed.

        Prerequisite inputs are always queried for directly here (delegating
        to the finders in ``task_prerequisite_info``). They are never
        produced by other tasks, and cannot in general be queried for in
        advance when `ExistingDatasets.inputs` is populated.
        """
        quantum_data_id = skeleton[quantum_key]["data_id"]
        inputs_by_type: dict[str, set[DatasetRef]] = {}
        dataset_key: DatasetKey | PrerequisiteDatasetKey
        # Process inputs already present in the skeleton - this should
        # include all regular inputs (including intermediates) and may
        # include some prerequisites.
        for dataset_key in list(skeleton.iter_inputs_of(quantum_key)):
            if (ref := skeleton[dataset_key].get("ref")) is None:
                # This dataset is an overall input - if it were an
                # intermediate, we would have already either removed the node
                # or set the "ref" attribute when processing its producing
                # quantum - and this is the first time we're trying to
                # resolve it.
                if (ref := self.existing_datasets.inputs.get(dataset_key)) is None:
                    # It also doesn't exist in the input collections, so we
                    # remove its node in the skeleton graph (so other
                    # consumers won't have to check for it).
                    skeleton.remove_dataset_nodes([dataset_key])
                    continue
                skeleton[dataset_key]["ref"] = ref
            inputs_by_type.setdefault(dataset_key.parent_dataset_type_name, set()).add(ref)
            skypix_bounds_builder.handle_dataset(dataset_key.parent_dataset_type_name, ref.dataId)
            timespan_builder.handle_dataset(dataset_key.parent_dataset_type_name, ref.dataId)
        # Query for any prerequisites not handled by process_subgraph. Note
        # that these were not already in the skeleton graph, so we add them
        # now.
        skypix_bounds = skypix_bounds_builder.finish()
        timespan = timespan_builder.finish()
        for finder in task_prerequisite_info.finders.values():
            inputs_for_type = inputs_by_type.setdefault(finder.dataset_type_node.name, set())
            dataset_keys = []
            for ref in finder.find(
                self.butler, self.input_collections, quantum_data_id, skypix_bounds, timespan
            ):
                dataset_key = skeleton.add_prerequisite_node(ref.datasetType.name, ref=ref)
                dataset_keys.append(dataset_key)
                inputs_for_type.add(ref)
            skeleton.add_input_edges(quantum_key, dataset_keys)
        adapted_inputs: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
        for read_edge in task_node.iter_all_inputs():
            dataset_type_node = self._pipeline_graph.dataset_types[read_edge.parent_dataset_type_name]
            edge_dataset_type = read_edge.adapt_dataset_type(dataset_type_node.dataset_type)
            if (current_dataset_type := adapted_inputs.keys().get(edge_dataset_type.name)) is None:
                adapted_inputs[edge_dataset_type] = [
                    read_edge.adapt_dataset_ref(ref)
                    for ref in sorted(inputs_by_type.get(read_edge.parent_dataset_type_name, frozenset()))
                ]
            elif current_dataset_type != edge_dataset_type:
                raise NotImplementedError(
                    f"Task {task_node.label!r} has {edge_dataset_type.name!r} as an input via "
                    "two different connections, with two different storage class overrides. "
                    "This is not yet supported due to limitations in the Quantum data structure."
                )
            # If neither the `if` nor the `elif` above matched, multiple
            # input connections have exactly the same dataset type, and there
            # is nothing to do after the first one.
        return adapted_inputs
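
    # For example (illustrative storage classes): if the repository defines a
    # dataset type with storage class "ExposureF" but a connection declares a
    # different storage class or a component, `read_edge.adapt_dataset_type`
    # yields the task-facing DatasetType and `adapt_dataset_ref` re-wraps
    # each ref to match, so the returned mapping reflects the task's own
    # connection declarations rather than the repository definitions.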

    @final
    def _resolve_task_init(
        self, task_node: TaskNode, skeleton: QuantumGraphSkeleton, has_skipped_quanta: bool
    ) -> None:
        """Add init-input and init-output dataset nodes and edges for a task
        to the skeleton.

        Parameters
        ----------
        task_node : `pipeline_graph.TaskNode`
            Pipeline graph description of the task.
        skeleton : `QuantumGraphSkeleton`
            In-progress quantum graph data structure to update in-place.
        has_skipped_quanta : `bool`
            Whether any of this task's quanta were skipped because they had
            already succeeded.
        """
        quanta = skeleton.get_quanta(task_node.label)
        task_init_key = TaskInitKey(task_node.label)
        if quanta:
            adapted_inputs: NamedKeyDict[DatasetType, DatasetRef] = NamedKeyDict()
            # Process init-inputs.
            input_keys: list[DatasetKey] = []
            for read_edge in task_node.init.iter_all_inputs():
                dataset_key = skeleton.add_dataset_node(
                    read_edge.parent_dataset_type_name, self.empty_data_id
                )
                skeleton.add_input_edge(task_init_key, dataset_key)
                if (ref := skeleton[dataset_key].get("ref")) is None:
                    try:
                        ref = self.existing_datasets.inputs[dataset_key]
                    except KeyError:
                        raise InitInputMissingError(
                            f"Overall init-input dataset {read_edge.parent_dataset_type_name!r} "
                            f"needed by task {task_node.label!r} not found in input collection(s) "
                            f"{self.input_collections}."
                        ) from None
                    skeleton[dataset_key]["ref"] = ref
                # Connect the init input to each of this task's quanta as
                # well.
                for quantum_key in skeleton.get_quanta(task_node.label):
                    skeleton.add_input_edge(quantum_key, dataset_key)
                input_keys.append(dataset_key)
                adapted_ref = read_edge.adapt_dataset_ref(ref)
                adapted_inputs[adapted_ref.datasetType] = adapted_ref
            # Save the connection-adapted init inputs to each quantum node
            # (the skeleton edges connecting them were added above).
            for quantum_key in skeleton.get_quanta(task_node.label):
                skeleton[quantum_key]["init_inputs"] = adapted_inputs
            # Process init-outputs.
            adapted_outputs: NamedKeyDict[DatasetType, DatasetRef] = NamedKeyDict()
            for write_edge in task_node.init.iter_all_outputs():
                dataset_key = skeleton.add_dataset_node(
                    write_edge.parent_dataset_type_name, self.empty_data_id
                )
                if (ref := self.existing_datasets.outputs_in_the_way.get(dataset_key)) is None:
                    ref = DatasetRef(
                        self._pipeline_graph.dataset_types[write_edge.parent_dataset_type_name].dataset_type,
                        self.empty_data_id,
                        run=self.output_run,
                    )
                skeleton[dataset_key]["ref"] = ref
                skeleton.add_output_edge(task_init_key, dataset_key)
                adapted_ref = write_edge.adapt_dataset_ref(ref)
                adapted_outputs[adapted_ref.datasetType] = adapted_ref
            skeleton[task_init_key]["inputs"] = adapted_inputs
            skeleton[task_init_key]["outputs"] = adapted_outputs
        elif has_skipped_quanta:
            # No quanta remain for this task, but at least one quantum was
            # skipped because its outputs were present in the
            # skip_existing_in collections. This means all init outputs
            # should be present in the skip_existing_in collections, too, and
            # we need to put those refs in the graph.
            for write_edge in task_node.init.iter_all_outputs():
                dataset_key = skeleton.add_dataset_node(
                    write_edge.parent_dataset_type_name, self.empty_data_id
                )
                if (ref := self.existing_datasets.outputs_for_skip.get(dataset_key)) is None:
                    raise InitInputMissingError(
                        f"Init-output dataset {write_edge.parent_dataset_type_name!r} of skipped task "
                        f"{task_node.label!r} not found in skip-existing-in collection(s) "
                        f"{self.skip_existing_in}."
                    )
                skeleton[dataset_key]["ref"] = ref
                # If this dataset was "in the way" (i.e. already in the
                # output run), it isn't anymore.
                self.existing_datasets.outputs_in_the_way.pop(dataset_key, None)
        # If no quanta remain and none were skipped, they all got pruned
        # because of NoWorkFound conditions. This dooms all downstream quanta
        # to the same fate, so we don't bother doing anything with the task's
        # init-outputs, since nothing is going to consume them.

    @final
    @timeMethod
    def _find_empty_dimension_datasets(self) -> None:
        """Query for all dataset types with no dimensions, updating
        `existing_datasets` in-place.

        This includes but is not limited to init inputs and init outputs.
        """
        _, dataset_type_nodes = self._pipeline_graph.group_by_dimensions()[self.universe.empty.as_group()]
        dataset_types = [node.dataset_type for node in dataset_type_nodes.values()]
        dataset_types.extend(self._global_init_output_types.values())
        for dataset_type in dataset_types:
            key = DatasetKey(dataset_type.name, self.empty_data_id.required_values)
            if (
                self._pipeline_graph.producer_of(dataset_type.name) is None
                and dataset_type.name not in self._global_init_output_types
            ):
                # Dataset type is an overall input; we always need to try to
                # find these.
                try:
                    ref = self.butler.find_dataset(dataset_type.name, collections=self.input_collections)
                except MissingDatasetTypeError:
                    ref = None
                if ref is not None:
                    self.existing_datasets.inputs[key] = ref
            elif self.skip_existing_in:
                # Dataset type is an intermediate or output; we only need to
                # find these if they're from previously executed quanta that
                # we might skip...
                try:
                    ref = self.butler.find_dataset(dataset_type.name, collections=self.skip_existing_in)
                except MissingDatasetTypeError:
                    ref = None
                if ref is not None:
                    self.existing_datasets.outputs_for_skip[key] = ref
                    if ref.run == self.output_run:
                        self.existing_datasets.outputs_in_the_way[key] = ref
            if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                # ...or if they're in the way and would need to be clobbered
                # (and we haven't already found them in the previous block).
                try:
                    ref = self.butler.find_dataset(dataset_type.name, collections=[self.output_run])
                except MissingDatasetTypeError:
                    ref = None
                if ref is not None:
                    self.existing_datasets.outputs_in_the_way[key] = ref

    @final
    @timeMethod
    def _attach_datastore_records(self, skeleton: QuantumGraphSkeleton) -> None:
        """Add datastore records for all overall inputs to a preliminary
        quantum graph.

        Parameters
        ----------
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph to update in place.

        Notes
        -----
        On return, all quantum nodes in the skeleton graph will have a
        "datastore_records" attribute that is a mapping from datastore name
        to `lsst.daf.butler.DatastoreRecordData`, as used by
        `lsst.daf.butler.Quantum`.
        """
        overall_inputs = skeleton.extract_overall_inputs()
        exported_records = self.butler._datastore.export_records(overall_inputs.values())
        for quantum_key in skeleton.iter_all_quanta():
            quantum_records = {}
            input_ids = {
                ref.id
                for dataset_key in skeleton.iter_inputs_of(quantum_key)
                if (ref := overall_inputs.get(dataset_key)) is not None
            }
            if input_ids:
                for datastore_name, records in exported_records.items():
                    matching_records = records.subset(input_ids)
                    if matching_records is not None:
                        quantum_records[datastore_name] = matching_records
            skeleton[quantum_key]["datastore_records"] = quantum_records
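
    # Illustrative shape of the attribute added above (the datastore name is
    # repository-dependent; "FileDatastore@<butlerRoot>" is just a typical
    # example):
    #
    #     skeleton[quantum_key]["datastore_records"] == {
    #         "FileDatastore@<butlerRoot>": <DatastoreRecordData for this
    #                                        quantum's overall inputs>,
    #     }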

    @final
    @timeMethod
    def _construct_quantum_graph(
        self, skeleton: QuantumGraphSkeleton, metadata: Mapping[str, Any]
    ) -> QuantumGraph:
        """Construct a `QuantumGraph` object from the contents of a
        fully-processed `quantum_graph_skeleton.QuantumGraphSkeleton`.

        Parameters
        ----------
        skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton`
            Preliminary quantum graph. Must have "init_inputs", "inputs",
            and "outputs" attributes on all quantum nodes, as added by
            `_resolve_task_quanta`, as well as a "datastore_records"
            attribute as added by `_attach_datastore_records`.
        metadata : `~collections.abc.Mapping`
            Flexible metadata to add to the graph.

        Returns
        -------
        quantum_graph : `QuantumGraph`
            DAG describing processing to be performed.
        """
        quanta: dict[TaskDef, set[Quantum]] = {}
        init_inputs: dict[TaskDef, Iterable[DatasetRef]] = {}
        init_outputs: dict[TaskDef, Iterable[DatasetRef]] = {}
        for task_def in self._pipeline_graph._iter_task_defs():
            if not skeleton.has_task(task_def.label):
                continue
            task_node = self._pipeline_graph.tasks[task_def.label]
            task_init_key = skeleton.get_task_init_node(task_def.label)
            init_inputs[task_def] = skeleton[task_init_key]["inputs"].values()
            init_outputs[task_def] = skeleton[task_init_key]["outputs"].values()
            quanta_for_task: set[Quantum] = set()
            for quantum_key in skeleton.get_quanta(task_node.label):
                node_state = skeleton[quantum_key]
                quanta_for_task.add(
                    Quantum(
                        taskName=task_node.task_class_name,
                        taskClass=task_node.task_class,
                        dataId=node_state["data_id"],
                        initInputs=node_state["init_inputs"],
                        inputs=node_state["inputs"],
                        outputs=node_state["outputs"],
                        datastore_records=node_state.get("datastore_records"),
                    )
                )
            quanta[task_def] = quanta_for_task

        registry_dataset_types: list[DatasetType] = [
            node.dataset_type for node in self._pipeline_graph.dataset_types.values()
        ]

        all_metadata = self.metadata.to_dict()
        all_metadata.update(metadata)
        return QuantumGraph(
            quanta,
            metadata=all_metadata,
            universe=self.universe,
            initInputs=init_inputs,
            initOutputs=init_outputs,
            globalInitOutputs=[skeleton[key]["ref"] for key in skeleton.global_init_outputs],
            registryDatasetTypes=registry_dataset_types,
        )

    @staticmethod
    @final
    def _find_removed(
        original: Iterable[DatasetKey | PrerequisiteDatasetKey],
        adjusted: NamedKeyMapping[DatasetType, Sequence[DatasetRef]],
    ) -> set[DatasetKey | PrerequisiteDatasetKey]:
        """Identify skeleton-graph dataset nodes that have been removed by
        `~PipelineTaskConnections.adjustQuantum`.

        Parameters
        ----------
        original : `~collections.abc.Iterable` [ `DatasetKey` or \
                `PrerequisiteDatasetKey` ]
            Identifiers for the dataset nodes that were the original
            neighbors (inputs or outputs) of a quantum.
        adjusted : `~lsst.daf.butler.NamedKeyMapping` [ \
                `~lsst.daf.butler.DatasetType`, \
                `~collections.abc.Sequence` [ `lsst.daf.butler.DatasetRef` ] ]
            Adjusted neighbors, in the form used by
            `lsst.daf.butler.Quantum`.

        Returns
        -------
        removed : `set` [ `DatasetKey` or `PrerequisiteDatasetKey` ]
            Datasets in ``original`` that have no counterpart in
            ``adjusted``.
        """
        result = set(original)
        for dataset_type, kept_refs in adjusted.items():
            parent_dataset_type_name, _ = DatasetType.splitDatasetTypeName(dataset_type.name)
            for kept_ref in kept_refs:
                # We don't know if this was a DatasetKey or a
                # PrerequisiteDatasetKey; just try both.
                result.discard(DatasetKey(parent_dataset_type_name, kept_ref.dataId.required_values))
                result.discard(PrerequisiteDatasetKey(parent_dataset_type_name, kept_ref.id.bytes))
        return result
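
    # Illustrative sketch: if adjustQuantum kept only the first of two
    # original "calexp" inputs (names and refs hypothetical), e.g.
    #
    #     original = [DatasetKey("calexp", (1,)), DatasetKey("calexp", (2,))]
    #     adjusted = {calexp_type: [ref_with_data_id_1]}
    #
    # then _find_removed(original, adjusted) returns
    # {DatasetKey("calexp", (2,))}: both key forms are discarded for each
    # kept ref, and discarding a key that was never in the set is a harmless
    # no-op.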

@dataclasses.dataclass(eq=False, order=False)
class ExistingDatasets:
    """Struct that holds the results of dataset queries for
    `QuantumGraphBuilder`.
    """

    inputs: dict[DatasetKey | PrerequisiteDatasetKey, DatasetRef] = dataclasses.field(default_factory=dict)
    """Overall-input datasets found in
    `QuantumGraphBuilder.input_collections`.

    This may include prerequisite inputs. It does include init-inputs.
    It does not include intermediates.
    """

    outputs_for_skip: dict[DatasetKey, DatasetRef] = dataclasses.field(default_factory=dict)
    """Output datasets found in `QuantumGraphBuilder.skip_existing_in`.

    It is unspecified whether this includes init-outputs; there is no concept
    of skipping at the init stage, so it is not expected to matter.
    """

    outputs_in_the_way: dict[DatasetKey, DatasetRef] = dataclasses.field(default_factory=dict)
    """Output datasets found in `QuantumGraphBuilder.output_run`.

    This includes regular outputs and init-outputs.
    """

def _quantum_or_quanta(n: int) -> str:
    """Correctly pluralize 'quantum' if needed."""
    return f"{n} quanta" if n != 1 else "1 quantum"
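
# For example (illustrative): _quantum_or_quanta(1) returns "1 quantum",
# while _quantum_or_quanta(0) and _quantum_or_quanta(3) return "0 quanta"
# and "3 quanta".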