Coverage for python/lsst/pipe/base/quantum_graph_builder.py: 25%

370 statements  

coverage.py v7.3.1, created at 2023-09-19 10:39 +0000

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The base class for the QuantumGraph-generation algorithm and various
helper classes.
"""

from __future__ import annotations

__all__ = (
    "QuantumGraphBuilder",
    "ExistingDatasets",
    "QuantumGraphBuilderError",
    "OutputExistsError",
    "PrerequisiteMissingError",
)

import dataclasses
from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping, Sequence
from typing import TYPE_CHECKING, Any, final

from lsst.daf.butler import (
    Butler,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionUniverse,
    Quantum,
)
from lsst.daf.butler.core.named import NamedKeyDict, NamedKeyMapping
from lsst.daf.butler.registry import MissingCollectionError, MissingDatasetTypeError
from lsst.utils.logging import LsstLogAdapter, getLogger
from lsst.utils.timer import timeMethod

from . import automatic_connection_constants as acc
from ._status import NoWorkFound
from ._task_metadata import TaskMetadata
from .connections import AdjustQuantumHelper
from .graph import QuantumGraph
from .pipeline_graph import PipelineGraph, TaskNode
from .prerequisite_helpers import PrerequisiteInfo, SkyPixBoundsBuilder, TimespanBuilder
from .quantum_graph_skeleton import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphSkeleton,
    QuantumKey,
    TaskInitKey,
)

if TYPE_CHECKING:
    from .pipeline import TaskDef


class QuantumGraphBuilderError(Exception):
    """Base class for exceptions generated by QuantumGraphBuilder."""

    pass


class GraphBuilderError(QuantumGraphBuilderError):
    """Backwards-compatibility near-alias for QuantumGraphBuilderError."""

    pass


# Inherit from the backwards-compatibility alias, for backwards-compatibility.
class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist."""

    pass


# Inherit from the backwards-compatibility alias, for backwards-compatibility.
class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist."""

    pass


class InitInputMissingError(QuantumGraphBuilderError):
    """Exception generated when an init-input dataset does not exist."""

    pass


class QuantumGraphBuilder(ABC):
    """An abstract base class for building `QuantumGraph` objects from a
    pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph.  Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository.  Should be read-only.
    input_collections : `~collections.abc.Sequence` [ `str` ], optional
        Collections to search for overall-input datasets.  If not provided,
        ``butler.collections`` is used (and must not be empty).
    output_run : `str`, optional
        Output `~lsst.daf.butler.CollectionType.RUN` collection.  If not
        provided, ``butler.run`` is used (and must not be `None`).
    skip_existing_in : `~collections.abc.Sequence` [ `str` ], optional
        Collections to search for outputs that already exist, for the purpose
        of skipping quanta that have already been run.
    clobber : `bool`, optional
        Whether to raise if predicted outputs already exist in ``output_run``
        (not including those quanta that would be skipped because they've
        already been run).  This never actually clobbers outputs; it just
        informs the graph generation algorithm whether execution will run with
        clobbering enabled.  This is ignored if ``output_run`` does not exist.

    Notes
    -----
    Constructing a `QuantumGraphBuilder` will run queries for existing datasets
    with empty data IDs (including but not limited to init inputs and outputs),
    in addition to resolving the given pipeline graph and testing for existence
    of the ``output_run`` collection.

    The `build` method splits the pipeline graph into independent subgraphs,
    then calls the abstract method `process_subgraph` on each, to allow
    concrete implementations to populate the rough graph structure (the
    `~quantum_graph_skeleton.QuantumGraphSkeleton` class) and search for
    existing datasets (further populating the builder's `existing_datasets`
    struct).  The `build` method then:

    - assembles `lsst.daf.butler.Quantum` instances from all data IDs in the
      skeleton;
    - looks for existing outputs found in ``skip_existing_in`` to see if any
      quanta should be skipped;
    - calls `PipelineTaskConnections.adjustQuantum` on all quanta, adjusting
      downstream quanta appropriately when preliminary predicted outputs are
      rejected (pruning nodes that will not have the inputs they need to run);
    - attaches datastore records and registry dataset types to the graph.

    In addition to implementing `process_subgraph`, derived classes are
    generally expected to add new construction keyword-only arguments to
    control the data IDs of the quantum graph, while forwarding all of the
    arguments defined in the base class to `super`.
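
    Examples
    --------
    A minimal sketch of the expected usage pattern; ``MinimalBuilder`` and
    its elided query logic are illustrative, not part of this API::

        class MinimalBuilder(QuantumGraphBuilder):
            def process_subgraph(self, subgraph):
                # Start from an empty skeleton for this subgraph's tasks,
                # then populate it (and self.existing_datasets) from butler
                # queries; see process_subgraph for the requirements.
                skeleton = QuantumGraphSkeleton(subgraph.tasks)
                ...
                return skeleton

        builder = MinimalBuilder(pipeline_graph, butler, output_run="u/user/run")
        quantum_graph = builder.build()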

166 """ 

167 

168 def __init__( 

169 self, 

170 pipeline_graph: PipelineGraph, 

171 butler: Butler, 

172 *, 

173 input_collections: Sequence[str] | None = None, 

174 output_run: str | None = None, 

175 skip_existing_in: Sequence[str] = (), 

176 clobber: bool = False, 

177 ): 

178 self.log = getLogger(__name__) 

179 self.metadata = TaskMetadata() 

180 self._pipeline_graph = pipeline_graph 

181 self.butler = butler 

182 self._pipeline_graph.resolve(self.butler.registry) 

183 if input_collections is None: 

184 input_collections = butler.collections 

185 if not input_collections: 

186 raise ValueError("No input collections provided.") 

187 self.input_collections = input_collections 

188 if output_run is None: 

189 output_run = butler.run 

190 if not output_run: 

191 raise ValueError("No output RUN collection provided.") 

192 self.output_run = output_run 

193 self.skip_existing_in = skip_existing_in 

194 self.empty_data_id = DataCoordinate.makeEmpty(butler.dimensions) 

195 self.clobber = clobber 

196 # See whether the output run already exists. 

197 self.output_run_exists = False 

198 try: 

199 if self.butler.registry.getCollectionType(self.output_run) is not CollectionType.RUN: 

200 raise RuntimeError(f"{self.output_run!r} is not a RUN collection.") 

201 self.output_run_exists = True 

202 except MissingCollectionError: 

203 # If the run doesn't exist we never need to clobber. This is not 

204 # an error so you can run with clobber=True the first time you 

205 # attempt some processing as well as all subsequent times, instead 

206 # of forcing the user to make the first attempt different. 

207 self.clobber = False 

208 # We need to know whether the skip_existing_in collection sequence 

209 # starts with the output run collection, as an optimization to avoid 

210 # queries later. 

211 try: 

212 skip_existing_in_flat = self.butler.registry.queryCollections( 

213 self.skip_existing_in, flattenChains=True 

214 ) 

215 except MissingCollectionError: 

216 skip_existing_in_flat = [] 

217 if not skip_existing_in_flat: 

218 self.skip_existing_in = [] 

219 if self.skip_existing_in and self.output_run_exists: 

220 self.skip_existing_starts_with_output_run = self.output_run == skip_existing_in_flat[0] 

221 else: 

222 self.skip_existing_starts_with_output_run = False 

223 self.existing_datasets = ExistingDatasets() 

224 try: 

225 packages_storage_class = butler.registry.getDatasetType( 

226 acc.PACKAGES_INIT_OUTPUT_NAME 

227 ).storageClass_name 

228 except MissingDatasetTypeError: 

229 packages_storage_class = acc.PACKAGES_INIT_OUTPUT_STORAGE_CLASS 

230 self._global_init_output_types = { 

231 acc.PACKAGES_INIT_OUTPUT_NAME: DatasetType( 

232 acc.PACKAGES_INIT_OUTPUT_NAME, 

233 self.universe.empty, 

234 packages_storage_class, 

235 ) 

236 } 

237 self._find_empty_dimension_datasets() 

238 self.prerequisite_info = { 

239 task_node.label: PrerequisiteInfo(task_node, self._pipeline_graph) 

240 for task_node in pipeline_graph.tasks.values() 

241 } 

242 

243 log: LsstLogAdapter 

244 """Logger to use for all quantum-graph generation messages. 

245 

246 General and per-task status messages should be logged at `~logging.INFO` 

247 level or higher, per-dataset-type status messages should be logged at 

248 `~lsst.utils.logging.VERBOSE` or higher, and per-data-ID status messages 

249 should be logged at `logging.DEBUG` or higher. 

250 """ 

251 

252 metadata: TaskMetadata 

253 """Metadata to store in the QuantumGraph. 

254 

255 The `TaskMetadata` class is used here primarily in order to enable 

256 resource-usage collection with the `lsst.utils.timer.timeMethod` decorator. 

257 """ 

258 

259 butler: Butler 

260 """Client for the data repository. 

261 

262 Should be read-only. 

263 """ 

264 

265 input_collections: Sequence[str] 

266 """Collections to search for overall-input datasets. 

267 """ 

268 

269 output_run: str 

270 """Output `~lsst.daf.butler.CollectionType.RUN` collection. 

271 """ 

272 

273 skip_existing_in: Sequence[str] 

274 """Collections to search for outputs that already exist for the purpose 

275 of skipping quanta that have already been run. 

276 """ 

277 

278 clobber: bool 

279 """Whether to raise if predicted outputs already exist in ``output_run`` 

280 

281 This never actually clobbers outputs; it just informs the graph generation 

282 algorithm whether execution will run with clobbering enabled. This is 

283 always `False` if `output_run_exists` is `False`. 

284 """ 

285 

286 empty_data_id: DataCoordinate 

287 """An empty data ID in the data repository's dimension universe. 

288 """ 

289 

290 output_run_exists: bool 

291 """Whether the output run exists in the data repository already. 

292 """ 

293 

294 skip_existing_starts_with_output_run: bool 

295 """Whether the `skip_existing_in` sequence begins with `output_run`. 

296 

297 If this is true, any dataset found in `output_run` can be used to 

298 short-circuit queries in `skip_existing_in`. 

299 """ 

300 

301 existing_datasets: ExistingDatasets 

302 """Struct holding datasets that have already been found in the data 

303 repository. 

304 

305 This is updated in-place as the `QuantumGraph` generation algorithm 

306 proceeds. 

307 """ 

308 

309 prerequisite_info: Mapping[str, PrerequisiteInfo] 

310 """Helper objects for finding prerequisite inputs, organized by task label. 

311 

312 Subclasses that find prerequisites should remove the 

313 covered `~prerequisite_helpers.PrerequisiteFinder` objects from this 

314 attribute. 

315 """ 

316 

317 @property 

318 def universe(self) -> DimensionUniverse: 

319 """Definitions of all data dimensions.""" 

320 return self.butler.dimensions 

321 

322 @final 

323 @timeMethod 

324 def build(self, metadata: Mapping[str, Any] | None = None) -> QuantumGraph: 

325 """Build the quantum graph. 

326 

327 Parameters 

328 ---------- 

329 metadata : `~collections.abc.Mapping`, optional 

330 Flexible metadata to add to the quantum graph. 

331 

332 Returns 

333 ------- 

334 quantum_graph : `QuantumGraph` 

335 DAG describing processing to be performed. 

336 

337 Notes 

338 ----- 

339 External code is expected to construct a `QuantumGraphBuilder` and then 

340 call this method exactly once. See class documentation for details on 

341 what it does. 
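
        Examples
        --------
        A sketch of the intended call pattern; ``MyBuilder`` stands in for
        any concrete subclass::

            builder = MyBuilder(pipeline_graph, butler, output_run="u/user/run")
            qg = builder.build(metadata={"comment": "example run"})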

342 """ 

343 full_skeleton = QuantumGraphSkeleton(self._pipeline_graph.tasks) 

344 subgraphs = list(self._pipeline_graph.split_independent()) 

345 for i, subgraph in enumerate(subgraphs): 

346 self.log.info( 

347 "Processing pipeline subgraph %d of %d with %d task(s).", 

348 i + 1, 

349 len(subgraphs), 

350 len(subgraph.tasks), 

351 ) 

352 self.log.verbose("Subgraph tasks: [%s]", ", ".join(label for label in subgraph.tasks)) 

353 subgraph_skeleton = self.process_subgraph(subgraph) 

354 full_skeleton.update(subgraph_skeleton) 

355 # Loop over tasks. The pipeline graph must be topologically sorted, 

356 # so a quantum is only processed after any quantum that provides its 

357 # inputs has been processed. 

358 for task_node in self._pipeline_graph.tasks.values(): 

359 self._resolve_task_quanta(task_node, full_skeleton) 

360 # Add global init-outputs to the skeleton. 

361 for dataset_type in self._global_init_output_types.values(): 

362 dataset_key = full_skeleton.add_dataset_node( 

363 dataset_type.name, self.empty_data_id, is_global_init_output=True 

364 ) 

365 ref = self.existing_datasets.outputs_in_the_way.get(dataset_key) 

366 if ref is None: 

367 ref = DatasetRef(dataset_type, self.empty_data_id, run=self.output_run) 

368 full_skeleton[dataset_key]["ref"] = ref 

369 # Remove dataset nodes with no edges that are not global init outputs, 

370 # which are generally overall-inputs whose original quanta end up 

371 # skipped or with no work to do (we can't remove these along with the 

372 # quanta because no quantum knows if its the only consumer). 

373 full_skeleton.remove_orphan_datasets() 

374 self._attach_datastore_records(full_skeleton) 

375 # TODO initialize most metadata here instead of in ctrl_mpexec. 

376 if metadata is None: 

377 metadata = {} 

378 return self._construct_quantum_graph(full_skeleton, metadata) 

379 

380 @abstractmethod 

381 def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton: 

382 """Build the rough structure for an independent subset of the 

383 `QuantumGraph` and query for relevant existing datasets. 

384 

385 Parameters 

386 ---------- 

387 subgraph : `.pipeline_graph.PipelineGraph` 

388 Subset of the pipeline graph that should be processed by this call. 

389 This is always resolved and topologically sorted. It should not be 

390 modified. 

391 

392 Returns 

393 ------- 

394 skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton` 

395 Class representing an initial quantum graph. See 

396 `quantum_graph_skeleton.QuantumGraphSkeleton` docs for details. 

397 After this is returned, the object may be modified in-place in 

398 unspecified ways. 

399 

400 Notes 

401 ----- 

402 In addition to returning a 

403 `quantum_graph_skeleton.QuantumGraphSkeleton`, this method should 

404 populate the `existing_datasets` structure by querying for all relevant 

405 datasets with non-empty data IDs (those with empty data IDs will 

406 already be present). In particular: 

407 

408 - `~ExistingDatasets.inputs` must always be populated with all 

409 overall-input datasets (but not prerequisites), by querying 

410 `input_collections`; 

411 - `~ExistingDatasets.outputs_for_skip` must be populated with any 

412 intermediate our output datasets present in `skip_existing_in` (it 

413 can be ignored if `skip_existing_in` is empty); 

414 - `~ExistingDatasets.outputs_in_the_way` must be populated with any 

415 intermediate or output datasets present in `output_run`, if 

416 `output_run_exists` (it can be ignored if `output_run_exists` is 

417 `False`). Note that the presence of such datasets is not 

418 automatically an error, even if `clobber is `False`, as these may be 

419 quanta that will be skipped. 

420 - `~ExistingDatasets.inputs` must be populated with all 

421 prerequisite-input datasets that were included in the skeleton, by 

422 querying `input_collections` (not all prerequisite inputs need to be 

423 included in the skeleton, but the base class can only use per-quantum 

424 queries to find them, and that can be slow when there are many 

425 quanta). 

426 

427 Dataset types should never be components and should always use the 

428 "common" storage class definition in `pipeline_graph.DatasetTypeNode` 

429 (which is the data repository definition when the dataset type is 

430 registered). 
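
        A schematic outline of an implementation; the butler queries and the
        skeleton-population calls are elided, since the node-creation API is
        documented in `quantum_graph_skeleton.QuantumGraphSkeleton`::

            def process_subgraph(self, subgraph):
                skeleton = QuantumGraphSkeleton(subgraph.tasks)
                # Add quantum and dataset nodes with their data IDs, and
                # connect them with input/output edges.
                ...
                # Record query results for the later stages of build().
                self.existing_datasets.inputs.update(...)
                self.existing_datasets.outputs_for_skip.update(...)
                self.existing_datasets.outputs_in_the_way.update(...)
                return skeleton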

431 """ 

432 raise NotImplementedError() 

433 

434 @final 

435 @timeMethod 

436 def _resolve_task_quanta(self, task_node: TaskNode, skeleton: QuantumGraphSkeleton) -> None: 

437 """Process the quanta for one task in a skeleton graph to skip those 

438 that have already completed and adjust those that request it. 

439 

440 Parameters 

441 ---------- 

442 task_node : `pipeline_graph.TaskNode` 

443 Node for this task in the pipeline graph. 

444 skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton` 

445 Preliminary quantum graph, to be modified in-place. 

446 

447 Notes 

448 ----- 

449 This method modifies ``skeleton`` in-place in several ways: 

450 

451 - It adds a "ref" attribute to dataset nodes, using the contents of 

452 `existing_datasets`. This ensures producing and consuming tasks 

453 start from the same `DatasetRef`. 

454 - It adds "inputs", "outputs", and "init_inputs" attributes to the 

455 quantum nodes, holding the same `NamedValueMapping` objects needed to 

456 construct an actual `Quantum` instances. 

457 - It removes quantum nodes that are to be skipped because their outputs 

458 already exist in `skip_existing_in`. It also removes their outputs 

459 from `ExistingDatasets.outputs_in_the_way`. 

460 - It adds prerequisite dataset nodes and edges that connect them to the 

461 quanta that consume them. 

462 - It removes quantum nodes whose 

463 `~PipelineTaskConnections.adjustQuantum` calls raise `NoWorkFound` or 

464 predict no outputs; 

465 - It removes the nodes of output datasets that are "adjusted away". 

466 - It removes the edges of input datasets that are "adjusted away". 

467 

468 The difference between how adjusted inputs and outputs are handled 

469 reflects the fact that many quanta can share the same input, but only 

470 one produces each output. This can lead to the graph having 

471 superfluous isolated nodes after processing is complete, but these 

472 should only be removed after all the quanta from all tasks have been 

473 processed. 
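
        After this method returns, a surviving quantum node's state includes
        (schematically, with the attribute names described above)::

            skeleton[quantum_key]["inputs"]   # NamedKeyDict[DatasetType, list[DatasetRef]]
            skeleton[quantum_key]["outputs"]  # NamedKeyDict[DatasetType, list[DatasetRef]]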

474 """ 

475 # Extract the helper object for the prerequisite inputs of this task, 

476 # and tell it to prepare to construct skypix bounds and timespans for 

477 # each quantum (these will automatically do nothing if nothing needs 

478 # those bounds). 

479 task_prerequisite_info = self.prerequisite_info[task_node.label] 

480 task_prerequisite_info.update_bounds() 

481 # Loop over all quanta for this task, remembering the ones we've 

482 # gotten rid of. 

483 skipped_quanta = [] 

484 no_work_quanta = [] 

485 for quantum_key in skeleton.get_quanta(task_node.label): 

486 if self._skip_quantum_if_metadata_exists(task_node, quantum_key, skeleton): 

487 skipped_quanta.append(quantum_key) 

488 continue 

489 quantum_data_id = skeleton[quantum_key]["data_id"] 

490 skypix_bounds_builder = task_prerequisite_info.bounds.make_skypix_bounds_builder(quantum_data_id) 

491 timespan_builder = task_prerequisite_info.bounds.make_timespan_builder(quantum_data_id) 

492 adjusted_outputs = self._gather_quantum_outputs( 

493 task_node, quantum_key, skeleton, skypix_bounds_builder, timespan_builder 

494 ) 

495 adjusted_inputs = self._gather_quantum_inputs( 

496 task_node, 

497 quantum_key, 

498 skeleton, 

499 task_prerequisite_info, 

500 skypix_bounds_builder, 

501 timespan_builder, 

502 ) 

503 # Give the task's Connections class an opportunity to remove 

504 # some inputs, or complain if they are unacceptable. This will 

505 # raise if one of the check conditions is not met, which is the 

506 # intended behavior. 

507 helper = AdjustQuantumHelper(inputs=adjusted_inputs, outputs=adjusted_outputs) 

508 try: 

509 helper.adjust_in_place( 

510 task_node._get_imported_data().connections, task_node.label, quantum_data_id 

511 ) 

512 except NoWorkFound as err: 

513 # Do not generate this quantum; it would not produce any 

514 # outputs. Remove it and all of the outputs it might have 

515 # produced from the skeleton. 

516 try: 

517 _, connection_name, _ = err.args 

518 details = f"not enough datasets for connection {connection_name}." 

519 except ValueError: 

520 details = str(err) 

521 self.log.debug( 

522 "No work found for quantum %s of task %s: %s", 

523 quantum_key.data_id_values, 

524 quantum_key.task_label, 

525 details, 

526 ) 

527 no_work_quanta.append(quantum_key) 

528 continue 

529 if helper.outputs_adjusted: 

530 if not any(adjusted_refs for adjusted_refs in helper.outputs.values()): 

531 # No outputs also means we don't generate this quantum. 

532 self.log.debug( 

533 "No outputs predicted for quantum %s of task %s.", 

534 quantum_key.data_id_values, 

535 quantum_key.task_label, 

536 ) 

537 no_work_quanta.append(quantum_key) 

538 continue 

539 # Remove output nodes that were not retained by 

540 # adjustQuantum. 

541 skeleton.remove_dataset_nodes( 

542 self._find_removed(skeleton.iter_outputs_of(quantum_key), helper.outputs) 

543 ) 

544 if helper.inputs_adjusted: 

545 if not any(bool(adjusted_refs) for adjusted_refs in helper.inputs.values()): 

546 raise QuantumGraphBuilderError( 

547 f"adjustQuantum implementation for {task_node.label}@{quantum_key.data_id_values} " 

548 "returned outputs but no inputs." 

549 ) 

550 # Remove input dataset edges that were not retained by 

551 # adjustQuantum. We can't remove the input dataset nodes 

552 # because some other quantum might still want them. 

553 skeleton.remove_input_edges( 

554 quantum_key, self._find_removed(skeleton.iter_inputs_of(quantum_key), helper.inputs) 

555 ) 

556 # Save the adjusted inputs and outputs to the quantum node's 

557 # state so we don't have to regenerate those data structures 

558 # from the graph. 

559 skeleton[quantum_key]["inputs"] = helper.inputs 

560 skeleton[quantum_key]["outputs"] = helper.outputs 

561 for no_work_quantum in no_work_quanta: 

562 skeleton.remove_quantum_node(no_work_quantum, remove_outputs=True) 

563 for skipped_quantum in skipped_quanta: 

564 skeleton.remove_quantum_node(skipped_quantum, remove_outputs=False) 

565 remaining_quanta = skeleton.get_quanta(task_node.label) 

566 self._resolve_task_init(task_node, skeleton, bool(skipped_quanta)) 

567 message_terms = [] 

568 if no_work_quanta: 

569 message_terms.append(f"{len(no_work_quanta)} had no work to do") 

570 if skipped_quanta: 

571 message_terms.append(f"{len(no_work_quanta)} previously succeeded") 

572 message_parenthetical = f" ({', '.join(message_terms)})" if message_terms else "" 

573 if remaining_quanta: 

574 self.log.info( 

575 "Generated %s for task %s%s.", 

576 _quantum_or_quanta(len(remaining_quanta)), 

577 task_node.label, 

578 message_parenthetical, 

579 ) 

580 else: 

581 self.log.info( 

582 "Dropping task %s because no quanta remain%s.", task_node.label, message_parenthetical 

583 ) 

584 skeleton.remove_task(task_node.label) 

585 

586 def _skip_quantum_if_metadata_exists( 

587 self, task_node: TaskNode, quantum_key: QuantumKey, skeleton: QuantumGraphSkeleton 

588 ) -> bool: 

589 """Identify and drop quanta that should be skipped because their 

590 metadata datasets already exist. 

591 

592 Parameters 

593 ---------- 

594 task_node : `pipeline_graph.TaskNode` 

595 Node for this task in the pipeline graph. 

596 quantum_key : `QuantumKey` 

597 Identifier for this quantum in the graph. 

598 skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton` 

599 Preliminary quantum graph, to be modified in-place. 

600 

601 Returns 

602 ------- 

603 skipped : `bool` 

604 `True` if the quantum is being skipped and has been removed from 

605 the graph, `False` otherwise. 

606 

607 Notes 

608 ----- 

609 If the metadata dataset for this quantum exists in 

610 `ExistingDatasets.outputs_for_skip`, the quantum will be skipped. This 

611 causes the quantum node to be removed from the graph. Dataset nodes 

612 that were previously the outputs of this quantum will have their "ref" 

613 attribute set from `ExistingDatasets.outputs_for_skip`, or will be 

614 removed if there is no such dataset there. Any output dataset in 

615 `ExistingDatasets.outputs_in_the_way` will be removed. 

616 """ 

617 metadata_dataset_key = DatasetKey( 

618 task_node.metadata_output.parent_dataset_type_name, quantum_key.data_id_values 

619 ) 

620 if metadata_dataset_key in self.existing_datasets.outputs_for_skip: 

621 # This quantum's metadata is already present in the the 

622 # skip_existing_in collections; we'll skip it. But the presence of 

623 # the metadata dataset doesn't guarantee that all of the other 

624 # outputs we predicted are present; we have to check. 

625 for output_dataset_key in list(skeleton.iter_outputs_of(quantum_key)): 

626 if ( 

627 output_ref := self.existing_datasets.outputs_for_skip.get(output_dataset_key) 

628 ) is not None: 

629 # Populate the skeleton graph's node attributes 

630 # with the existing DatasetRef, just like a 

631 # predicted output of a non-skipped quantum. 

632 skeleton[output_dataset_key]["ref"] = output_ref 

633 else: 

634 # Remove this dataset from the skeleton graph, 

635 # because the quantum that would have produced it 

636 # is being skipped and it doesn't already exist. 

637 skeleton.remove_dataset_nodes([output_dataset_key]) 

638 # If this dataset was "in the way" (i.e. already in the 

639 # output run), it isn't anymore. 

640 self.existing_datasets.outputs_in_the_way.pop(output_dataset_key, None) 

641 # Removing the quantum node from the graph will happen outside this 

642 # function. 

643 return True 

644 return False 

645 

646 @final 

647 def _gather_quantum_outputs( 

648 self, 

649 task_node: TaskNode, 

650 quantum_key: QuantumKey, 

651 skeleton: QuantumGraphSkeleton, 

652 skypix_bounds_builder: SkyPixBoundsBuilder, 

653 timespan_builder: TimespanBuilder, 

654 ) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

655 """Collect outputs or generate datasets for a preliminary quantum and 

656 put them in the form used by `~lsst.daf.butler.Quantum` and 

657 `~PipelineTaskConnections.adjustQuantum`. 

658 

659 Parameters 

660 ---------- 

661 task_node : `pipeline_graph.TaskNode` 

662 Node for this task in the pipeline graph. 

663 quantum_key : `QuantumKey` 

664 Identifier for this quantum in the graph. 

665 skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton` 

666 Preliminary quantum graph, to be modified in-place. 

667 skypix_bounds_builder : `~prerequisite_helpers.SkyPixBoundsBuilder` 

668 An object that accumulates the appropriate spatial bounds for a 

669 quantum. 

670 timespan_builder : `~prerequisite_helpers.TimespanBuilder` 

671 An object that accumulates the appropriate timespan for a quantum. 

672 

673 Returns 

674 ------- 

675 outputs : `~lsst.daf.butler.NamedKeyDict` [ \ 

676 `~lsst.daf.butler.DatasetType`, `list` [ \ 

677 `~lsst.daf.butler.DatasetRef` ] ] 

678 All outputs to the task, using the storage class and components 

679 defined by the task's own connections. 

680 

681 Notes 

682 ----- 

683 This first looks for outputs already present in the `output_run` by 

684 looking in `ExistingDatasets.outputs_in_the_way`; if it finds something 

685 and `clobber` is `True`, it uses that ref (it's not ideal that both the 

686 original dataset and its replacement will have the same UUID, but we 

687 don't have space in the quantum graph for two UUIDs, and we need the 

688 datastore records of the original there). If `clobber` is `False`, 

689 `RuntimeError` is raised. If there is no output already present, a new 

690 one with a random UUID is generated. In all cases the "ref" attribute 

691 of the dataset node in the skeleton is set. 

692 """ 

693 outputs_by_type: dict[str, list[DatasetRef]] = {} 

694 dataset_key: DatasetKey 

695 for dataset_key in skeleton.iter_outputs_of(quantum_key): 

696 dataset_data_id = skeleton[dataset_key]["data_id"] 

697 dataset_type_node = self._pipeline_graph.dataset_types[dataset_key.parent_dataset_type_name] 

698 if (ref := self.existing_datasets.outputs_in_the_way.get(dataset_key)) is None: 

699 ref = DatasetRef(dataset_type_node.dataset_type, dataset_data_id, run=self.output_run) 

700 elif not self.clobber: 

701 # We intentionally raise here, before running adjustQuantum, 

702 # because it'd be weird if we left an old potential output of a 

703 # task sitting there in the output collection, just because the 

704 # task happened to not actually produce it. 

705 raise OutputExistsError( 

706 f"Potential output dataset {ref} already exists in the output run " 

707 f"{self.output_run}, but clobbering outputs was not expected to be necessary." 

708 ) 

709 skypix_bounds_builder.handle_dataset(dataset_key.parent_dataset_type_name, dataset_data_id) 

710 timespan_builder.handle_dataset(dataset_key.parent_dataset_type_name, dataset_data_id) 

711 skeleton[dataset_key]["ref"] = ref 

712 outputs_by_type.setdefault(dataset_key.parent_dataset_type_name, []).append(ref) 

713 adapted_outputs: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict() 

714 for write_edge in task_node.iter_all_outputs(): 

715 dataset_type_node = self._pipeline_graph.dataset_types[write_edge.parent_dataset_type_name] 

716 edge_dataset_type = write_edge.adapt_dataset_type(dataset_type_node.dataset_type) 

717 adapted_outputs[edge_dataset_type] = [ 

718 write_edge.adapt_dataset_ref(ref) 

719 for ref in sorted(outputs_by_type.get(write_edge.parent_dataset_type_name, [])) 

720 ] 

721 return adapted_outputs 

722 

723 @final 

724 def _gather_quantum_inputs( 

725 self, 

726 task_node: TaskNode, 

727 quantum_key: QuantumKey, 

728 skeleton: QuantumGraphSkeleton, 

729 task_prerequisite_info: PrerequisiteInfo, 

730 skypix_bounds_builder: SkyPixBoundsBuilder, 

731 timespan_builder: TimespanBuilder, 

732 ) -> NamedKeyDict[DatasetType, list[DatasetRef]]: 

733 """Collect input datasets for a preliminary quantum and put them in the 

734 form used by `~lsst.daf.butler.Quantum` and 

735 `~PipelineTaskConnections.adjustQuantum`. 

736 

737 Parameters 

738 ---------- 

739 task_node : `pipeline_graph.TaskNode` 

740 Node for this task in the pipeline graph. 

741 quantum_key : `QuantumKey` 

742 Identifier for this quantum in the graph. 

743 skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton` 

744 Preliminary quantum graph, to be modified in-place. 

745 skypix_bounds_builder : `~prerequisite_helpers.SkyPixBoundsBuilder` 

746 An object that accumulates the appropriate spatial bounds for a 

747 quantum. 

748 timespan_builder : `~prerequisite_helpers.TimespanBuilder` 

749 An object that accumulates the appropriate timespan for a quantum. 

750 

751 Returns 

752 ------- 

753 inputs : `~lsst.daf.butler.NamedKeyDict` [ \ 

754 `~lsst.daf.butler.DatasetType`, `list` [ \ 

755 `~lsst.daf.butler.DatasetRef` ] ] 

756 All regular and prerequisite inputs to the task, using the storage 

757 class and components defined by the task's own connections. 

758 

759 Notes 

760 ----- 

761 On return, the dataset nodes that represent inputs to this quantum will 

762 either have their "ref" attribute set (using the common dataset type, 

763 not the task-specific one) or will be removed from the graph. 

764 

765 For regular inputs, usually an existing "ref" (corresponding to an 

766 output of another quantum) will be found and left unchanged. When 

767 there is no existing "ref" attribute, `ExistingDatasets.inputs` is 

768 searched next; if there is nothing there, the input will be removed. 

769 

770 Prerequisite inputs are always queried for directly here (delegating to 

771 `_find_prerequisite_inputs`). They are never produced by other tasks, 

772 and cannot in general be queried for in advance when 

773 `ExistingDatasets.inputs` is populated. 

774 """ 

775 quantum_data_id = skeleton[quantum_key]["data_id"] 

776 inputs_by_type: dict[str, set[DatasetRef]] = {} 

777 dataset_key: DatasetKey | PrerequisiteDatasetKey 

778 # Process inputs already present in the skeleton - this should include 

779 # all regular inputs (including intermediates) and may include some 

780 # prerequisites. 

781 for dataset_key in list(skeleton.iter_inputs_of(quantum_key)): 

782 if (ref := skeleton[dataset_key].get("ref")) is None: 

783 # This dataset is an overall input - if it was an intermediate, 

784 # we would have already either removed the node or set the 

785 # "ref" attribute when processing its producing quantum - and 

786 # this is the first time we're trying to resolve it. 

787 if (ref := self.existing_datasets.inputs.get(dataset_key)) is None: 

788 # It also doesn't exist in the input collections, so we 

789 # remove its node in the skeleton graph (so other consumers 

790 # won't have to check for it). 

791 skeleton.remove_dataset_nodes([dataset_key]) 

792 continue 

793 skeleton[dataset_key]["ref"] = ref 

794 inputs_by_type.setdefault(dataset_key.parent_dataset_type_name, set()).add(ref) 

795 skypix_bounds_builder.handle_dataset(dataset_key.parent_dataset_type_name, ref.dataId) 

796 timespan_builder.handle_dataset(dataset_key.parent_dataset_type_name, ref.dataId) 

797 # Query for any prerequisites not handled by process_subgraph. Note 

798 # that these were not already in the skeleton graph, so we add them 

799 # now. 

800 skypix_bounds = skypix_bounds_builder.finish() 

801 timespan = timespan_builder.finish() 

802 for finder in task_prerequisite_info.finders.values(): 

803 inputs_for_type = inputs_by_type.setdefault(finder.dataset_type_node.name, set()) 

804 dataset_keys = [] 

805 for ref in finder.find( 

806 self.butler, self.input_collections, quantum_data_id, skypix_bounds, timespan 

807 ): 

808 dataset_key = skeleton.add_prerequisite_node(ref.datasetType.name, ref=ref) 

809 dataset_keys.append(dataset_key) 

810 inputs_for_type.add(ref) 

811 skeleton.add_input_edges(quantum_key, dataset_keys) 

812 adapted_inputs: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict() 

813 for read_edge in task_node.iter_all_inputs(): 

814 dataset_type_node = self._pipeline_graph.dataset_types[read_edge.parent_dataset_type_name] 

815 edge_dataset_type = read_edge.adapt_dataset_type(dataset_type_node.dataset_type) 

816 if (current_dataset_type := adapted_inputs.keys().get(edge_dataset_type.name)) is None: 

817 adapted_inputs[edge_dataset_type] = [ 

818 read_edge.adapt_dataset_ref(ref) 

819 for ref in sorted(inputs_by_type.get(read_edge.parent_dataset_type_name, frozenset())) 

820 ] 

821 elif current_dataset_type != edge_dataset_type: 

822 raise NotImplementedError( 

823 f"Task {task_node.label!r} has {edge_dataset_type.name!r} as an input via " 

824 "two different connections, with two different storage class overrides. " 

825 "This is not yet supported due to limitations in the Quantum data structure." 

826 ) 

827 # If neither the `if` nor the `elif` above match, it means 

828 # multiple input connections have exactly the same dataset 

829 # type, and hence nothing to do after the first one. 

830 return adapted_inputs 

831 

832 @final 

833 def _resolve_task_init( 

834 self, task_node: TaskNode, skeleton: QuantumGraphSkeleton, has_skipped_quanta: bool 

835 ) -> None: 

836 """Add init-input and init-output dataset nodes and edges for a task to 

837 the skeleton. 

838 

839 Parameters 

840 ---------- 

841 task_node : `pipeline_graph.TaskNode` 

842 Pipeline graph description of the task. 

843 skeleton : `QuantumGraphSkeleton` 

844 In-progress quantum graph data structure to update in-place. 

845 has_skipped_quanta : `bool` 

846 Whether any of this task's quanta were skipped because they had 

847 already succeeded. 

848 """ 

849 quanta = skeleton.get_quanta(task_node.label) 

850 task_init_key = TaskInitKey(task_node.label) 

851 if quanta: 

852 adapted_inputs: NamedKeyDict[DatasetType, DatasetRef] = NamedKeyDict() 

853 # Process init-inputs. 

854 input_keys: list[DatasetKey] = [] 

855 for read_edge in task_node.init.iter_all_inputs(): 

856 dataset_key = skeleton.add_dataset_node( 

857 read_edge.parent_dataset_type_name, self.empty_data_id 

858 ) 

859 skeleton.add_input_edge(task_init_key, dataset_key) 

860 if (ref := skeleton[dataset_key].get("ref")) is None: 

861 try: 

862 ref = self.existing_datasets.inputs[dataset_key] 

863 except KeyError: 

864 raise InitInputMissingError( 

865 f"Overall init-input dataset {read_edge.parent_dataset_type_name!r} " 

866 f"needed by task {task_node.label!r} not found in input collection(s) " 

867 f"{self.input_collections}." 

868 ) from None 

869 skeleton[dataset_key]["ref"] = ref 

870 for quantum_key in skeleton.get_quanta(task_node.label): 

871 skeleton.add_input_edge(quantum_key, dataset_key) 

872 input_keys.append(dataset_key) 

873 adapted_ref = read_edge.adapt_dataset_ref(ref) 

874 adapted_inputs[adapted_ref.datasetType] = adapted_ref 

875 # Save the quantum-adapted init inputs to each quantum, and add 

876 # skeleton edges connecting the init inputs to each quantum. 

877 for quantum_key in skeleton.get_quanta(task_node.label): 

878 skeleton[quantum_key]["init_inputs"] = adapted_inputs 

879 # Process init-outputs. 

880 adapted_outputs: NamedKeyDict[DatasetType, DatasetRef] = NamedKeyDict() 

881 for write_edge in task_node.init.iter_all_outputs(): 

882 dataset_key = skeleton.add_dataset_node( 

883 write_edge.parent_dataset_type_name, self.empty_data_id 

884 ) 

885 if (ref := self.existing_datasets.outputs_in_the_way.get(dataset_key)) is None: 

886 ref = DatasetRef( 

887 self._pipeline_graph.dataset_types[write_edge.parent_dataset_type_name].dataset_type, 

888 self.empty_data_id, 

889 run=self.output_run, 

890 ) 

891 skeleton[dataset_key]["ref"] = ref 

892 skeleton.add_output_edge(task_init_key, dataset_key) 

893 adapted_ref = write_edge.adapt_dataset_ref(ref) 

894 adapted_outputs[adapted_ref.datasetType] = adapted_ref 

895 skeleton[task_init_key]["inputs"] = adapted_inputs 

896 skeleton[task_init_key]["outputs"] = adapted_outputs 

897 elif has_skipped_quanta: 

898 # No quanta remain for this task, but at least one quantum was 

899 # skipped because its outputs were present in the skip_existing_in 

900 # collections. This means all init outputs should be present in 

901 # the skip_existing_in collections, too, and we need to put those 

902 # refs in the graph. 

903 for write_edge in task_node.init.iter_all_outputs(): 

904 dataset_key = skeleton.add_dataset_node( 

905 write_edge.parent_dataset_type_name, self.empty_data_id 

906 ) 

907 if (ref := self.existing_datasets.outputs_for_skip.get(dataset_key)) is None: 

908 raise InitInputMissingError( 

909 f"Init-output dataset {write_edge.parent_dataset_type_name!r} of skipped task " 

910 f"{task_node.label!r} not found in skip-existing-in collection(s) " 

911 f"{self.skip_existing_in}." 

912 ) from None 

913 skeleton[dataset_key]["ref"] = ref 

914 # If this dataset was "in the way" (i.e. already in the output 

915 # run), it isn't anymore. 

916 self.existing_datasets.outputs_in_the_way.pop(dataset_key, None) 

917 # No quanta remain in this task, but none were skipped; this means 

918 # they all got pruned because of NoWorkFound conditions. This 

919 # dooms all downstream quanta to the same fate, so we don't bother 

920 # doing anything with the task's init-outputs, since nothing is 

921 # going to consume them. 

922 

923 @final 

924 @timeMethod 

925 def _find_empty_dimension_datasets(self) -> None: 

926 """Query for all dataset types with no dimensions, updating 

927 `existing_datasets` in-place. 

928 

929 This includes but is not limited to init inputs and init outputs. 

930 """ 

931 _, dataset_type_nodes = self._pipeline_graph.group_by_dimensions()[self.universe.empty] 

932 dataset_types = [node.dataset_type for node in dataset_type_nodes.values()] 

933 dataset_types.extend(self._global_init_output_types.values()) 

934 for dataset_type in dataset_types: 

935 key = DatasetKey(dataset_type.name, self.empty_data_id.values_tuple()) 

936 if ( 

937 self._pipeline_graph.producer_of(dataset_type.name) is None 

938 and dataset_type.name not in self._global_init_output_types 

939 ): 

940 # Dataset type is an overall input; we always need to try to 

941 # find these. 

942 try: 

943 ref = self.butler.registry.findDataset( 

944 dataset_type.name, collections=self.input_collections 

945 ) 

946 except MissingDatasetTypeError: 

947 ref = None 

948 if ref is not None: 

949 self.existing_datasets.inputs[key] = ref 

950 elif self.skip_existing_in: 

951 # Dataset type is an intermediate or output; need to find these 

952 # if only they're from previously executed quanta that we might 

953 # skip... 

954 try: 

955 ref = self.butler.registry.findDataset( 

956 dataset_type.name, collections=self.skip_existing_in 

957 ) 

958 except MissingDatasetTypeError: 

959 ref = None 

960 if ref is not None: 

961 self.existing_datasets.outputs_for_skip[key] = ref 

962 if ref.run == self.output_run: 

963 self.existing_datasets.outputs_in_the_way[key] = ref 

964 if self.output_run_exists and not self.skip_existing_starts_with_output_run: 

965 # ...or if they're in the way and would need to be clobbered 

966 # (and we haven't already found them in the previous block). 

967 try: 

968 ref = self.butler.registry.findDataset(dataset_type.name, collections=[self.output_run]) 

969 except MissingDatasetTypeError: 

970 ref = None 

971 if ref is not None: 

972 self.existing_datasets.outputs_in_the_way[key] = ref 

973 

974 @final 

975 @timeMethod 

976 def _attach_datastore_records(self, skeleton: QuantumGraphSkeleton) -> None: 

977 """Add datastore records for all overall inputs to a preliminary 

978 quantum graph. 

979 

980 Parameters 

981 ---------- 

982 skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton` 

983 Preliminary quantum graph to update in place. 

984 

985 Notes 

986 ----- 

987 On return, all quantum nodes in the skeleton graph will have a 

988 "datastore_records" attribute that is a mapping from datastore name 

989 to `lsst.daf.butler.DatastoreRecordData`, as used by 

990 `lsst.daf.butler.Quantum`. 
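
        Schematically, with an illustrative datastore name::

            skeleton[quantum_key]["datastore_records"] = {
                "FileDatastore@<root>": DatastoreRecordData(...),
            }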

991 """ 

992 overall_inputs = skeleton.extract_overall_inputs() 

993 exported_records = self.butler._datastore.export_records(overall_inputs.values()) 

994 for quantum_key in skeleton.iter_all_quanta(): 

995 quantum_records = {} 

996 input_ids = { 

997 ref.id 

998 for dataset_key in skeleton.iter_inputs_of(quantum_key) 

999 if (ref := overall_inputs.get(dataset_key)) is not None 

1000 } 

1001 if input_ids: 

1002 for datastore_name, records in exported_records.items(): 

1003 matching_records = records.subset(input_ids) 

1004 if matching_records is not None: 

1005 quantum_records[datastore_name] = matching_records 

1006 skeleton[quantum_key]["datastore_records"] = quantum_records 

1007 

1008 @final 

1009 @timeMethod 

1010 def _construct_quantum_graph( 

1011 self, skeleton: QuantumGraphSkeleton, metadata: Mapping[str, Any] 

1012 ) -> QuantumGraph: 

1013 """Construct a `QuantumGraph` object from the contents of a 

1014 fully-processed `quantum_graph_skeleton.QuantumGraphSkeleton`. 

1015 

1016 Parameters 

1017 ---------- 

1018 skeleton : `quantum_graph_skeleton.QuantumGraphSkeleton` 

1019 Preliminary quantum graph. Must have "init_inputs", "inputs", and 

1020 "outputs" attributes on all quantum nodes, as added by 

1021 `_resolve_task_quanta`, as well as a "datastore_records" attribute 

1022 as added by `_attach_datastore_records`. 

1023 metadata : `Mapping` 

1024 Flexible metadata to add to the graph. 

1025 

1026 Returns 

1027 ------- 

1028 quantum_graph : `QuantumGraph` 

1029 DAG describing processing to be performed. 

1030 """ 

1031 quanta: dict[TaskDef, set[Quantum]] = {} 

1032 init_inputs: dict[TaskDef, Iterable[DatasetRef]] = {} 

1033 init_outputs: dict[TaskDef, Iterable[DatasetRef]] = {} 

1034 for task_def in self._pipeline_graph._iter_task_defs(): 

1035 if not skeleton.has_task(task_def.label): 

1036 continue 

1037 task_node = self._pipeline_graph.tasks[task_def.label] 

1038 task_init_key = skeleton.get_task_init_node(task_def.label) 

1039 init_inputs[task_def] = skeleton[task_init_key]["inputs"].values() 

1040 init_outputs[task_def] = skeleton[task_init_key]["outputs"].values() 

1041 quanta_for_task: set[Quantum] = set() 

1042 for quantum_key in skeleton.get_quanta(task_node.label): 

1043 node_state = skeleton[quantum_key] 

1044 quanta_for_task.add( 

1045 Quantum( 

1046 taskName=task_node.task_class_name, 

1047 taskClass=task_node.task_class, 

1048 dataId=node_state["data_id"], 

1049 initInputs=node_state["init_inputs"], 

1050 inputs=node_state["inputs"], 

1051 outputs=node_state["outputs"], 

1052 datastore_records=node_state.get("datastore_records"), 

1053 ) 

1054 ) 

1055 quanta[task_def] = quanta_for_task 

1056 

1057 registry_dataset_types: list[DatasetType] = [ 

1058 node.dataset_type for node in self._pipeline_graph.dataset_types.values() 

1059 ] 

1060 

1061 all_metadata = self.metadata.to_dict() 

1062 all_metadata.update(metadata) 

1063 return QuantumGraph( 

1064 quanta, 

1065 metadata=all_metadata, 

1066 universe=self.universe, 

1067 initInputs=init_inputs, 

1068 initOutputs=init_outputs, 

1069 globalInitOutputs=[skeleton[key]["ref"] for key in skeleton.global_init_outputs], 

1070 registryDatasetTypes=registry_dataset_types, 

1071 ) 

1072 

1073 @staticmethod 

1074 @final 

1075 def _find_removed( 

1076 original: Iterable[DatasetKey | PrerequisiteDatasetKey], 

1077 adjusted: NamedKeyMapping[DatasetType, Sequence[DatasetRef]], 

1078 ) -> set[DatasetKey | PrerequisiteDatasetKey]: 

1079 """Identify skeleton-graph dataset nodes that have been removed by 

1080 `~PipelineTaskConnections.adjustQuantum`. 

1081 

1082 Parameters 

1083 ---------- 

1084 original : `~collections.abc.Iterable` [ `DatasetKey` or \ 

1085 `PrerequisiteDatasetKey` ] 

1086 Identifiers for the dataset nodes that were the original neighbors 

1087 (inputs or outputs) of a quantum. 

1088 adjusted : `~lsst.daf.butler.NamedKeyMapping` [ \ 

1089 `~lsst.daf.butler.DatasetType`, \ 

1090 `~collections.abc.Sequence` [ `lsst.daf.butler.DatasetType` ] ] 

1091 Adjusted neighbors, in the form used by `lsst.daf.butler.Quantum`. 

1092 

1093 Returns 

1094 ------- 

1095 removed : `set` [ `DatasetKey` ] 

1096 Datasets in ``original`` that have no counterpart in ``adjusted``. 

1097 """ 

1098 result = set(original) 

1099 for dataset_type, kept_refs in adjusted.items(): 

1100 parent_dataset_type_name, _ = DatasetType.splitDatasetTypeName(dataset_type.name) 

1101 for kept_ref in kept_refs: 

1102 result.remove(DatasetKey(parent_dataset_type_name, kept_ref.dataId.values_tuple())) 

1103 return result 

1104 

1105 

1106@dataclasses.dataclass(eq=False, order=False) 

1107class ExistingDatasets: 

1108 """Struct that holds the results of dataset queries for 

1109 `QuantumGraphBuilder`. 
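
    Notes
    -----
    `QuantumGraphBuilder.process_subgraph` implementations populate these
    mappings directly, e.g. (schematically)::

        self.existing_datasets.inputs[dataset_key] = ref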

1110 """ 

1111 

1112 inputs: dict[DatasetKey | PrerequisiteDatasetKey, DatasetRef] = dataclasses.field(default_factory=dict) 

1113 """Overall-input datasets found in `QuantumGraphBuilder.input_collections`. 

1114 

1115 This may include prerequisite inputs. It does include init-inputs. 

1116 It does not include intermediates. 

1117 """ 

1118 

1119 outputs_for_skip: dict[DatasetKey, DatasetRef] = dataclasses.field(default_factory=dict) 

1120 """Output datasets found in `QuantumGraphBuilder.skip_existing_in`. 

1121 

1122 It is unspecified whether this contains include init-outputs; there is 

1123 no concept of skipping at the init stage, so this is not expected to 

1124 matter. 

1125 """ 

1126 

1127 outputs_in_the_way: dict[DatasetKey, DatasetRef] = dataclasses.field(default_factory=dict) 

1128 """Output datasets found in `QuantumGraphBuilder.output_run`. 

1129 

1130 This includes regular outputs and init-outputs. 

1131 """ 

1132 

1133 

1134def _quantum_or_quanta(n: int) -> str: 

1135 """Correctly pluralize 'quantum' if needed.""" 

1136 return f"{n} quanta" if n != 1 else "1 quantum"