Coverage for python/lsst/daf/butler/_quantum_backed.py: 34%
192 statements
coverage.py v7.3.2, created at 2023-12-01 11:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30from . import ddl
32__all__ = ("QuantumBackedButler", "QuantumProvenanceData")
34import itertools
35import logging
36import uuid
37from collections import defaultdict
38from collections.abc import Iterable, Mapping
39from typing import TYPE_CHECKING, Any
41from deprecated.sphinx import deprecated
42from lsst.resources import ResourcePathExpression
44from ._butler_config import ButlerConfig
45from ._compat import _BaseModelCompat
46from ._config import Config
47from ._dataset_ref import DatasetId, DatasetRef
48from ._dataset_type import DatasetType
49from ._deferredDatasetHandle import DeferredDatasetHandle
50from ._limited_butler import LimitedButler
51from ._quantum import Quantum
52from ._storage_class import StorageClass, StorageClassFactory
53from .datastore import Datastore
54from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData
55from .dimensions import DimensionUniverse
56from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
57from .registry.databases.sqlite import SqliteDatabase
58from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
59from .registry.opaque import ByNameOpaqueTableStorageManager
61if TYPE_CHECKING:
62 from ._butler import Butler
64_LOG = logging.getLogger(__name__)
67class _DatasetRecordStorageManagerDatastoreConstructionMimic:
68 """A partial implementation of `DatasetRecordStorageManager` that exists
69 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
70 to be constructed without a full `Registry`.
72 Notes
73 -----
74 The interface implemented by this class should probably be its own ABC,
75 and that ABC should probably be used in the definition of
76 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
77 changes minimal.
78 """
80 @classmethod
81 def getIdColumnType(cls) -> type:
82 # Docstring inherited.
83 return ddl.GUID
85 @classmethod
86 def addDatasetForeignKey(
87 cls,
88 tableSpec: ddl.TableSpec,
89 *,
90 name: str = "dataset",
91 constraint: bool = True,
92 onDelete: str | None = None,
93 **kwargs: Any,
94 ) -> ddl.FieldSpec:
95 # Docstring inherited.
96 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
97 tableSpec.fields.add(idFieldSpec)
98 return idFieldSpec
101class QuantumBackedButler(LimitedButler):
102 """An implementation of `LimitedButler` intended to back execution of a
103 single `Quantum`.
105 Parameters
106 ----------
107 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
108 Dataset IDs for datasets that can be read from this butler.
109 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
110 Dataset IDs for datasets that can be stored in this butler.
111 dimensions : `DimensionUniverse`
112 Object managing all dimension definitions.
113 datastore : `Datastore`
114 Datastore to use for all dataset I/O and existence checks.
115 storageClasses : `StorageClassFactory`
116 Object managing all storage class definitions.
118 Notes
119 -----
120 Most callers should use the `initialize` `classmethod` to construct new
121 instances instead of calling the constructor directly.
123 `QuantumBackedButler` uses a SQLite database internally, in order to reuse
124 existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
125 implementations that rely on SQLAlchemy. If implementations are added in the
126 future that don't rely on SQLAlchemy, it should be possible to swap them
127 in by overriding the type arguments to `initialize` (though at present,
128 `QuantumBackedButler` would still create at least an in-memory SQLite
129 database that would then go unused).
131 We imagine `QuantumBackedButler` being used during (at least) batch
132 execution to capture `Datastore` records and save them to per-quantum
133 files, which are also a convenient place to store provenance for eventual
134 upload to a SQL-backed `Registry` (once `Registry` has tables to store
135 provenance, that is).
136 These per-quantum files can be written in two ways:
138 - The SQLite file used internally by `QuantumBackedButler` can be used
139 directly by customizing the ``filename`` argument to ``initialize``, and
140 then transferring that file to the object store after execution completes
141 (or fails; a ``try/finally`` pattern probably makes sense here).
143 - A JSON or YAML file can be written by calling `extract_provenance_data`,
144 and using ``pydantic`` methods to write the returned
145 `QuantumProvenanceData` to a file.
147 Note that at present, the SQLite file only contains datastore records, not
148 provenance, but that should be easy to address (if desired) after we
149 actually design a `Registry` schema for provenance. I also suspect that
150 we'll want to explicitly close the SQLite file somehow before trying to
151 transfer it. But I'm guessing we'd prefer to write the per-quantum files
152 as JSON anyway.
153 """
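# Illustrative sketch (editorial, not part of the module): the first option
# described above, backing the butler with an on-disk SQLite file that is
# transferred after execution.  ``repo_config``, ``quantum``, ``run_task`` and
# ``transfer_to_object_store`` are hypothetical stand-ins for the caller's own
# configuration and helpers.
from lsst.daf.butler import DimensionUniverse, QuantumBackedButler

universe = DimensionUniverse()  # default dimension definitions
qbb = QuantumBackedButler.initialize(
    config=repo_config,  # repo root, config file, or Config instance
    quantum=quantum,  # Quantum with fully resolved DatasetRefs
    dimensions=universe,
    filename="/tmp/quantum_records.sqlite3",  # instead of ":memory:"
)
try:
    run_task(qbb)  # execute the task against this limited butler
finally:
    # Transfer the SQLite file whether execution succeeded or failed.
    transfer_to_object_store("/tmp/quantum_records.sqlite3")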
155 def __init__(
156 self,
157 predicted_inputs: Iterable[DatasetId],
158 predicted_outputs: Iterable[DatasetId],
159 dimensions: DimensionUniverse,
160 datastore: Datastore,
161 storageClasses: StorageClassFactory,
162 dataset_types: Mapping[str, DatasetType] | None = None,
163 ):
164 self._dimensions = dimensions
165 self._predicted_inputs = set(predicted_inputs)
166 self._predicted_outputs = set(predicted_outputs)
167 self._available_inputs: set[DatasetId] = set()
168 self._unavailable_inputs: set[DatasetId] = set()
169 self._actual_inputs: set[DatasetId] = set()
170 self._actual_output_refs: set[DatasetRef] = set()
171 self._datastore = datastore
172 self.storageClasses = storageClasses
173 self._dataset_types: Mapping[str, DatasetType] = {}
174 if dataset_types is not None:
175 self._dataset_types = dataset_types
176 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
178 @classmethod
179 def initialize(
180 cls,
181 config: Config | ResourcePathExpression,
182 quantum: Quantum,
183 dimensions: DimensionUniverse,
184 filename: str = ":memory:",
185 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
186 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
187 search_paths: list[str] | None = None,
188 dataset_types: Mapping[str, DatasetType] | None = None,
189 ) -> QuantumBackedButler:
190 """Construct a new `QuantumBackedButler` from repository configuration
191 and helper types.
193 Parameters
194 ----------
195 config : `Config` or `~lsst.resources.ResourcePathExpression`
196 A butler repository root, configuration filename, or configuration
197 instance.
198 quantum : `Quantum`
199 Object describing the predicted input and output datasets relevant
200 to this butler. This must have resolved `DatasetRef` instances for
201 all inputs and outputs.
202 dimensions : `DimensionUniverse`
203 Object managing all dimension definitions.
204 filename : `str`, optional
205 Name for the SQLite database that will back this butler; defaults
206 to an in-memory database.
207 OpaqueManagerClass : `type`, optional
208 A subclass of `OpaqueTableStorageManager` to use for datastore
209 opaque records. Default is a SQL-backed implementation.
210 BridgeManagerClass : `type`, optional
211 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
212 location records. Default is a SQL-backed implementation.
213 search_paths : `list` of `str`, optional
214 Additional search paths for butler configuration.
215 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
216 optional
217 Mapping of the dataset type name to its registry definition.
218 """
219 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
220 predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
221 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
222 return cls._initialize(
223 config=config,
224 predicted_inputs=predicted_inputs,
225 predicted_outputs=predicted_outputs,
226 dimensions=dimensions,
227 filename=filename,
228 datastore_records=quantum.datastore_records,
229 OpaqueManagerClass=OpaqueManagerClass,
230 BridgeManagerClass=BridgeManagerClass,
231 search_paths=search_paths,
232 dataset_types=dataset_types,
233 )
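# Illustrative sketch: passing the optional ``dataset_types`` mapping, built
# here from a full ``Butler`` registry so the datastore can look up registry
# dataset-type definitions without a registry connection.  ``butler`` is a
# hypothetical full Butler; ``repo_config``, ``quantum`` and ``universe`` are
# as in the earlier sketch.
dataset_types = {dt.name: dt for dt in butler.registry.queryDatasetTypes()}
qbb = QuantumBackedButler.initialize(
    config=repo_config,
    quantum=quantum,
    dimensions=butler.dimensions,
    dataset_types=dataset_types,
)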
235 @classmethod
236 def from_predicted(
237 cls,
238 config: Config | ResourcePathExpression,
239 predicted_inputs: Iterable[DatasetId],
240 predicted_outputs: Iterable[DatasetId],
241 dimensions: DimensionUniverse,
242 datastore_records: Mapping[str, DatastoreRecordData],
243 filename: str = ":memory:",
244 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
245 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
246 search_paths: list[str] | None = None,
247 dataset_types: Mapping[str, DatasetType] | None = None,
248 ) -> QuantumBackedButler:
249 """Construct a new `QuantumBackedButler` from sets of input and output
250 dataset IDs.
252 Parameters
253 ----------
254 config : `Config` or `~lsst.resources.ResourcePathExpression`
255 A butler repository root, configuration filename, or configuration
256 instance.
257 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
258 Dataset IDs for datasets that can be read from this butler.
259 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
260 Dataset IDs for datasets that can be stored in this butler; must be
261 fully resolved.
262 dimensions : `DimensionUniverse`
263 Object managing all dimension definitions.
264 filename : `str`, optional
265 Name for the SQLite database that will back this butler; defaults
266 to an in-memory database.
267 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
268 Datastore records to import into a datastore.
269 OpaqueManagerClass : `type`, optional
270 A subclass of `OpaqueTableStorageManager` to use for datastore
271 opaque records. Default is a SQL-backed implementation.
272 BridgeManagerClass : `type`, optional
273 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
274 location records. Default is a SQL-backed implementation.
275 search_paths : `list` of `str`, optional
276 Additional search paths for butler configuration.
277 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
278 optional
279 Mapping of the dataset type name to its registry definition.
280 """
281 return cls._initialize(
282 config=config,
283 predicted_inputs=predicted_inputs,
284 predicted_outputs=predicted_outputs,
285 dimensions=dimensions,
286 filename=filename,
287 datastore_records=datastore_records,
288 OpaqueManagerClass=OpaqueManagerClass,
289 BridgeManagerClass=BridgeManagerClass,
290 search_paths=search_paths,
291 dataset_types=dataset_types,
292 )
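# Illustrative sketch: rebuilding a quantum-backed butler from previously
# saved dataset IDs instead of a ``Quantum``.  ``input_ids``, ``output_ids``
# and ``records`` are hypothetical values that would normally be read back
# from a serialized quantum graph or a prior export; ``records`` maps
# datastore name to `DatastoreRecordData`.
qbb = QuantumBackedButler.from_predicted(
    config=repo_config,
    predicted_inputs=input_ids,
    predicted_outputs=output_ids,
    dimensions=universe,
    datastore_records=records,
)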
294 @classmethod
295 def _initialize(
296 cls,
297 *,
298 config: Config | ResourcePathExpression,
299 predicted_inputs: Iterable[DatasetId],
300 predicted_outputs: Iterable[DatasetId],
301 dimensions: DimensionUniverse,
302 filename: str = ":memory:",
303 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
304 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
305 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
306 search_paths: list[str] | None = None,
307 dataset_types: Mapping[str, DatasetType] | None = None,
308 ) -> QuantumBackedButler:
309 """Initialize quantum-backed butler.
311 Internal method with common implementation used by `initialize` and
312 `from_predicted`.
314 Parameters
315 ----------
316 config : `Config` or `~lsst.resources.ResourcePathExpression`
317 A butler repository root, configuration filename, or configuration
318 instance.
319 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
320 Dataset IDs for datasets that can be read from this butler.
321 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
322 Dataset IDs for datasets that can be stored in this butler.
323 dimensions : `DimensionUniverse`
324 Object managing all dimension definitions.
325 filename : `str`, optional
326 Name for the SQLite database that will back this butler; defaults
327 to an in-memory database.
328 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
329 Datastore records to import into a datastore.
330 OpaqueManagerClass : `type`, optional
331 A subclass of `OpaqueTableStorageManager` to use for datastore
332 opaque records. Default is a SQL-backed implementation.
333 BridgeManagerClass : `type`, optional
334 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
335 location records. Default is a SQL-backed implementation.
336 search_paths : `list` of `str`, optional
337 Additional search paths for butler configuration.
338 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
339 Mapping of the dataset type name to its registry definition.
340 """
341 butler_config = ButlerConfig(config, searchPaths=search_paths)
342 butler_root = butler_config.get("root", butler_config.configDir)
343 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
344 with db.declareStaticTables(create=True) as context:
345 opaque_manager = OpaqueManagerClass.initialize(db, context)
346 bridge_manager = BridgeManagerClass.initialize(
347 db,
348 context,
349 opaque=opaque_manager,
350 # MyPy can tell it's a fake, but we know it shouldn't care.
351 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore
352 universe=dimensions,
353 )
354 # TODO: We need to inform `Datastore` here that it needs to support
355 # predictive reads; right now that's a configuration option, but after
356 # execution butler is retired it could just be a kwarg we pass here.
357 # For now just force this option as we cannot work without it.
358 butler_config["datastore", "trust_get_request"] = True
359 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
360 if datastore_records is not None:
361 datastore.import_records(datastore_records)
362 storageClasses = StorageClassFactory()
363 storageClasses.addFromConfig(butler_config)
364 return cls(
365 predicted_inputs,
366 predicted_outputs,
367 dimensions,
368 datastore,
369 storageClasses=storageClasses,
370 dataset_types=dataset_types,
371 )
373 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
374 """Return DatasetType defined in registry given dataset type name."""
375 return self._dataset_types.get(name)
377 def isWriteable(self) -> bool:
378 # Docstring inherited.
379 return True
381 # TODO: remove on DM-40067.
382 @deprecated(
383 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
384 " Please use Butler.get(). Will be removed after v26.0.",
385 version="v26.0",
386 category=FutureWarning,
387 )
388 def getDirect(
389 self,
390 ref: DatasetRef,
391 *,
392 parameters: dict[str, Any] | None = None,
393 storageClass: str | StorageClass | None = None,
394 ) -> Any:
395 # Docstring inherited.
396 return self.get(ref, parameters=parameters, storageClass=storageClass)
398 def get(
399 self,
400 ref: DatasetRef,
401 /,
402 *,
403 parameters: dict[str, Any] | None = None,
404 storageClass: StorageClass | str | None = None,
405 ) -> Any:
406 try:
407 obj = super().get(
408 ref,
409 parameters=parameters,
410 storageClass=storageClass,
411 )
412 except (LookupError, FileNotFoundError, OSError):
413 self._unavailable_inputs.add(ref.id)
414 raise
415 if ref.id in self._predicted_inputs:
416 # do this after delegating to super in case that raises.
417 self._actual_inputs.add(ref.id)
418 self._available_inputs.add(ref.id)
419 return obj
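# Illustrative sketch: reading a predicted input so that availability and
# actual use are recorded automatically; ``qbb`` and ``input_ref`` are
# hypothetical (the butler constructed above and one of the quantum's input
# refs).
try:
    obj = qbb.get(input_ref)
except FileNotFoundError:
    # ``input_ref`` is now in the "unavailable inputs" set; a task may treat
    # a missing optional input as non-fatal.
    obj = None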
421 # TODO: remove on DM-40067.
422 @deprecated(
423 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
424 "Please use Butler.getDeferred(). Will be removed after v26.0.",
425 version="v26.0",
426 category=FutureWarning,
427 )
428 def getDirectDeferred(
429 self,
430 ref: DatasetRef,
431 *,
432 parameters: dict[str, Any] | None = None,
433 storageClass: str | StorageClass | None = None,
434 ) -> DeferredDatasetHandle:
435 # Docstring inherited.
436 return self.getDeferred(ref, parameters=parameters, storageClass=storageClass)
438 def getDeferred(
439 self,
440 ref: DatasetRef,
441 /,
442 *,
443 parameters: dict[str, Any] | None = None,
444 storageClass: str | StorageClass | None = None,
445 ) -> DeferredDatasetHandle:
446 if ref.id in self._predicted_inputs:
447 # Unfortunately, we can't do this after the handle succeeds in
448 # loading, so it's conceivable here that we're marking an input
449 # as "actual" even when it's not even available.
450 self._actual_inputs.add(ref.id)
451 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)
453 def stored(self, ref: DatasetRef) -> bool:
454 # Docstring inherited.
455 stored = super().stored(ref)
456 if ref.id in self._predicted_inputs:
457 if stored:
458 self._available_inputs.add(ref.id)
459 else:
460 self._unavailable_inputs.add(ref.id)
461 return stored
463 def stored_many(
464 self,
465 refs: Iterable[DatasetRef],
466 ) -> dict[DatasetRef, bool]:
467 # Docstring inherited.
468 existence = super().stored_many(refs)
470 for ref, stored in existence.items():
471 if ref.id in self._predicted_inputs:
472 if stored:
473 self._available_inputs.add(ref.id)
474 else:
475 self._unavailable_inputs.add(ref.id)
476 return existence
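# Illustrative sketch: an execution harness checking every predicted input
# for existence before running the task, so the "available inputs" recorded
# in provenance are complete; ``qbb`` and ``predicted_input_refs`` are
# hypothetical.
existence = qbb.stored_many(predicted_input_refs)
missing = [ref for ref, is_stored in existence.items() if not is_stored]
if missing:
    raise FileNotFoundError(f"{len(missing)} predicted inputs are missing: {missing}")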
478 def markInputUnused(self, ref: DatasetRef) -> None:
479 # Docstring inherited.
480 self._actual_inputs.discard(ref.id)
482 @property
483 def dimensions(self) -> DimensionUniverse:
484 # Docstring inherited.
485 return self._dimensions
487 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
488 # Docstring inherited.
489 if ref.id not in self._predicted_outputs:
490 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
491 self._datastore.put(obj, ref)
492 self._actual_output_refs.add(ref)
493 return ref
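# Illustrative sketch: writing an output and retracting an input that turned
# out not to influence the result; ``qbb``, ``result``, ``output_ref`` and
# ``unused_ref`` are hypothetical.  Only refs predicted as outputs may be put.
qbb.put(result, output_ref)  # recorded as an actual output
qbb.markInputUnused(unused_ref)  # dropped from the "actual inputs" set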
495 def pruneDatasets(
496 self,
497 refs: Iterable[DatasetRef],
498 *,
499 disassociate: bool = True,
500 unstore: bool = False,
501 tags: Iterable[str] = (),
502 purge: bool = False,
503 ) -> None:
504 # docstring inherited from LimitedButler
506 if purge:
507 if not disassociate:
508 raise TypeError("Cannot pass purge=True without disassociate=True.")
509 if not unstore:
510 raise TypeError("Cannot pass purge=True without unstore=True.")
511 elif disassociate:
512 # No tagged collections for this butler.
513 raise TypeError("Cannot pass disassociate=True without purge=True.")
515 refs = list(refs)
517 # Pruning a component of a DatasetRef makes no sense.
518 for ref in refs:
519 if ref.datasetType.component():
520 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
522 if unstore:
523 self._datastore.trash(refs)
524 if purge:
525 for ref in refs:
526 # We only care about removing them from actual output refs.
527 self._actual_output_refs.discard(ref)
529 if unstore:
530 # Point of no return for removing artifacts
531 self._datastore.emptyTrash()
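# Illustrative sketch: cleaning up partially written outputs after a failure;
# ``qbb`` and ``failed_refs`` are hypothetical.  Because this butler has no
# tagged collections, the only valid combinations are unstore-only or a full
# purge.
qbb.pruneDatasets(failed_refs, disassociate=False, unstore=True, purge=False)
# ...or additionally forget them as actual outputs:
qbb.pruneDatasets(failed_refs, disassociate=True, unstore=True, purge=True)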
533 def extract_provenance_data(self) -> QuantumProvenanceData:
534 """Extract provenance information and datastore records from this
535 butler.
537 Returns
538 -------
539 provenance : `QuantumProvenanceData`
540 A serializable struct containing input/output dataset IDs and
541 datastore records. This assumes all dataset IDs are UUIDs (just to
542 make it easier for `pydantic` to reason about the struct's types);
543 the rest of this class makes no such assumption, but the approach
544 to processing in which it's useful effectively requires UUIDs
545 anyway.
547 Notes
548 -----
549 `QuantumBackedButler` records this provenance information when its
550 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
551 authors from having to worry about it while still recording very
552 detailed information. But it has two small weaknesses:
554 - Calling `getDirectDeferred` or `getDirect` is enough to mark a
555 dataset as an "actual input", which may mark some datasets that
556 aren't actually used. We rely on task authors to use
557 `markInputUnused` to address this.
559 - We assume that the execution system will call ``stored_many``
560 on all predicted inputs prior to execution, in order to populate the
561 "available inputs" set. This is what I envision
562 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
563 to use this class, but it feels fragile for this class to make such
564 a strong assumption about how it will be used, even if I can't think
565 of any other executor behavior that would make sense.
566 """
567 if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
568 _LOG.warning(
569 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle "
570 "was obtained), but did not actually exist. This task should be using markInputUnused "
571 "directly to clarify its provenance.",
572 self._actual_inputs & self._unavailable_inputs,
573 )
574 self._actual_inputs -= self._unavailable_inputs
575 checked_inputs = self._available_inputs | self._unavailable_inputs
576 if self._predicted_inputs != checked_inputs:
577 _LOG.warning(
578 "Execution harness did not check predicted inputs %s for existence; available inputs "
579 "recorded in provenance may be incomplete.",
580 self._predicted_inputs - checked_inputs,
581 )
582 datastore_records = self._datastore.export_records(self._actual_output_refs)
583 provenance_records = {
584 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
585 }
587 return QuantumProvenanceData(
588 predicted_inputs=self._predicted_inputs,
589 available_inputs=self._available_inputs,
590 actual_inputs=self._actual_inputs,
591 predicted_outputs=self._predicted_outputs,
592 actual_outputs={ref.id for ref in self._actual_output_refs},
593 datastore_records=provenance_records,
594 )
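# Illustrative sketch: the JSON route described in the class notes, writing
# the provenance struct to a per-quantum file after execution.  ``qbb`` is as
# above and ``provenance_path`` is hypothetical; ``model_dump_json`` is
# assumed to be available through the pydantic compatibility base class.
provenance = qbb.extract_provenance_data()
with open(provenance_path, "w") as stream:
    stream.write(provenance.model_dump_json())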
597class QuantumProvenanceData(_BaseModelCompat):
598 """A serializable struct for per-quantum provenance information and
599 datastore records.
601 Notes
602 -----
603 This class slightly duplicates information from the `Quantum` class itself
604 (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the
605 same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
606 assumes the original `Quantum` is also available to reconstruct the
607 complete provenance (e.g. by associating dataset IDs with data IDs,
608 dataset types, and `~CollectionType.RUN` names).
610 Note that the ``pydantic`` method ``parse_raw()`` does not work
611 correctly for this class; use the `direct` method instead.
612 """
614 # This class probably should have information about its execution
615 # environment (anything not controlled and recorded at the
616 # `~CollectionType.RUN` level, such as the compute node ID), but adding it
617 # now is out of scope for this prototype.
619 predicted_inputs: set[uuid.UUID]
620 """Unique IDs of datasets that were predicted as inputs to this quantum
621 when the QuantumGraph was built.
622 """
624 available_inputs: set[uuid.UUID]
625 """Unique IDs of input datasets that were actually present in the datastore
626 when this quantum was executed.
628 This is a subset of ``predicted_inputs``, with the difference generally
629 being datasets that were ``predicted_outputs`` but not ``actual_outputs`` of
630 some upstream task.
631 """
633 actual_inputs: set[uuid.UUID]
634 """Unique IDs of datasets that were actually used as inputs by this task.
636 This is a subset of ``available_inputs``.
638 Notes
639 -----
640 The criterion for marking an input as used is that rerunning the quantum
641 with only these ``actual_inputs`` available must yield identical outputs.
642 This means that (for example) even just using an input to help determine
643 an output rejection criterion and then rejecting it as an outlier qualifies
644 that input as actually used.
645 """
647 predicted_outputs: set[uuid.UUID]
648 """Unique IDs of datasets that were predicted as outputs of this quantum
649 when the QuantumGraph was built.
650 """
652 actual_outputs: set[uuid.UUID]
653 """Unique IDs of datasets that were actually written when this quantum
654 was executed.
655 """
657 datastore_records: dict[str, SerializedDatastoreRecordData]
658 """Datastore records indexed by datastore name."""
660 @staticmethod
661 def collect_and_transfer(
662 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
663 ) -> None:
664 """Transfer output datasets from multiple quanta to a more permanent
665 `Butler` repository.
667 Parameters
668 ----------
669 butler : `Butler`
670 Full butler representing the data repository to transfer datasets
671 to.
672 quanta : `~collections.abc.Iterable` [ `Quantum` ]
673 Iterable of `Quantum` objects that carry information about
674 predicted outputs. May be a single-pass iterator.
675 provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ]
676 Provenance and datastore data for each of the given quanta, in the
677 same order. May be a single-pass iterator.
679 Notes
680 -----
681 Input-output provenance data is not actually transferred yet, because
682 `Registry` has no place to store it.
684 This method probably works most efficiently if run on all quanta for a
685 single task label at once, because this will gather all datasets of
686 a particular type together into a single vectorized `Registry` import.
687 It should still behave correctly if run on smaller groups of quanta
688 or even quanta from multiple tasks.
690 Currently this method transfers datastore record data unchanged, with
691 no possibility of actually moving (e.g.) files. Datastores that are
692 present only in execution or only in the more permanent butler are
693 ignored.
694 """
695 grouped_refs = defaultdict(list)
696 summary_records: dict[str, DatastoreRecordData] = {}
697 for quantum, provenance_for_quantum in zip(quanta, provenance, strict=True):
698 quantum_refs_by_id = {
699 ref.id: ref
700 for ref in itertools.chain.from_iterable(quantum.outputs.values())
701 if ref.id in provenance_for_quantum.actual_outputs
702 }
703 for ref in quantum_refs_by_id.values():
704 grouped_refs[ref.datasetType, ref.run].append(ref)
706 # merge datastore records into a summary structure
707 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
708 quantum_records = DatastoreRecordData.from_simple(serialized_records)
709 if (records := summary_records.get(datastore_name)) is not None:
710 records.update(quantum_records)
711 else:
712 summary_records[datastore_name] = quantum_records
714 for refs in grouped_refs.values():
715 butler.registry._importDatasets(refs)
716 butler._datastore.import_records(summary_records)
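# Illustrative sketch: merging per-quantum outputs back into a permanent
# repository once all quanta (ideally grouped by task label) have run;
# ``full_butler``, ``quanta`` and ``provenance_list`` are hypothetical and
# must be in the same order.
QuantumProvenanceData.collect_and_transfer(
    butler=full_butler,
    quanta=quanta,
    provenance=provenance_list,
)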
718 @classmethod
719 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
720 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")
722 @classmethod
723 def direct(
724 cls,
725 *,
726 predicted_inputs: Iterable[str | uuid.UUID],
727 available_inputs: Iterable[str | uuid.UUID],
728 actual_inputs: Iterable[str | uuid.UUID],
729 predicted_outputs: Iterable[str | uuid.UUID],
730 actual_outputs: Iterable[str | uuid.UUID],
731 datastore_records: Mapping[str, Mapping],
732 ) -> QuantumProvenanceData:
733 """Construct an instance directly without validators.
735 This differs from the pydantic "construct" method in that the
736 arguments are explicitly what the model requires, and it will recurse
737 through members, constructing them from their corresponding `direct`
738 methods.
740 This method should only be called when the inputs are trusted.
741 """
743 def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]:
744 """Convert input UUIDs, which could be in string representation, to
745 a set of `UUID` instances.
746 """
747 return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}
749 data = cls.model_construct(
750 predicted_inputs=_to_uuid_set(predicted_inputs),
751 available_inputs=_to_uuid_set(available_inputs),
752 actual_inputs=_to_uuid_set(actual_inputs),
753 predicted_outputs=_to_uuid_set(predicted_outputs),
754 actual_outputs=_to_uuid_set(actual_outputs),
755 datastore_records={
756 key: SerializedDatastoreRecordData.direct(**records)
757 for key, records in datastore_records.items()
758 },
759 )
761 return data
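# Illustrative sketch: reading a per-quantum provenance file back with
# ``direct``, since ``parse_raw()`` is intentionally unsupported;
# ``provenance_path`` is hypothetical.
import json

with open(provenance_path) as stream:
    provenance = QuantumProvenanceData.direct(**json.load(stream))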