Coverage for python/lsst/daf/butler/_quantum_backed.py: 32%
184 statements
coverage.py v7.3.2, created at 2023-10-25 15:14 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any

from deprecated.sphinx import deprecated
from lsst.daf.butler._compat import _BaseModelCompat
from lsst.resources import ResourcePathExpression

from ._butlerConfig import ButlerConfig
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    SerializedDatastoreRecordData,
    StorageClass,
    StorageClassFactory,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)


class _DatasetRecordStorageManagerDatastoreConstructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec


class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be read from this butler.
    predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be stored in this butler.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy.  If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is).

    These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance.  I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it.  But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
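
    Examples
    --------
    A minimal usage sketch (the repository path, ``quantum``, ``dimensions``,
    and the dataset refs below are illustrative placeholders, not part of this
    module)::

        qbb = QuantumBackedButler.initialize(
            config="/path/to/repo",  # repo root or butler config file
            quantum=quantum,  # Quantum with resolved input/output refs
            dimensions=dimensions,  # the repository's DimensionUniverse
        )
        obj = qbb.get(input_ref)  # read a predicted input
        qbb.put(result, output_ref)  # store a predicted output
        provenance = qbb.extract_provenance_data()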
151 """

    def __init__(
        self,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore: Datastore,
        storageClasses: StorageClassFactory,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ):
        self._dimensions = dimensions
        self._predicted_inputs = set(predicted_inputs)
        self._predicted_outputs = set(predicted_outputs)
        self._available_inputs: set[DatasetId] = set()
        self._unavailable_inputs: set[DatasetId] = set()
        self._actual_inputs: set[DatasetId] = set()
        self._actual_output_refs: set[DatasetRef] = set()
        self._datastore = datastore
        self.storageClasses = storageClasses
        self._dataset_types: Mapping[str, DatasetType] = {}
        if dataset_types is not None:
            self._dataset_types = dataset_types
        self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)

    @classmethod
    def initialize(
        cls,
        config: Config | ResourcePathExpression,
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: list[str] | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler.  This must have resolved `DatasetRef` instances
            for all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records.  Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for
            datastore location records.  Default is a SQL-backed
            implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
                optional
            Mapping of the dataset type name to its registry definition.
        """
        predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
        predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
        predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=quantum.datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def from_predicted(
        cls,
        config: Config | ResourcePathExpression,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore_records: Mapping[str, DatastoreRecordData],
        filename: str = ":memory:",
        OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: list[str] | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from sets of input and output
        dataset IDs.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler; these
            must be fully resolved.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records.  Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for
            datastore location records.  Default is a SQL-backed
            implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
                optional
            Mapping of the dataset type name to its registry definition.
        """
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def _initialize(
        cls,
        *,
        config: Config | ResourcePathExpression,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
        OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: list[str] | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Initialize a quantum-backed butler.

        Internal method with the common implementation used by `initialize`
        and `from_predicted`.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records.  Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for
            datastore location records.  Default is a SQL-backed
            implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
            Mapping of the dataset type name to its registry definition.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        butler_root = butler_config.get("root", butler_config.configDir)
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        if datastore_records is not None:
            datastore.import_records(datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(
            predicted_inputs,
            predicted_outputs,
            dimensions,
            datastore,
            storageClasses=storageClasses,
            dataset_types=dataset_types,
        )

    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        return self._dataset_types.get(name)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    # TODO: remove on DM-40067.
    @deprecated(
        reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
        " Please use Butler.get(). Will be removed after v26.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirect(
        self,
        ref: DatasetRef,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> Any:
        # Docstring inherited.
        return self.get(ref, parameters=parameters, storageClass=storageClass)

    def get(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        try:
            obj = super().get(
                ref,
                parameters=parameters,
                storageClass=storageClass,
            )
        except (LookupError, FileNotFoundError, OSError):
            self._unavailable_inputs.add(ref.id)
            raise
        if ref.id in self._predicted_inputs:
            # Do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    # TODO: remove on DM-40067.
    @deprecated(
        reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
        "Please use Butler.getDeferred(). Will be removed after v26.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirectDeferred(
        self,
        ref: DatasetRef,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        return self.getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def getDeferred(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def stored(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        stored = super().stored(ref)
        if ref.id in self._predicted_inputs:
            if stored:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return stored

    def stored_many(
        self,
        refs: Iterable[DatasetRef],
    ) -> dict[DatasetRef, bool]:
        # Docstring inherited.
        existence = super().stored_many(refs)

        for ref, stored in existence.items():
            if ref.id in self._predicted_inputs:
                if stored:
                    self._available_inputs.add(ref.id)
                else:
                    self._unavailable_inputs.add(ref.id)
        return existence

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.id)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self._datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def pruneDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        disassociate: bool = True,
        unstore: bool = False,
        tags: Iterable[str] = (),
        purge: bool = False,
    ) -> None:
        # Docstring inherited from LimitedButler.

        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            # No tagged collections for this butler.
            raise TypeError("Cannot pass disassociate=True without purge=True.")

        refs = list(refs)

        # Pruning a component of a DatasetRef makes no sense.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")

        if unstore:
            self._datastore.trash(refs)
        if purge:
            for ref in refs:
                # We only care about removing them from actual output refs.
                self._actual_output_refs.discard(ref)

        if unstore:
            # Point of no return for removing artifacts.
            self._datastore.emptyTrash()

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records.  This assumes all dataset IDs are UUIDs (just
            to make it easier for `pydantic` to reason about the struct's
            types); the rest of this class makes no such assumption, but the
            approach to processing in which it's useful effectively requires
            UUIDs anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information.  But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used.  We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set.  This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
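
        Examples
        --------
        A sketch of the intended end-of-quantum sequence (``qbb`` and
        ``unused_refs`` are illustrative names, and the output path is
        arbitrary; the JSON export uses standard ``pydantic`` methods)::

            for ref in unused_refs:
                qbb.markInputUnused(ref)
            provenance = qbb.extract_provenance_data()
            with open("provenance.json", "w") as stream:
                stream.write(provenance.json())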
564 """
        if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
            _LOG.warning(
                "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle "
                "was obtained), but did not actually exist.  This task should be using "
                "markInputUnused directly to clarify its provenance.",
                self._actual_inputs & self._unavailable_inputs,
            )
            self._actual_inputs -= self._unavailable_inputs
        checked_inputs = self._available_inputs | self._unavailable_inputs
        if self._predicted_inputs != checked_inputs:
            _LOG.warning(
                "Execution harness did not check predicted inputs %s for existence; available inputs "
                "recorded in provenance may be incomplete.",
                self._predicted_inputs - checked_inputs,
            )
        datastore_records = self._datastore.export_records(self._actual_output_refs)
        provenance_records = {
            datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
        }

        return QuantumProvenanceData(
            predicted_inputs=self._predicted_inputs,
            available_inputs=self._available_inputs,
            actual_inputs=self._actual_inputs,
            predicted_outputs=self._predicted_outputs,
            actual_outputs={ref.id for ref in self._actual_output_refs},
            datastore_records=provenance_records,
        )


class QuantumProvenanceData(_BaseModelCompat):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the
    same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall
    it assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` method ``parse_raw()`` is not going to work
    correctly for this class; use the `direct` method instead.
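
    Examples
    --------
    A sketch of rehydrating an instance from previously saved JSON without
    pydantic validation (``json_text`` is an assumed variable holding this
    model's JSON export)::

        import json

        provenance = QuantumProvenanceData.direct(**json.loads(json_text))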
610 """

    # This class probably should have information about its execution
    # environment (anything not controlled and recorded at the
    # `~CollectionType.RUN` level, such as the compute node ID), but adding it
    # now is out of scope for this prototype.

    predicted_inputs: set[uuid.UUID]
    """Unique IDs of datasets that were predicted as inputs to this quantum
    when the QuantumGraph was built.
    """

    available_inputs: set[uuid.UUID]
    """Unique IDs of input datasets that were actually present in the
    datastore when this quantum was executed.

    This is a subset of ``predicted_inputs``, with the difference generally
    being datasets that were ``predicted_outputs`` but not ``actual_outputs``
    of some upstream task.
    """

    actual_inputs: set[uuid.UUID]
    """Unique IDs of datasets that were actually used as inputs by this task.

    This is a subset of ``available_inputs``.

    Notes
    -----
    The criterion for marking an input as used is that rerunning the quantum
    with only these ``actual_inputs`` available must yield identical outputs.
    This means that (for example) even just using an input to help determine
    an output rejection criterion and then rejecting it as an outlier
    qualifies that input as actually used.
    """

    predicted_outputs: set[uuid.UUID]
    """Unique IDs of datasets that were predicted as outputs of this quantum
    when the QuantumGraph was built.
    """

    actual_outputs: set[uuid.UUID]
    """Unique IDs of datasets that were actually written when this quantum
    was executed.
    """

    datastore_records: dict[str, SerializedDatastoreRecordData]
    """Datastore records indexed by datastore name."""

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `~collections.abc.Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs.  May be a single-pass iterator.
        provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order.  May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for
        a single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files.  Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
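
        Examples
        --------
        A sketch of the intended calling pattern (``butler``, ``quanta``, and
        ``provenance_texts`` are assumed to exist in the caller's context,
        with each text being a JSON export of `QuantumProvenanceData`)::

            import json

            provenance = [
                QuantumProvenanceData.direct(**json.loads(text)) for text in provenance_texts
            ]
            QuantumProvenanceData.collect_and_transfer(butler, quanta, provenance)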
692 """
        grouped_refs = defaultdict(list)
        summary_records: dict[str, DatastoreRecordData] = {}
        for quantum, provenance_for_quantum in zip(quanta, provenance, strict=True):
            quantum_refs_by_id = {
                ref.id: ref
                for ref in itertools.chain.from_iterable(quantum.outputs.values())
                if ref.id in provenance_for_quantum.actual_outputs
            }
            for ref in quantum_refs_by_id.values():
                grouped_refs[ref.datasetType, ref.run].append(ref)

            # Merge datastore records into a summary structure.
            for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
                quantum_records = DatastoreRecordData.from_simple(serialized_records)
                if (records := summary_records.get(datastore_name)) is not None:
                    records.update(quantum_records)
                else:
                    summary_records[datastore_name] = quantum_records

        for refs in grouped_refs.values():
            butler.registry._importDatasets(refs)
        butler._datastore.import_records(summary_records)

    @classmethod
    def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
        raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")

    @classmethod
    def direct(
        cls,
        *,
        predicted_inputs: Iterable[str | uuid.UUID],
        available_inputs: Iterable[str | uuid.UUID],
        actual_inputs: Iterable[str | uuid.UUID],
        predicted_outputs: Iterable[str | uuid.UUID],
        actual_outputs: Iterable[str | uuid.UUID],
        datastore_records: Mapping[str, Mapping],
    ) -> QuantumProvenanceData:
        """Construct an instance directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """

        def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]:
            """Convert input UUIDs, which could be in string representation,
            to a set of `UUID` instances.
            """
            return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}

        data = cls.model_construct(
            predicted_inputs=_to_uuid_set(predicted_inputs),
            available_inputs=_to_uuid_set(available_inputs),
            actual_inputs=_to_uuid_set(actual_inputs),
            predicted_outputs=_to_uuid_set(predicted_outputs),
            actual_outputs=_to_uuid_set(actual_outputs),
            datastore_records={
                key: SerializedDatastoreRecordData.direct(**records)
                for key, records in datastore_records.items()
            },
        )

        return data