Coverage for python/lsst/daf/butler/_quantum_backed.py: 26% (187 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union

from deprecated.sphinx import deprecated
from lsst.resources import ResourcePathExpression
from pydantic import BaseModel

from ._butlerConfig import ButlerConfig
from ._butlerRepoIndex import ButlerRepoIndex
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    SerializedDatastoreRecordData,
    StorageClass,
    StorageClassFactory,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)


class _DatasetRecordStorageManagerDatastoreConstructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec


class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be read from this butler.
    predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be stored in this butler.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.
    dataset_types : `Mapping` [`str`, `DatasetType`], optional
        Mapping of the dataset type name to its registry definition.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is).

    These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
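
    A rough usage sketch of the JSON route (``butler_config``, ``quantum``,
    ``universe``, ``input_ref``, ``result``, and ``output_ref`` are
    placeholders supplied by the caller: a repository configuration, a
    resolved `Quantum`, its `DimensionUniverse`, and the task's input ref,
    output object, and output ref):

    .. code-block:: python

        # Build a butler backed only by the quantum's predicted datasets.
        qbb = QuantumBackedButler.initialize(
            config=butler_config, quantum=quantum, dimensions=universe
        )
        data = qbb.get(input_ref)  # Read a predicted input.
        qbb.put(result, output_ref)  # Write a predicted output.
        # Capture datastore records and provenance for later transfer.
        provenance = qbb.extract_provenance_data()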
151 """

    def __init__(
        self,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore: Datastore,
        storageClasses: StorageClassFactory,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ):
        self._dimensions = dimensions
        self._predicted_inputs = set(predicted_inputs)
        self._predicted_outputs = set(predicted_outputs)
        self._available_inputs: Set[DatasetId] = set()
        self._unavailable_inputs: Set[DatasetId] = set()
        self._actual_inputs: Set[DatasetId] = set()
        self._actual_output_refs: Set[DatasetRef] = set()
        self.datastore = datastore
        self.storageClasses = storageClasses
        self._dataset_types: Mapping[str, DatasetType] = {}
        if dataset_types is not None:
            self._dataset_types = dataset_types
        self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)

    @classmethod
    def initialize(
        cls,
        config: Union[Config, ResourcePathExpression],
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
        predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
        predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=quantum.datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def from_predicted(
        cls,
        config: Union[Config, ResourcePathExpression],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore_records: Mapping[str, DatastoreRecordData],
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from sets of input and output
        dataset IDs.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler; these
            must be fully resolved.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def _initialize(
        cls,
        *,
        config: Union[Config, ResourcePathExpression],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Internal method with common implementation used by `initialize`
        and `from_predicted`.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        if isinstance(config, str):
            config = ButlerRepoIndex.get_repo_uri(config, True)
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        if "root" in butler_config:
            butler_root = butler_config["root"]
        else:
            butler_root = butler_config.configDir
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        if datastore_records is not None:
            datastore.import_records(datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(
            predicted_inputs,
            predicted_outputs,
            dimensions,
            datastore,
            storageClasses=storageClasses,
            dataset_types=dataset_types,
        )

    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        return self._dataset_types.get(name)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    @deprecated(
        reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
        " Please use Butler.get(). Will be removed after v27.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirect(
        self,
        ref: DatasetRef,
        *,
        parameters: Optional[Dict[str, Any]] = None,
        storageClass: str | StorageClass | None = None,
    ) -> Any:
        # Docstring inherited.
        return self.get(ref, parameters=parameters, storageClass=storageClass)

    def get(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        try:
            obj = super().get(
                ref,
                parameters=parameters,
                storageClass=storageClass,
            )
        except (LookupError, FileNotFoundError, IOError):
            self._unavailable_inputs.add(ref.id)
            raise
        if ref.id in self._predicted_inputs:
            # do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    @deprecated(
        reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
        "Please use Butler.getDeferred(). Will be removed after v27.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirectDeferred(
        self,
        ref: DatasetRef,
        *,
        parameters: Union[dict, None] = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        return self.getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def getDeferred(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def datasetExistsDirect(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        exists = super().datasetExistsDirect(ref)
        if ref.id in self._predicted_inputs:
            if exists:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return exists

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.id)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self.datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def pruneDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        disassociate: bool = True,
        unstore: bool = False,
        tags: Iterable[str] = (),
        purge: bool = False,
    ) -> None:
        # Docstring inherited from LimitedButler.
        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            # No tagged collections for this butler.
            raise TypeError("Cannot pass disassociate=True without purge=True.")

        refs = list(refs)

        # Pruning a component of a DatasetRef makes no sense.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")

        if unstore:
            self.datastore.trash(refs)
        if purge:
            for ref in refs:
                # We only care about removing them from actual output refs.
                self._actual_output_refs.discard(ref)

        if unstore:
            # Point of no return for removing artifacts.
            self.datastore.emptyTrash()

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
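
        A sketch of persisting the returned struct as JSON with ``pydantic``
        (``qbb`` stands in for this butler and ``provenance_path`` for a
        caller-chosen output path):

        .. code-block:: python

            provenance = qbb.extract_provenance_data()
            with open(provenance_path, "w") as stream:
                stream.write(provenance.json())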
548 """
        if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
            _LOG.warning(
                "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle "
                "was obtained), but did not actually exist. This task should be using "
                "markInputUnused directly to clarify its provenance.",
                self._actual_inputs & self._unavailable_inputs,
            )
            self._actual_inputs -= self._unavailable_inputs
        checked_inputs = self._available_inputs | self._unavailable_inputs
        if not self._predicted_inputs == checked_inputs:
            _LOG.warning(
                "Execution harness did not check predicted inputs %s for existence; available inputs "
                "recorded in provenance may be incomplete.",
                self._predicted_inputs - checked_inputs,
            )
        datastore_records = self.datastore.export_records(self._actual_output_refs)
        provenance_records = {
            datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
        }

        return QuantumProvenanceData(
            predicted_inputs=self._predicted_inputs,
            available_inputs=self._available_inputs,
            actual_inputs=self._actual_inputs,
            predicted_outputs=self._predicted_outputs,
            actual_outputs={ref.id for ref in self._actual_output_refs},
            datastore_records=provenance_records,
        )


class QuantumProvenanceData(BaseModel):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the `predicted_inputs` and `predicted_outputs` sets should have the same
    IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` method ``parse_raw()`` is not going to work
    correctly for this class; use the `direct` method instead.
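
    A rough sketch of reconstructing an instance from a JSON file previously
    written with ``pydantic`` serialization methods, assuming the file was
    produced from a `QuantumProvenanceData` instance (``provenance_path`` is a
    placeholder for wherever the caller saved it):

    .. code-block:: python

        import json

        with open(provenance_path) as stream:
            provenance = QuantumProvenanceData.direct(**json.load(stream))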
594 """

    # This class probably should have information about its execution
    # environment (anything not controlled and recorded at the
    # `~CollectionType.RUN` level, such as the compute node ID), but adding it
    # now is out of scope for this prototype.

    predicted_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as inputs to this quantum
    when the QuantumGraph was built.
    """

    available_inputs: Set[uuid.UUID]
    """Unique IDs of input datasets that were actually present in the
    datastore when this quantum was executed.

    This is a subset of `predicted_inputs`, with the difference generally
    being datasets that were `predicted_outputs` but not `actual_outputs` of
    some upstream task.
    """

    actual_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually used as inputs by this task.

    This is a subset of `available_inputs`.

    Notes
    -----
    The criterion for marking an input as used is that rerunning the quantum
    with only these `actual_inputs` available must yield identical outputs.
    This means that (for example) even just using an input to help determine
    an output rejection criteria and then rejecting it as an outlier qualifies
    that input as actually used.
    """

    predicted_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as outputs of this quantum
    when the QuantumGraph was built.
    """

    actual_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually written when this quantum
    was executed.
    """

    datastore_records: Dict[str, SerializedDatastoreRecordData]
    """Datastore records indexed by datastore name."""

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
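
        A sketch of the expected calling pattern (``full_butler``, ``quanta``,
        and ``quantum_butlers`` are placeholders for a writeable `Butler`, the
        executed quanta, and their corresponding `QuantumBackedButler`
        instances, all supplied by the execution harness):

        .. code-block:: python

            provenance = [qbb.extract_provenance_data() for qbb in quantum_butlers]
            QuantumProvenanceData.collect_and_transfer(full_butler, quanta, provenance)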
676 """
        grouped_refs = defaultdict(list)
        summary_records: Dict[str, DatastoreRecordData] = {}
        for quantum, provenance_for_quantum in zip(quanta, provenance):
            quantum_refs_by_id = {
                ref.id: ref
                for ref in itertools.chain.from_iterable(quantum.outputs.values())
                if ref.id in provenance_for_quantum.actual_outputs
            }
            for ref in quantum_refs_by_id.values():
                grouped_refs[ref.datasetType, ref.run].append(ref)

            # merge datastore records into a summary structure
            for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
                quantum_records = DatastoreRecordData.from_simple(serialized_records)
                if (records := summary_records.get(datastore_name)) is not None:
                    records.update(quantum_records)
                else:
                    summary_records[datastore_name] = quantum_records

        for refs in grouped_refs.values():
            butler.registry._importDatasets(refs)
        butler.datastore.import_records(summary_records)

    @classmethod
    def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
        raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")

    @classmethod
    def direct(
        cls,
        *,
        predicted_inputs: Iterable[Union[str, uuid.UUID]],
        available_inputs: Iterable[Union[str, uuid.UUID]],
        actual_inputs: Iterable[Union[str, uuid.UUID]],
        predicted_outputs: Iterable[Union[str, uuid.UUID]],
        actual_outputs: Iterable[Union[str, uuid.UUID]],
        datastore_records: Mapping[str, Mapping],
    ) -> QuantumProvenanceData:
        """Construct an instance directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """

        def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]:
            """Convert input UUIDs, which could be in string representation,
            to a set of `UUID` instances.
            """
            return set(uuid.UUID(id) if isinstance(id, str) else id for id in uuids)

        data = QuantumProvenanceData.__new__(cls)
        setter = object.__setattr__
        setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
        setter(data, "available_inputs", _to_uuid_set(available_inputs))
        setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
        setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
        setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
        setter(
            data,
            "datastore_records",
            {
                key: SerializedDatastoreRecordData.direct(**records)
                for key, records in datastore_records.items()
            },
        )
        return data