Coverage for python/lsst/daf/butler/_quantum_backed.py: 32%
196 statements
coverage.py v7.2.7, created at 2023-07-14 19:21 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("QuantumBackedButler", "QuantumProvenanceData")
26import itertools
27import logging
28import uuid
29from collections import defaultdict
30from collections.abc import Iterable, Mapping
31from typing import TYPE_CHECKING, Any
33from deprecated.sphinx import deprecated
34from lsst.resources import ResourcePathExpression
36try:
37 from pydantic.v1 import BaseModel
38except ModuleNotFoundError:
39 from pydantic import BaseModel # type: ignore
41from ._butlerConfig import ButlerConfig
42from ._deferredDatasetHandle import DeferredDatasetHandle
43from ._limited_butler import LimitedButler
44from .core import (
45 Config,
46 DatasetId,
47 DatasetRef,
48 DatasetType,
49 Datastore,
50 DatastoreRecordData,
51 DimensionUniverse,
52 Quantum,
53 SerializedDatastoreRecordData,
54 StorageClass,
55 StorageClassFactory,
56 ddl,
57)
58from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
59from .registry.databases.sqlite import SqliteDatabase
60from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
61from .registry.opaque import ByNameOpaqueTableStorageManager
63if TYPE_CHECKING:
64 from ._butler import Butler
66_LOG = logging.getLogger(__name__)
69class _DatasetRecordStorageManagerDatastoreConstructionMimic:
70 """A partial implementation of `DatasetRecordStorageManager` that exists
71 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
72 to be constructed without a full `Registry`.
74 Notes
75 -----
76 The interface implemented by this class should probably be its own ABC,
77 and that ABC should probably be used in the definition of
78 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
79 changes minimal.
80 """
82 @classmethod
83 def getIdColumnType(cls) -> type:
84 # Docstring inherited.
85 return ddl.GUID
87 @classmethod
88 def addDatasetForeignKey(
89 cls,
90 tableSpec: ddl.TableSpec,
91 *,
92 name: str = "dataset",
93 constraint: bool = True,
94 onDelete: str | None = None,
95 **kwargs: Any,
96 ) -> ddl.FieldSpec:
97 # Docstring inherited.
98 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
99 tableSpec.fields.add(idFieldSpec)
100 return idFieldSpec
103class QuantumBackedButler(LimitedButler):
104 """An implementation of `LimitedButler` intended to back execution of a
105 single `Quantum`.
107 Parameters
108 ----------
109 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
110 Dataset IDs for datasets that can be read from this butler.
111 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
112 Dataset IDs for datasets that can be stored in this butler.
113 dimensions : `DimensionUniverse`
114 Object managing all dimension definitions.
115 datastore : `Datastore`
116 Datastore to use for all dataset I/O and existence checks.
117 storageClasses : `StorageClassFactory`
118 Object managing all storage class definitions.
120 Notes
121 -----
122 Most callers should use the `initialize` `classmethod` to construct new
123 instances instead of calling the constructor directly.
125 `QuantumBackedButler` uses a SQLite database internally, in order to reuse
126 existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
127 implementations that rely on SQLAlchemy. If implementations are added in the
128 future that don't rely on SQLAlchemy, it should be possible to swap them
129 in by overriding the type arguments to `initialize` (though at present,
130 `QuantumBackedButler` would still create at least an in-memory SQLite
131 database that would then go unused).
133 We imagine `QuantumBackedButler` being used during (at least) batch
134 execution to capture `Datastore` records and save them to per-quantum
135 files, which are also a convenient place to store provenance for eventual
136 upload to a SQL-backed `Registry` (once `Registry` has tables to store
137 provenance, that is).
138 These per-quantum files can be written in two ways:
140 - The SQLite file used internally by `QuantumBackedButler` can be used
141 directly by customizing the ``filename`` argument to ``initialize``, and
142 then transferring that file to the object store after execution completes
143 (or fails; a ``try/finally`` pattern probably makes sense here).
145 - A JSON or YAML file can be written by calling `extract_provenance_data`,
146 and using ``pydantic`` methods to write the returned
147 `QuantumProvenanceData` to a file.
149 Note that at present, the SQLite file only contains datastore records, not
150 provenance, but that should be easy to address (if desired) after we
151 actually design a `Registry` schema for provenance. I also suspect that
152 we'll want to explicitly close the SQLite file somehow before trying to
153 transfer it. But I'm guessing we'd prefer to write the per-quantum files
154 as JSON anyway.
155 """
157 def __init__(
158 self,
159 predicted_inputs: Iterable[DatasetId],
160 predicted_outputs: Iterable[DatasetId],
161 dimensions: DimensionUniverse,
162 datastore: Datastore,
163 storageClasses: StorageClassFactory,
164 dataset_types: Mapping[str, DatasetType] | None = None,
165 ):
166 self._dimensions = dimensions
167 self._predicted_inputs = set(predicted_inputs)
168 self._predicted_outputs = set(predicted_outputs)
169 self._available_inputs: set[DatasetId] = set()
170 self._unavailable_inputs: set[DatasetId] = set()
171 self._actual_inputs: set[DatasetId] = set()
172 self._actual_output_refs: set[DatasetRef] = set()
173 self._datastore = datastore
174 self.storageClasses = storageClasses
175 self._dataset_types: Mapping[str, DatasetType] = {}
176 if dataset_types is not None:
177 self._dataset_types = dataset_types
178 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
180 @classmethod
181 def initialize(
182 cls,
183 config: Config | ResourcePathExpression,
184 quantum: Quantum,
185 dimensions: DimensionUniverse,
186 filename: str = ":memory:",
187 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
188 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
189 search_paths: list[str] | None = None,
190 dataset_types: Mapping[str, DatasetType] | None = None,
191 ) -> QuantumBackedButler:
192 """Construct a new `QuantumBackedButler` from repository configuration
193 and helper types.
195 Parameters
196 ----------
197 config : `Config` or `~lsst.resources.ResourcePathExpression`
198 A butler repository root, configuration filename, or configuration
199 instance.
200 quantum : `Quantum`
201 Object describing the predicted input and output datasets relevant
202 to this butler. This must have resolved `DatasetRef` instances for
203 all inputs and outputs.
204 dimensions : `DimensionUniverse`
205 Object managing all dimension definitions.
206 filename : `str`, optional
207 Name for the SQLite database that will back this butler; defaults
208 to an in-memory database.
209 OpaqueManagerClass : `type`, optional
210 A subclass of `OpaqueTableStorageManager` to use for datastore
211 opaque records. Default is a SQL-backed implementation.
212 BridgeManagerClass : `type`, optional
213 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
214 location records. Default is a SQL-backed implementation.
215 search_paths : `list` of `str`, optional
216 Additional search paths for butler configuration.
217 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
218 optional
219 Mapping of the dataset type name to its registry definition.
220 """
221 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
222 predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
223 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
224 return cls._initialize(
225 config=config,
226 predicted_inputs=predicted_inputs,
227 predicted_outputs=predicted_outputs,
228 dimensions=dimensions,
229 filename=filename,
230 datastore_records=quantum.datastore_records,
231 OpaqueManagerClass=OpaqueManagerClass,
232 BridgeManagerClass=BridgeManagerClass,
233 search_paths=search_paths,
234 dataset_types=dataset_types,
235 )
237 @classmethod
238 def from_predicted(
239 cls,
240 config: Config | ResourcePathExpression,
241 predicted_inputs: Iterable[DatasetId],
242 predicted_outputs: Iterable[DatasetId],
243 dimensions: DimensionUniverse,
244 datastore_records: Mapping[str, DatastoreRecordData],
245 filename: str = ":memory:",
246 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
247 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
248 search_paths: list[str] | None = None,
249 dataset_types: Mapping[str, DatasetType] | None = None,
250 ) -> QuantumBackedButler:
251 """Construct a new `QuantumBackedButler` from sets of input and output
252 dataset IDs.
254 Parameters
255 ----------
256 config : `Config` or `~lsst.resources.ResourcePathExpression`
257 A butler repository root, configuration filename, or configuration
258 instance.
259 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
260 Dataset IDs for datasets that can be read from this butler.
261 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
262 Dataset IDs for datasets that can be stored in this butler; must be
263 fully resolved.
264 dimensions : `DimensionUniverse`
265 Object managing all dimension definitions.
266 filename : `str`, optional
267 Name for the SQLite database that will back this butler; defaults
268 to an in-memory database.
269 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
270 Datastore records to import into a datastore.
271 OpaqueManagerClass : `type`, optional
272 A subclass of `OpaqueTableStorageManager` to use for datastore
273 opaque records. Default is a SQL-backed implementation.
274 BridgeManagerClass : `type`, optional
275 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
276 location records. Default is a SQL-backed implementation.
277 search_paths : `list` of `str`, optional
278 Additional search paths for butler configuration.
279 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
280 optional
281 Mapping of the dataset type name to its registry definition.
282 """
283 return cls._initialize(
284 config=config,
285 predicted_inputs=predicted_inputs,
286 predicted_outputs=predicted_outputs,
287 dimensions=dimensions,
288 filename=filename,
289 datastore_records=datastore_records,
290 OpaqueManagerClass=OpaqueManagerClass,
291 BridgeManagerClass=BridgeManagerClass,
292 search_paths=search_paths,
293 dataset_types=dataset_types,
294 )
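    # A sketch of the same construction from explicit dataset IDs rather than a
    # `Quantum` (hedged: ``config_uri``, ``input_refs``, ``output_refs``,
    # ``universe`` and ``records`` are hypothetical values taken from a
    # serialized quantum graph or similar source):
    #
    #     qbb = QuantumBackedButler.from_predicted(
    #         config=config_uri,
    #         predicted_inputs=[ref.id for ref in input_refs],
    #         predicted_outputs=[ref.id for ref in output_refs],
    #         dimensions=universe,
    #         datastore_records=records,
    #     )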
296 @classmethod
297 def _initialize(
298 cls,
299 *,
300 config: Config | ResourcePathExpression,
301 predicted_inputs: Iterable[DatasetId],
302 predicted_outputs: Iterable[DatasetId],
303 dimensions: DimensionUniverse,
304 filename: str = ":memory:",
305 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
306 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
307 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
308 search_paths: list[str] | None = None,
309 dataset_types: Mapping[str, DatasetType] | None = None,
310 ) -> QuantumBackedButler:
311 """Initialize quantum-backed butler.
313 Internal method with common implementation used by `initialize` and
314 `from_predicted`.
316 Parameters
317 ----------
318 config : `Config` or `~lsst.resources.ResourcePathExpression`
319 A butler repository root, configuration filename, or configuration
320 instance.
321 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
322 Dataset IDs for datasets that can be read from this butler.
323 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
324 Dataset IDs for datasets that can be stored in this butler.
325 dimensions : `DimensionUniverse`
326 Object managing all dimension definitions.
327 filename : `str`, optional
328 Name for the SQLite database that will back this butler; defaults
329 to an in-memory database.
330 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
331 Datastore records to import into a datastore.
332 OpaqueManagerClass : `type`, optional
333 A subclass of `OpaqueTableStorageManager` to use for datastore
334 opaque records. Default is a SQL-backed implementation.
335 BridgeManagerClass : `type`, optional
336 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
337 location records. Default is a SQL-backed implementation.
338 search_paths : `list` of `str`, optional
339 Additional search paths for butler configuration.
340 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
341 Mapping of the dataset type name to its registry definition.
342 """
343 butler_config = ButlerConfig(config, searchPaths=search_paths)
344 if "root" in butler_config:
345 butler_root = butler_config["root"]
346 else:
347 butler_root = butler_config.configDir
348 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
349 with db.declareStaticTables(create=True) as context:
350 opaque_manager = OpaqueManagerClass.initialize(db, context)
351 bridge_manager = BridgeManagerClass.initialize(
352 db,
353 context,
354 opaque=opaque_manager,
355 # MyPy can tell it's a fake, but we know it shouldn't care.
356 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore
357 universe=dimensions,
358 )
359 # TODO: We need to inform `Datastore` here that it needs to support
360 # predictive reads; right now that's a configuration option, but after
361 # execution butler is retired it could just be a kwarg we pass here.
362 # For now just force this option as we cannot work without it.
363 butler_config["datastore", "trust_get_request"] = True
364 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
365 if datastore_records is not None:
366 datastore.import_records(datastore_records)
367 storageClasses = StorageClassFactory()
368 storageClasses.addFromConfig(butler_config)
369 return cls(
370 predicted_inputs,
371 predicted_outputs,
372 dimensions,
373 datastore,
374 storageClasses=storageClasses,
375 dataset_types=dataset_types,
376 )
378 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
379 """Return DatasetType defined in registry given dataset type name."""
380 return self._dataset_types.get(name)
382 def isWriteable(self) -> bool:
383 # Docstring inherited.
384 return True
386 @deprecated(
387 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
388 " Please use Butler.get(). Will be removed after v27.0.",
389 version="v26.0",
390 category=FutureWarning,
391 )
392 def getDirect(
393 self,
394 ref: DatasetRef,
395 *,
396 parameters: dict[str, Any] | None = None,
397 storageClass: str | StorageClass | None = None,
398 ) -> Any:
399 # Docstring inherited.
400 return self.get(ref, parameters=parameters, storageClass=storageClass)
402 def get(
403 self,
404 ref: DatasetRef,
405 /,
406 *,
407 parameters: dict[str, Any] | None = None,
408 storageClass: StorageClass | str | None = None,
409 ) -> Any:
410 try:
411 obj = super().get(
412 ref,
413 parameters=parameters,
414 storageClass=storageClass,
415 )
416 except (LookupError, FileNotFoundError, OSError):
417 self._unavailable_inputs.add(ref.id)
418 raise
419 if ref.id in self._predicted_inputs:
420 # do this after delegating to super in case that raises.
421 self._actual_inputs.add(ref.id)
422 self._available_inputs.add(ref.id)
423 return obj
425 @deprecated(
426 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
427 "Please use Butler.getDeferred(). Will be removed after v27.0.",
428 version="v26.0",
429 category=FutureWarning,
430 )
431 def getDirectDeferred(
432 self,
433 ref: DatasetRef,
434 *,
435 parameters: dict[str, Any] | None = None,
436 storageClass: str | StorageClass | None = None,
437 ) -> DeferredDatasetHandle:
438 # Docstring inherited.
439 return self.getDeferred(ref, parameters=parameters, storageClass=storageClass)
441 def getDeferred(
442 self,
443 ref: DatasetRef,
444 /,
445 *,
446 parameters: dict[str, Any] | None = None,
447 storageClass: str | StorageClass | None = None,
448 ) -> DeferredDatasetHandle:
449 if ref.id in self._predicted_inputs:
450 # Unfortunately, we can't do this after the handle succeeds in
451 # loading, so it's conceivable here that we're marking an input
452 # as "actual" even when it's not even available.
453 self._actual_inputs.add(ref.id)
454 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)
456 def stored(self, ref: DatasetRef) -> bool:
457 # Docstring inherited.
458 stored = super().stored(ref)
459 if ref.id in self._predicted_inputs:
460 if stored:
461 self._available_inputs.add(ref.id)
462 else:
463 self._unavailable_inputs.add(ref.id)
464 return stored
466 def stored_many(
467 self,
468 refs: Iterable[DatasetRef],
469 ) -> dict[DatasetRef, bool]:
470 # Docstring inherited.
471 existence = super().stored_many(refs)
473 for ref, stored in existence.items():
474 if ref.id in self._predicted_inputs:
475 if stored:
476 self._available_inputs.add(ref.id)
477 else:
478 self._unavailable_inputs.add(ref.id)
479 return existence
481 def markInputUnused(self, ref: DatasetRef) -> None:
482 # Docstring inherited.
483 self._actual_inputs.discard(ref.id)
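    # Provenance-tracking sketch (hedged: ``qbb``, ``ref`` and ``use_it`` are
    # hypothetical). Calling `get` or `getDeferred` marks a predicted input as
    # "actual"; `markInputUnused` retracts that mark if the task decides the
    # dataset did not influence its outputs.
    #
    #     handle = qbb.getDeferred(ref)   # ref is now counted as an actual input
    #     if not use_it:
    #         qbb.markInputUnused(ref)    # remove it from the recorded provenance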
485 @property
486 def dimensions(self) -> DimensionUniverse:
487 # Docstring inherited.
488 return self._dimensions
490 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
491 # Docstring inherited.
492 if ref.id not in self._predicted_outputs:
493 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
494 self._datastore.put(obj, ref)
495 self._actual_output_refs.add(ref)
496 return ref
498 def pruneDatasets(
499 self,
500 refs: Iterable[DatasetRef],
501 *,
502 disassociate: bool = True,
503 unstore: bool = False,
504 tags: Iterable[str] = (),
505 purge: bool = False,
506 ) -> None:
507 # docstring inherited from LimitedButler
509 if purge:
510 if not disassociate:
511 raise TypeError("Cannot pass purge=True without disassociate=True.")
512 if not unstore:
513 raise TypeError("Cannot pass purge=True without unstore=True.")
514 elif disassociate:
515 # No tagged collections for this butler.
516 raise TypeError("Cannot pass disassociate=True without purge=True.")
518 refs = list(refs)
520 # Pruning a component of a DatasetRef makes no sense.
521 for ref in refs:
522 if ref.datasetType.component():
523 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
525 if unstore:
526 self._datastore.trash(refs)
527 if purge:
528 for ref in refs:
529 # We only care about removing them from actual output refs.
530 self._actual_output_refs.discard(ref)
532 if unstore:
533 # Point of no return for removing artifacts
534 self._datastore.emptyTrash()
536 def extract_provenance_data(self) -> QuantumProvenanceData:
537 """Extract provenance information and datastore records from this
538 butler.
540 Returns
541 -------
542 provenance : `QuantumProvenanceData`
543 A serializable struct containing input/output dataset IDs and
544 datastore records. This assumes all dataset IDs are UUIDs (just to
545 make it easier for `pydantic` to reason about the struct's types);
546 the rest of this class makes no such assumption, but the approach
547 to processing in which it's useful effectively requires UUIDs
548 anyway.
550 Notes
551 -----
552 `QuantumBackedButler` records this provenance information when its
553 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
554 authors from having to worry about it while still recording very
555 detailed information. But it has two small weaknesses:
557 - Calling `getDirectDeferred` or `getDirect` is enough to mark a
558 dataset as an "actual input", which may mark some datasets that
559 aren't actually used. We rely on task authors to use
560 `markInputUnused` to address this.
562 - We assume that the execution system will call ``stored_many`` on
563 all predicted inputs prior to execution, in order to populate the
564 "available inputs" set. This is what I envision
565 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
566 to use this class, but it feels fragile for this class to make such
567 a strong assumption about how it will be used, even if I can't think
568 of any other executor behavior that would make sense.
569 """
570 if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
571 _LOG.warning(
572 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) "
573 "was obtained, but did not actually exist. This task should be be using markInputUnused "
574 "directly to clarify its provenance.",
575 self._actual_inputs & self._unavailable_inputs,
576 )
577 self._actual_inputs -= self._unavailable_inputs
578 checked_inputs = self._available_inputs | self._unavailable_inputs
579 if self._predicted_inputs != checked_inputs:
580 _LOG.warning(
581 "Execution harness did not check predicted inputs %s for existence; available inputs "
582 "recorded in provenance may be incomplete.",
583 self._predicted_inputs - checked_inputs,
584 )
585 datastore_records = self._datastore.export_records(self._actual_output_refs)
586 provenance_records = {
587 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
588 }
590 return QuantumProvenanceData(
591 predicted_inputs=self._predicted_inputs,
592 available_inputs=self._available_inputs,
593 actual_inputs=self._actual_inputs,
594 predicted_outputs=self._predicted_outputs,
595 actual_outputs={ref.id for ref in self._actual_output_refs},
596 datastore_records=provenance_records,
597 )
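    # A sketch of the pre-execution existence check assumed in the Notes above
    # (hedged: ``qbb`` and ``quantum`` are hypothetical; `stored_many` is the
    # method on this class that populates the "available inputs" set):
    #
    #     import itertools
    #     refs = list(itertools.chain.from_iterable(quantum.inputs.values()))
    #     missing = [ref for ref, ok in qbb.stored_many(refs).items() if not ok]
    #     # ... execute the task if ``missing`` is acceptable, then:
    #     provenance = qbb.extract_provenance_data()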
600class QuantumProvenanceData(BaseModel):
601 """A serializable struct for per-quantum provenance information and
602 datastore records.
604 Notes
605 -----
606 This class slightly duplicates information from the `Quantum` class itself
607 (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the
608 same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
609 assumes the original `Quantum` is also available to reconstruct the
610 complete provenance (e.g. by associating dataset IDs with data IDs,
611 dataset types, and `~CollectionType.RUN` names).
613 Note that the ``pydantic`` method ``parse_raw()`` does not work
614 correctly for this class; use the `direct` method instead.
615 """
617 # This class probably should have information about its execution
618 # environment (anything not controlled and recorded at the
619 `~CollectionType.RUN` level, such as the compute node ID), but adding it
620 # now is out of scope for this prototype.
622 predicted_inputs: set[uuid.UUID]
623 """Unique IDs of datasets that were predicted as inputs to this quantum
624 when the QuantumGraph was built.
625 """
627 available_inputs: set[uuid.UUID]
628 """Unique IDs of input datasets that were actually present in the datastore
629 when this quantum was executed.
631 This is a subset of ``predicted_inputs``, with the difference generally
632 being datasets that were ``predicted_outputs`` but not ``actual_outputs`` of
633 some upstream task.
634 """
636 actual_inputs: set[uuid.UUID]
637 """Unique IDs of datasets that were actually used as inputs by this task.
639 This is a subset of ``available_inputs``.
641 Notes
642 -----
643 The criterion for marking an input as used is that rerunning the quantum
644 with only these ``actual_inputs`` available must yield identical outputs.
645 This means that (for example) even just using an input to help determine
646 an output rejection criterion and then rejecting it as an outlier qualifies
647 that input as actually used.
648 """
650 predicted_outputs: set[uuid.UUID]
651 """Unique IDs of datasets that were predicted as outputs of this quantum
652 when the QuantumGraph was built.
653 """
655 actual_outputs: set[uuid.UUID]
656 """Unique IDs of datasets that were actually written when this quantum
657 was executed.
658 """
660 datastore_records: dict[str, SerializedDatastoreRecordData]
661 """Datastore records indexed by datastore name."""
663 @staticmethod
664 def collect_and_transfer(
665 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
666 ) -> None:
667 """Transfer output datasets from multiple quanta to a more permantent
668 `Butler` repository.
670 Parameters
671 ----------
672 butler : `Butler`
673 Full butler representing the data repository to transfer datasets
674 to.
675 quanta : `~collections.abc.Iterable` [ `Quantum` ]
676 Iterable of `Quantum` objects that carry information about
677 predicted outputs. May be a single-pass iterator.
678 provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ]
679 Provenance and datastore data for each of the given quanta, in the
680 same order. May be a single-pass iterator.
682 Notes
683 -----
684 Input-output provenance data is not actually transferred yet, because
685 `Registry` has no place to store it.
687 This method probably works most efficiently if run on all quanta for a
688 single task label at once, because this will gather all datasets of
689 a particular type together into a single vectorized `Registry` import.
690 It should still behave correctly if run on smaller groups of quanta
691 or even quanta from multiple tasks.
693 Currently this method transfers datastore record data unchanged, with
694 no possibility of actually moving (e.g.) files. Datastores that are
695 present only in execution or only in the more permanent butler are
696 ignored.
697 """
698 grouped_refs = defaultdict(list)
699 summary_records: dict[str, DatastoreRecordData] = {}
700 for quantum, provenance_for_quantum in zip(quanta, provenance):
701 quantum_refs_by_id = {
702 ref.id: ref
703 for ref in itertools.chain.from_iterable(quantum.outputs.values())
704 if ref.id in provenance_for_quantum.actual_outputs
705 }
706 for ref in quantum_refs_by_id.values():
707 grouped_refs[ref.datasetType, ref.run].append(ref)
709 # merge datastore records into a summary structure
710 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
711 quantum_records = DatastoreRecordData.from_simple(serialized_records)
712 if (records := summary_records.get(datastore_name)) is not None:
713 records.update(quantum_records)
714 else:
715 summary_records[datastore_name] = quantum_records
717 for refs in grouped_refs.values():
718 butler.registry._importDatasets(refs)
719 butler._datastore.import_records(summary_records)
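    # Usage sketch (hedged: ``full_butler``, ``quanta`` and ``provenance_paths``
    # are hypothetical; each path points at a per-quantum JSON file written from
    # `QuantumBackedButler.extract_provenance_data`):
    #
    #     import json
    #     provenance = [
    #         QuantumProvenanceData.direct(**json.loads(path.read_text()))
    #         for path in provenance_paths
    #     ]
    #     QuantumProvenanceData.collect_and_transfer(full_butler, quanta, provenance)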
721 @classmethod
722 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
723 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")
725 @classmethod
726 def direct(
727 cls,
728 *,
729 predicted_inputs: Iterable[str | uuid.UUID],
730 available_inputs: Iterable[str | uuid.UUID],
731 actual_inputs: Iterable[str | uuid.UUID],
732 predicted_outputs: Iterable[str | uuid.UUID],
733 actual_outputs: Iterable[str | uuid.UUID],
734 datastore_records: Mapping[str, Mapping],
735 ) -> QuantumProvenanceData:
736 """Construct an instance directly without validators.
738 This differs from the pydantic "construct" method in that the
739 arguments are explicitly what the model requires, and it will recurse
740 through members, constructing them from their corresponding `direct`
741 methods.
743 This method should only be called when the inputs are trusted.
744 """
746 def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]:
747 """Convert input UUIDs, which could be in string representation to
748 a set of `UUID` instances.
749 """
750 return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}
752 data = QuantumProvenanceData.__new__(cls)
753 setter = object.__setattr__
754 setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
755 setter(data, "available_inputs", _to_uuid_set(available_inputs))
756 setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
757 setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
758 setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
759 setter(
760 data,
761 "datastore_records",
762 {
763 key: SerializedDatastoreRecordData.direct(**records)
764 for key, records in datastore_records.items()
765 },
766 )
767 return data
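# A sketch of constructing an instance from trusted, already-serialized values
# (hedged: the UUID strings are purely illustrative). `direct` accepts either
# `str` or `uuid.UUID` entries and skips pydantic validation entirely:
#
#     prov = QuantumProvenanceData.direct(
#         predicted_inputs=["c3d9a3b2-0000-4b52-9d7a-000000000001"],
#         available_inputs=["c3d9a3b2-0000-4b52-9d7a-000000000001"],
#         actual_inputs=["c3d9a3b2-0000-4b52-9d7a-000000000001"],
#         predicted_outputs=["c3d9a3b2-0000-4b52-9d7a-000000000002"],
#         actual_outputs=[],
#         datastore_records={},
#     )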