Coverage for python/lsst/daf/butler/_quantum_backed.py: 32%
184 statements
coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ("QuantumBackedButler", "QuantumProvenanceData")
32import itertools
33import logging
34import uuid
35from collections import defaultdict
36from collections.abc import Iterable, Mapping
37from typing import TYPE_CHECKING, Any
39from deprecated.sphinx import deprecated
40from lsst.daf.butler._compat import _BaseModelCompat
41from lsst.resources import ResourcePathExpression
43from ._butlerConfig import ButlerConfig
44from ._deferredDatasetHandle import DeferredDatasetHandle
45from ._limited_butler import LimitedButler
46from .core import (
47 Config,
48 DatasetId,
49 DatasetRef,
50 DatasetType,
51 Datastore,
52 DatastoreRecordData,
53 DimensionUniverse,
54 Quantum,
55 SerializedDatastoreRecordData,
56 StorageClass,
57 StorageClassFactory,
58 ddl,
59)
60from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
61from .registry.databases.sqlite import SqliteDatabase
62from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
63from .registry.opaque import ByNameOpaqueTableStorageManager
65if TYPE_CHECKING:
66 from ._butler import Butler
68_LOG = logging.getLogger(__name__)
71class _DatasetRecordStorageManagerDatastoreConstructionMimic:
72 """A partial implementation of `DatasetRecordStorageManager` that exists
73 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
74 to be constructed without a full `Registry`.
76 Notes
77 -----
78 The interface implemented by this class should probably be its own ABC,
79 and that ABC should probably be used in the definition of
80 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
81 changes minimal.
82 """
84 @classmethod
85 def getIdColumnType(cls) -> type:
86 # Docstring inherited.
87 return ddl.GUID
89 @classmethod
90 def addDatasetForeignKey(
91 cls,
92 tableSpec: ddl.TableSpec,
93 *,
94 name: str = "dataset",
95 constraint: bool = True,
96 onDelete: str | None = None,
97 **kwargs: Any,
98 ) -> ddl.FieldSpec:
99 # Docstring inherited.
100 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
101 tableSpec.fields.add(idFieldSpec)
102 return idFieldSpec
105class QuantumBackedButler(LimitedButler):
106 """An implementation of `LimitedButler` intended to back execution of a
107 single `Quantum`.
109 Parameters
110 ----------
111 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
112 Dataset IDs for datasets that can be read from this butler.
113 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
114 Dataset IDs for datasets that can be stored in this butler.
115 dimensions : `DimensionUniverse`
116 Object managing all dimension definitions.
117 datastore : `Datastore`
118 Datastore to use for all dataset I/O and existence checks.
119 storageClasses : `StorageClassFactory`
120 Object managing all storage class definitions.
122 Notes
123 -----
124 Most callers should use the `initialize` `classmethod` to construct new
125 instances instead of calling the constructor directly.
127 `QuantumBackedButler` uses a SQLite database internally, in order to reuse
128 existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
129 implementations that rely on SQLAlchemy. If implementations are added in the
130 future that don't rely on SQLAlchemy, it should be possible to swap them
131 in by overriding the type arguments to `initialize` (though at present,
132 `QuantumBackedButler` would still create at least an in-memory SQLite
133 database that would then go unused).
135 We imagine `QuantumBackedButler` being used during (at least) batch
136 execution to capture `Datastore` records and save them to per-quantum
137 files, which are also a convenient place to store provenance for eventual
138 upload to a SQL-backed `Registry` (once `Registry` has tables to store
139 provenance, that is).
140 These per-quantum files can be written in two ways:
142 - The SQLite file used internally by `QuantumBackedButler` can be used
143 directly by customizing the ``filename`` argument to ``initialize``, and
144 then transferring that file to the object store after execution completes
145 (or fails; a ``try/finally`` pattern probably makes sense here).
147 - A JSON or YAML file can be written by calling `extract_provenance_data`,
148 and using ``pydantic`` methods to write the returned
149 `QuantumProvenanceData` to a file.
151 Note that at present, the SQLite file only contains datastore records, not
152 provenance, but that should be easy to address (if desired) after we
153 actually design a `Registry` schema for provenance. I also suspect that
154 we'll want to explicitly close the SQLite file somehow before trying to
155 transfer it. But I'm guessing we'd prefer to write the per-quantum files
156 as JSON anyway.
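Examples
--------
A minimal sketch of that JSON-based workflow; the repository path,
``quantum``, ``universe``, ``input_ref``, ``output_ref``, and ``result``
below are hypothetical placeholders, and writing the file assumes the
usual ``pydantic`` ``.json()`` serialization method:

>>> qbb = QuantumBackedButler.initialize(
...     config="/path/to/repo", quantum=quantum, dimensions=universe
... )
>>> data = qbb.get(input_ref)
>>> stored_ref = qbb.put(result, output_ref)
>>> provenance = qbb.extract_provenance_data()
>>> with open("provenance.json", "w") as stream:
...     stream.write(provenance.json())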
157 """
159 def __init__(
160 self,
161 predicted_inputs: Iterable[DatasetId],
162 predicted_outputs: Iterable[DatasetId],
163 dimensions: DimensionUniverse,
164 datastore: Datastore,
165 storageClasses: StorageClassFactory,
166 dataset_types: Mapping[str, DatasetType] | None = None,
167 ):
168 self._dimensions = dimensions
169 self._predicted_inputs = set(predicted_inputs)
170 self._predicted_outputs = set(predicted_outputs)
171 self._available_inputs: set[DatasetId] = set()
172 self._unavailable_inputs: set[DatasetId] = set()
173 self._actual_inputs: set[DatasetId] = set()
174 self._actual_output_refs: set[DatasetRef] = set()
175 self._datastore = datastore
176 self.storageClasses = storageClasses
177 self._dataset_types: Mapping[str, DatasetType] = {}
178 if dataset_types is not None:
179 self._dataset_types = dataset_types
180 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
182 @classmethod
183 def initialize(
184 cls,
185 config: Config | ResourcePathExpression,
186 quantum: Quantum,
187 dimensions: DimensionUniverse,
188 filename: str = ":memory:",
189 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
190 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
191 search_paths: list[str] | None = None,
192 dataset_types: Mapping[str, DatasetType] | None = None,
193 ) -> QuantumBackedButler:
194 """Construct a new `QuantumBackedButler` from repository configuration
195 and helper types.
197 Parameters
198 ----------
199 config : `Config` or `~lsst.resources.ResourcePathExpression`
200 A butler repository root, configuration filename, or configuration
201 instance.
202 quantum : `Quantum`
203 Object describing the predicted input and output dataset relevant
204 to this butler. This must have resolved `DatasetRef` instances for
205 all inputs and outputs.
206 dimensions : `DimensionUniverse`
207 Object managing all dimension definitions.
208 filename : `str`, optional
209 Name for the SQLite database that will back this butler; defaults
210 to an in-memory database.
211 OpaqueManagerClass : `type`, optional
212 A subclass of `OpaqueTableStorageManager` to use for datastore
213 opaque records. Default is a SQL-backed implementation.
214 BridgeManagerClass : `type`, optional
215 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
216 location records. Default is a SQL-backed implementation.
217 search_paths : `list` of `str`, optional
218 Additional search paths for butler configuration.
219 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
220 optional
221 Mapping of the dataset type name to its registry definition.
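Examples
--------
A sketch (with hypothetical ``quantum`` and ``universe`` objects) showing
how the backing SQLite file can be kept for later transfer by overriding
``filename``, as described in the class notes:

>>> qbb = QuantumBackedButler.initialize(
...     config="/path/to/repo",
...     quantum=quantum,
...     dimensions=universe,
...     filename="quantum_records.sqlite3",
... )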
222 """
223 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
224 predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
225 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
226 return cls._initialize(
227 config=config,
228 predicted_inputs=predicted_inputs,
229 predicted_outputs=predicted_outputs,
230 dimensions=dimensions,
231 filename=filename,
232 datastore_records=quantum.datastore_records,
233 OpaqueManagerClass=OpaqueManagerClass,
234 BridgeManagerClass=BridgeManagerClass,
235 search_paths=search_paths,
236 dataset_types=dataset_types,
237 )
239 @classmethod
240 def from_predicted(
241 cls,
242 config: Config | ResourcePathExpression,
243 predicted_inputs: Iterable[DatasetId],
244 predicted_outputs: Iterable[DatasetId],
245 dimensions: DimensionUniverse,
246 datastore_records: Mapping[str, DatastoreRecordData],
247 filename: str = ":memory:",
248 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
249 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
250 search_paths: list[str] | None = None,
251 dataset_types: Mapping[str, DatasetType] | None = None,
252 ) -> QuantumBackedButler:
253 """Construct a new `QuantumBackedButler` from sets of input and output
254 dataset IDs.
256 Parameters
257 ----------
258 config : `Config` or `~lsst.resources.ResourcePathExpression`
259 A butler repository root, configuration filename, or configuration
260 instance.
261 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
262 Dataset IDs for datasets that can be read from this butler.
263 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
264 Dataset IDs for datasets that can be stored in this butler; these
265 must be fully resolved.
266 dimensions : `DimensionUniverse`
267 Object managing all dimension definitions.
268 filename : `str`, optional
269 Name for the SQLite database that will back this butler; defaults
270 to an in-memory database.
271 datastore_records : `~collections.abc.Mapping` [`str`, `DatastoreRecordData`]
272 Datastore records to import into a datastore.
273 OpaqueManagerClass : `type`, optional
274 A subclass of `OpaqueTableStorageManager` to use for datastore
275 opaque records. Default is a SQL-backed implementation.
276 BridgeManagerClass : `type`, optional
277 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
278 location records. Default is a SQL-backed implementation.
279 search_paths : `list` of `str`, optional
280 Additional search paths for butler configuration.
281 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
282 optional
283 Mapping of the dataset type name to its registry definition.
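Examples
--------
A sketch constructing a butler directly from dataset IDs and previously
exported datastore records; ``input_refs``, ``output_refs``, ``universe``,
and ``exported_records`` are hypothetical:

>>> qbb = QuantumBackedButler.from_predicted(
...     config="/path/to/repo",
...     predicted_inputs=[ref.id for ref in input_refs],
...     predicted_outputs=[ref.id for ref in output_refs],
...     dimensions=universe,
...     datastore_records=exported_records,
... )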
284 """
285 return cls._initialize(
286 config=config,
287 predicted_inputs=predicted_inputs,
288 predicted_outputs=predicted_outputs,
289 dimensions=dimensions,
290 filename=filename,
291 datastore_records=datastore_records,
292 OpaqueManagerClass=OpaqueManagerClass,
293 BridgeManagerClass=BridgeManagerClass,
294 search_paths=search_paths,
295 dataset_types=dataset_types,
296 )
298 @classmethod
299 def _initialize(
300 cls,
301 *,
302 config: Config | ResourcePathExpression,
303 predicted_inputs: Iterable[DatasetId],
304 predicted_outputs: Iterable[DatasetId],
305 dimensions: DimensionUniverse,
306 filename: str = ":memory:",
307 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
308 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
309 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
310 search_paths: list[str] | None = None,
311 dataset_types: Mapping[str, DatasetType] | None = None,
312 ) -> QuantumBackedButler:
313 """Initialize quantum-backed butler.
315 Internal method with common implementation used by `initialize` and
316 `from_predicted`.
318 Parameters
319 ----------
320 config : `Config` or `~lsst.resources.ResourcePathExpression`
321 A butler repository root, configuration filename, or configuration
322 instance.
323 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
324 Dataset IDs for datasets that can be read from this butler.
325 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
326 Dataset IDs for datasets that can be stored in this butler.
327 dimensions : `DimensionUniverse`
328 Object managing all dimension definitions.
329 filename : `str`, optional
330 Name for the SQLite database that will back this butler; defaults
331 to an in-memory database.
332 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
333 Datastore records to import into a datastore.
334 OpaqueManagerClass : `type`, optional
335 A subclass of `OpaqueTableStorageManager` to use for datastore
336 opaque records. Default is a SQL-backed implementation.
337 BridgeManagerClass : `type`, optional
338 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
339 location records. Default is a SQL-backed implementation.
340 search_paths : `list` of `str`, optional
341 Additional search paths for butler configuration.
342 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
343 Mapping of the dataset type name to its registry definition.
344 """
345 butler_config = ButlerConfig(config, searchPaths=search_paths)
346 butler_root = butler_config.get("root", butler_config.configDir)
347 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
348 with db.declareStaticTables(create=True) as context:
349 opaque_manager = OpaqueManagerClass.initialize(db, context)
350 bridge_manager = BridgeManagerClass.initialize(
351 db,
352 context,
353 opaque=opaque_manager,
354 # MyPy can tell it's a fake, but we know it shouldn't care.
355 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore
356 universe=dimensions,
357 )
358 # TODO: We need to inform `Datastore` here that it needs to support
359 # predictive reads; right now that's a configuration option, but after
360 # execution butler is retired it could just be a kwarg we pass here.
361 # For now just force this option as we cannot work without it.
362 butler_config["datastore", "trust_get_request"] = True
363 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
364 if datastore_records is not None:
365 datastore.import_records(datastore_records)
366 storageClasses = StorageClassFactory()
367 storageClasses.addFromConfig(butler_config)
368 return cls(
369 predicted_inputs,
370 predicted_outputs,
371 dimensions,
372 datastore,
373 storageClasses=storageClasses,
374 dataset_types=dataset_types,
375 )
377 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
378 """Return DatasetType defined in registry given dataset type name."""
379 return self._dataset_types.get(name)
381 def isWriteable(self) -> bool:
382 # Docstring inherited.
383 return True
385 # TODO: remove on DM-40067.
386 @deprecated(
387 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
388 " Please use Butler.get(). Will be removed after v26.0.",
389 version="v26.0",
390 category=FutureWarning,
391 )
392 def getDirect(
393 self,
394 ref: DatasetRef,
395 *,
396 parameters: dict[str, Any] | None = None,
397 storageClass: str | StorageClass | None = None,
398 ) -> Any:
399 # Docstring inherited.
400 return self.get(ref, parameters=parameters, storageClass=storageClass)
402 def get(
403 self,
404 ref: DatasetRef,
405 /,
406 *,
407 parameters: dict[str, Any] | None = None,
408 storageClass: StorageClass | str | None = None,
409 ) -> Any:
410 try:
411 obj = super().get(
412 ref,
413 parameters=parameters,
414 storageClass=storageClass,
415 )
416 except (LookupError, FileNotFoundError, OSError):
417 self._unavailable_inputs.add(ref.id)
418 raise
419 if ref.id in self._predicted_inputs:
420 # do this after delegating to super in case that raises.
421 self._actual_inputs.add(ref.id)
422 self._available_inputs.add(ref.id)
423 return obj
425 # TODO: remove on DM-40067.
426 @deprecated(
427 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
428 "Please use Butler.getDeferred(). Will be removed after v26.0.",
429 version="v26.0",
430 category=FutureWarning,
431 )
432 def getDirectDeferred(
433 self,
434 ref: DatasetRef,
435 *,
436 parameters: dict[str, Any] | None = None,
437 storageClass: str | StorageClass | None = None,
438 ) -> DeferredDatasetHandle:
439 # Docstring inherited.
440 return self.getDeferred(ref, parameters=parameters, storageClass=storageClass)
442 def getDeferred(
443 self,
444 ref: DatasetRef,
445 /,
446 *,
447 parameters: dict[str, Any] | None = None,
448 storageClass: str | StorageClass | None = None,
449 ) -> DeferredDatasetHandle:
450 if ref.id in self._predicted_inputs:
451 # Unfortunately, we can't do this after the handle succeeds in
452 # loading, so it's conceivable here that we're marking an input
453 # as "actual" even when it's not even available.
454 self._actual_inputs.add(ref.id)
455 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)
457 def stored(self, ref: DatasetRef) -> bool:
458 # Docstring inherited.
459 stored = super().stored(ref)
460 if ref.id in self._predicted_inputs:
461 if stored:
462 self._available_inputs.add(ref.id)
463 else:
464 self._unavailable_inputs.add(ref.id)
465 return stored
467 def stored_many(
468 self,
469 refs: Iterable[DatasetRef],
470 ) -> dict[DatasetRef, bool]:
471 # Docstring inherited.
472 existence = super().stored_many(refs)
474 for ref, stored in existence.items():
475 if ref.id in self._predicted_inputs:
476 if stored:
477 self._available_inputs.add(ref.id)
478 else:
479 self._unavailable_inputs.add(ref.id)
480 return existence
482 def markInputUnused(self, ref: DatasetRef) -> None:
483 # Docstring inherited.
484 self._actual_inputs.discard(ref.id)
486 @property
487 def dimensions(self) -> DimensionUniverse:
488 # Docstring inherited.
489 return self._dimensions
491 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
492 # Docstring inherited.
493 if ref.id not in self._predicted_outputs:
494 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
495 self._datastore.put(obj, ref)
496 self._actual_output_refs.add(ref)
497 return ref
499 def pruneDatasets(
500 self,
501 refs: Iterable[DatasetRef],
502 *,
503 disassociate: bool = True,
504 unstore: bool = False,
505 tags: Iterable[str] = (),
506 purge: bool = False,
507 ) -> None:
508 # docstring inherited from LimitedButler
510 if purge:
511 if not disassociate:
512 raise TypeError("Cannot pass purge=True without disassociate=True.")
513 if not unstore:
514 raise TypeError("Cannot pass purge=True without unstore=True.")
515 elif disassociate:
516 # No tagged collections for this butler.
517 raise TypeError("Cannot pass disassociate=True without purge=True.")
519 refs = list(refs)
521 # Pruning a component of a DatasetRef makes no sense.
522 for ref in refs:
523 if ref.datasetType.component():
524 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")
526 if unstore:
527 self._datastore.trash(refs)
528 if purge:
529 for ref in refs:
530 # We only care about removing them from actual output refs.
531 self._actual_output_refs.discard(ref)
533 if unstore:
534 # Point of no return for removing artifacts
535 self._datastore.emptyTrash()
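# Example (a sketch with a hypothetical ``failed_output_refs`` list): to fully
# retract outputs written by a failed attempt, pass the three flags together,
# since this butler has no tagged collections to disassociate from:
#
#     qbb.pruneDatasets(failed_output_refs, disassociate=True, unstore=True, purge=True)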
537 def extract_provenance_data(self) -> QuantumProvenanceData:
538 """Extract provenance information and datastore records from this
539 butler.
541 Returns
542 -------
543 provenance : `QuantumProvenanceData`
544 A serializable struct containing input/output dataset IDs and
545 datastore records. This assumes all dataset IDs are UUIDs (just to
546 make it easier for `pydantic` to reason about the struct's types);
547 the rest of this class makes no such assumption, but the approach
548 to processing in which it's useful effectively requires UUIDs
549 anyway.
551 Notes
552 -----
553 `QuantumBackedButler` records this provenance information when its
554 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
555 authors from having to worry about it while still recording very
556 detailed information. But it has two small weaknesses:
558 - Calling `getDirectDeferred` or `getDirect` is enough to mark a
559 dataset as an "actual input", which may mark some datasets that
560 aren't actually used. We rely on task authors to use
561 `markInputUnused` to address this.
563 - We assume that the execution system will call ``stored_many`` on all
564 predicted inputs prior to execution, in order to populate the
565 "available inputs" set. This is what I envision
566 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
567 to use this class, but it feels fragile for this class to make such
568 a strong assumption about how it will be used, even if I can't think
569 of any other executor behavior that would make sense.
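Examples
--------
A sketch of the harness pattern assumed above; ``qbb``,
``predicted_input_refs``, and ``run_task`` are hypothetical:

>>> existence = qbb.stored_many(predicted_input_refs)  # records available inputs
>>> run_task(qbb)  # task reads via get/getDeferred and writes via put
>>> provenance = qbb.extract_provenance_data()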
570 """
571 if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
572 _LOG.warning(
573 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) "
574 "was obtained, but did not actually exist. This task should be be using markInputUnused "
575 "directly to clarify its provenance.",
576 self._actual_inputs & self._unavailable_inputs,
577 )
578 self._actual_inputs -= self._unavailable_inputs
579 checked_inputs = self._available_inputs | self._unavailable_inputs
580 if self._predicted_inputs != checked_inputs:
581 _LOG.warning(
582 "Execution harness did not check predicted inputs %s for existence; available inputs "
583 "recorded in provenance may be incomplete.",
584 self._predicted_inputs - checked_inputs,
585 )
586 datastore_records = self._datastore.export_records(self._actual_output_refs)
587 provenance_records = {
588 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
589 }
591 return QuantumProvenanceData(
592 predicted_inputs=self._predicted_inputs,
593 available_inputs=self._available_inputs,
594 actual_inputs=self._actual_inputs,
595 predicted_outputs=self._predicted_outputs,
596 actual_outputs={ref.id for ref in self._actual_output_refs},
597 datastore_records=provenance_records,
598 )
601class QuantumProvenanceData(_BaseModelCompat):
602 """A serializable struct for per-quantum provenance information and
603 datastore records.
605 Notes
606 -----
607 This class slightly duplicates information from the `Quantum` class itself
608 (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the
609 same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
610 assumes the original `Quantum` is also available to reconstruct the
611 complete provenance (e.g. by associating dataset IDs with data IDs,
612 dataset types, and `~CollectionType.RUN` names).
614 Note that the ``pydantic`` method ``parse_raw()`` is not going to work
615 correctly for this class; use the `direct` method instead.
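Examples
--------
A sketch of reading a previously written provenance file; the filename is
hypothetical and the file is assumed to have been written with the
``pydantic`` ``.json()`` method, so its keys match this model's fields:

>>> import json
>>> with open("provenance.json") as stream:
...     provenance = QuantumProvenanceData.direct(**json.load(stream))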
616 """
618 # This class probably should have information about its execution
619 # environment (anything not controlled and recorded at the
620 # `~CollectionType.RUN` level, such as the compute node ID). but adding it
621 # now is out of scope for this prototype.
623 predicted_inputs: set[uuid.UUID]
624 """Unique IDs of datasets that were predicted as inputs to this quantum
625 when the QuantumGraph was built.
626 """
628 available_inputs: set[uuid.UUID]
629 """Unique IDs of input datasets that were actually present in the datastore
630 when this quantum was executed.
632 This is a subset of ``predicted_inputs``, with the difference generally
633 being datasets that were ``predicted_outputs`` but not ``actual_outputs`` of
634 some upstream task.
635 """
637 actual_inputs: set[uuid.UUID]
638 """Unique IDs of datasets that were actually used as inputs by this task.
640 This is a subset of ``available_inputs``.
642 Notes
643 -----
644 The criterion for marking an input as used is that rerunning the quantum
645 with only these ``actual_inputs`` available must yield identical outputs.
646 This means that (for example) even just using an input to help determine
647 an output rejection criterion and then rejecting it as an outlier qualifies
648 that input as actually used.
649 """
651 predicted_outputs: set[uuid.UUID]
652 """Unique IDs of datasets that were predicted as outputs of this quantum
653 when the QuantumGraph was built.
654 """
656 actual_outputs: set[uuid.UUID]
657 """Unique IDs of datasets that were actually written when this quantum
658 was executed.
659 """
661 datastore_records: dict[str, SerializedDatastoreRecordData]
662 """Datastore records indexed by datastore name."""
664 @staticmethod
665 def collect_and_transfer(
666 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
667 ) -> None:
668 """Transfer output datasets from multiple quanta to a more permantent
669 `Butler` repository.
671 Parameters
672 ----------
673 butler : `Butler`
674 Full butler representing the data repository to transfer datasets
675 to.
676 quanta : `~collections.abc.Iterable` [ `Quantum` ]
677 Iterable of `Quantum` objects that carry information about
678 predicted outputs. May be a single-pass iterator.
679 provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ]
680 Provenance and datastore data for each of the given quanta, in the
681 same order. May be a single-pass iterator.
683 Notes
684 -----
685 Input-output provenance data is not actually transferred yet, because
686 `Registry` has no place to store it.
688 This method probably works most efficiently if run on all quanta for a
689 single task label at once, because this will gather all datasets of
690 a particular type together into a single vectorized `Registry` import.
691 It should still behave correctly if run on smaller groups of quanta
692 or even quanta from multiple tasks.
694 Currently this method transfers datastore record data unchanged, with
695 no possibility of actually moving (e.g.) files. Datastores that are
696 present only in execution or only in the more permanent butler are
697 ignored.
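Examples
--------
A sketch following the per-task-label grouping suggested above;
``full_butler`` and ``quanta_by_task`` (mapping task label to a list of
``(quantum, provenance)`` pairs) are hypothetical:

>>> for task_label, pairs in quanta_by_task.items():
...     QuantumProvenanceData.collect_and_transfer(
...         full_butler,
...         [quantum for quantum, _ in pairs],
...         [provenance for _, provenance in pairs],
...     )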
698 """
699 grouped_refs = defaultdict(list)
700 summary_records: dict[str, DatastoreRecordData] = {}
701 for quantum, provenance_for_quantum in zip(quanta, provenance, strict=True):
702 quantum_refs_by_id = {
703 ref.id: ref
704 for ref in itertools.chain.from_iterable(quantum.outputs.values())
705 if ref.id in provenance_for_quantum.actual_outputs
706 }
707 for ref in quantum_refs_by_id.values():
708 grouped_refs[ref.datasetType, ref.run].append(ref)
710 # merge datastore records into a summary structure
711 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
712 quantum_records = DatastoreRecordData.from_simple(serialized_records)
713 if (records := summary_records.get(datastore_name)) is not None:
714 records.update(quantum_records)
715 else:
716 summary_records[datastore_name] = quantum_records
718 for refs in grouped_refs.values():
719 butler.registry._importDatasets(refs)
720 butler._datastore.import_records(summary_records)
722 @classmethod
723 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
724 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")
726 @classmethod
727 def direct(
728 cls,
729 *,
730 predicted_inputs: Iterable[str | uuid.UUID],
731 available_inputs: Iterable[str | uuid.UUID],
732 actual_inputs: Iterable[str | uuid.UUID],
733 predicted_outputs: Iterable[str | uuid.UUID],
734 actual_outputs: Iterable[str | uuid.UUID],
735 datastore_records: Mapping[str, Mapping],
736 ) -> QuantumProvenanceData:
737 """Construct an instance directly without validators.
739 This differs from the pydantic "construct" method in that the
740 arguments are explicitly what the model requires, and it will recurse
741 through members, constructing them from their corresponding `direct`
742 methods.
744 This method should only be called when the inputs are trusted.
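Examples
--------
A sketch with trusted values; string UUIDs are accepted and converted,
and the literal values here are placeholders:

>>> provenance = QuantumProvenanceData.direct(
...     predicted_inputs=["5b2f4a1e-6c3d-4f8a-9b0c-1d2e3f4a5b6c"],
...     available_inputs=[],
...     actual_inputs=[],
...     predicted_outputs=[],
...     actual_outputs=[],
...     datastore_records={},
... )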
745 """
747 def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]:
748 """Convert input UUIDs, which could be in string representation to
749 a set of `UUID` instances.
750 """
751 return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}
753 data = cls.model_construct(
754 predicted_inputs=_to_uuid_set(predicted_inputs),
755 available_inputs=_to_uuid_set(available_inputs),
756 actual_inputs=_to_uuid_set(actual_inputs),
757 predicted_outputs=_to_uuid_set(predicted_outputs),
758 actual_outputs=_to_uuid_set(actual_outputs),
759 datastore_records={
760 key: SerializedDatastoreRecordData.direct(**records)
761 for key, records in datastore_records.items()
762 },
763 )
765 return data