Coverage for python/lsst/daf/butler/_quantum_backed.py: 32%
185 statements
coverage.py v7.4.0, created at 2024-01-25 10:50 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30from . import ddl
32__all__ = ("QuantumBackedButler", "QuantumProvenanceData")
34import itertools
35import logging
36import uuid
37from collections import defaultdict
38from collections.abc import Iterable, Mapping
39from typing import TYPE_CHECKING, Any
41import pydantic
42from lsst.resources import ResourcePathExpression
44from ._butler_config import ButlerConfig
45from ._config import Config
46from ._dataset_ref import DatasetId, DatasetRef
47from ._dataset_type import DatasetType
48from ._deferredDatasetHandle import DeferredDatasetHandle
49from ._limited_butler import LimitedButler
50from ._quantum import Quantum
51from ._storage_class import StorageClass, StorageClassFactory
52from .datastore import Datastore
53from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData
54from .dimensions import DimensionUniverse
55from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
56from .registry.databases.sqlite import SqliteDatabase
57from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
58from .registry.opaque import ByNameOpaqueTableStorageManager
60if TYPE_CHECKING:
61 from ._butler import Butler
63_LOG = logging.getLogger(__name__)
66class _DatasetRecordStorageManagerDatastoreConstructionMimic:
67 """A partial implementation of `DatasetRecordStorageManager` that exists
68 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
69 to be constructed without a full `Registry`.
71 Notes
72 -----
73 The interface implemented by this class should probably be its own ABC,
74 and that ABC should probably be used in the definition of
75 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
76 changes minimal.
77 """
79 @classmethod
80 def getIdColumnType(cls) -> type:
81 # Docstring inherited.
82 return ddl.GUID
84 @classmethod
85 def addDatasetForeignKey(
86 cls,
87 tableSpec: ddl.TableSpec,
88 *,
89 name: str = "dataset",
90 constraint: bool = True,
91 onDelete: str | None = None,
92 **kwargs: Any,
93 ) -> ddl.FieldSpec:
94 # Docstring inherited.
95 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
96 tableSpec.fields.add(idFieldSpec)
97 return idFieldSpec
100class QuantumBackedButler(LimitedButler):
101 """An implementation of `LimitedButler` intended to back execution of a
102 single `Quantum`.
104 Parameters
105 ----------
106 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
107 Dataset IDs for datasets that can be read from this butler.
108 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
109 Dataset IDs for datasets that can be stored in this butler.
110 dimensions : `DimensionUniverse`
111 Object managing all dimension definitions.
112 datastore : `Datastore`
113 Datastore to use for all dataset I/O and existence checks.
114 storageClasses : `StorageClassFactory`
115 Object managing all storage class definitions.
116 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
117 The registry dataset type definitions, indexed by name.
119 Notes
120 -----
121 Most callers should use the `initialize` `classmethod` to construct new
122 instances instead of calling the constructor directly.
124 `QuantumBackedButler` uses a SQLite database internally, in order to reuse
125 existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
126 implementations that rely on SQLAlchemy. If implementations are added in the
127 future that don't rely on SQLAlchemy, it should be possible to swap them
128 in by overriding the type arguments to `initialize` (though at present,
129 `QuantumBackedButler` would still create at least an in-memory SQLite
130 database that would then go unused).
132 We imagine `QuantumBackedButler` being used during (at least) batch
133 execution to capture `Datastore` records and save them to per-quantum
134 files, which are also a convenient place to store provenance for eventual
135 upload to a SQL-backed `Registry` (once `Registry` has tables to store
136 provenance, that is).
137 These per-quantum files can be written in two ways:
139 - The SQLite file used internally by `QuantumBackedButler` can be used
140 directly by customizing the ``filename`` argument to ``initialize``, and
141 then transferring that file to the object store after execution completes
142 (or fails; a ``try/finally`` pattern probably makes sense here).
144 - A JSON or YAML file can be written by calling `extract_provenance_data`,
145 and using ``pydantic`` methods to write the returned
146 `QuantumProvenanceData` to a file.
148 Note that at present, the SQLite file only contains datastore records, not
149 provenance, but that should be easy to address (if desired) after we
150 actually design a `Registry` schema for provenance. I also suspect that
151 we'll want to explicitly close the SQLite file somehow before trying to
152 transfer it. But I'm guessing we'd prefer to write the per-quantum files
153 as JSON anyway.
154 """
156 def __init__(
157 self,
158 predicted_inputs: Iterable[DatasetId],
159 predicted_outputs: Iterable[DatasetId],
160 dimensions: DimensionUniverse,
161 datastore: Datastore,
162 storageClasses: StorageClassFactory,
163 dataset_types: Mapping[str, DatasetType] | None = None,
164 ):
165 self._dimensions = dimensions
166 self._predicted_inputs = set(predicted_inputs)
167 self._predicted_outputs = set(predicted_outputs)
168 self._available_inputs: set[DatasetId] = set()
169 self._unavailable_inputs: set[DatasetId] = set()
170 self._actual_inputs: set[DatasetId] = set()
171 self._actual_output_refs: set[DatasetRef] = set()
172 self._datastore = datastore
173 self.storageClasses = storageClasses
174 self._dataset_types: Mapping[str, DatasetType] = {}
175 if dataset_types is not None:
176 self._dataset_types = dataset_types
177 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
179 @classmethod
180 def initialize(
181 cls,
182 config: Config | ResourcePathExpression,
183 quantum: Quantum,
184 dimensions: DimensionUniverse,
185 filename: str = ":memory:",
186 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
187 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
188 search_paths: list[str] | None = None,
189 dataset_types: Mapping[str, DatasetType] | None = None,
190 ) -> QuantumBackedButler:
191 """Construct a new `QuantumBackedButler` from repository configuration
192 and helper types.
194 Parameters
195 ----------
196 config : `Config` or `~lsst.resources.ResourcePathExpression`
197 A butler repository root, configuration filename, or configuration
198 instance.
199 quantum : `Quantum`
200 Object describing the predicted input and output datasets relevant
201 to this butler. This must have resolved `DatasetRef` instances for
202 all inputs and outputs.
203 dimensions : `DimensionUniverse`
204 Object managing all dimension definitions.
205 filename : `str`, optional
206 Name for the SQLite database that will back this butler; defaults
207 to an in-memory database.
208 OpaqueManagerClass : `type`, optional
209 A subclass of `OpaqueTableStorageManager` to use for datastore
210 opaque records. Default is a SQL-backed implementation.
211 BridgeManagerClass : `type`, optional
212 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
213 location records. Default is a SQL-backed implementation.
214 search_paths : `list` of `str`, optional
215 Additional search paths for butler configuration.
216 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
217 optional
218 Mapping of the dataset type name to its registry definition.
219 """
220 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
221 predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
222 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
223 return cls._initialize(
224 config=config,
225 predicted_inputs=predicted_inputs,
226 predicted_outputs=predicted_outputs,
227 dimensions=dimensions,
228 filename=filename,
229 datastore_records=quantum.datastore_records,
230 OpaqueManagerClass=OpaqueManagerClass,
231 BridgeManagerClass=BridgeManagerClass,
232 search_paths=search_paths,
233 dataset_types=dataset_types,
234 )
236 @classmethod
237 def from_predicted(
238 cls,
239 config: Config | ResourcePathExpression,
240 predicted_inputs: Iterable[DatasetId],
241 predicted_outputs: Iterable[DatasetId],
242 dimensions: DimensionUniverse,
243 datastore_records: Mapping[str, DatastoreRecordData],
244 filename: str = ":memory:",
245 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
246 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
247 search_paths: list[str] | None = None,
248 dataset_types: Mapping[str, DatasetType] | None = None,
249 ) -> QuantumBackedButler:
250 """Construct a new `QuantumBackedButler` from sets of input and output
251 dataset IDs.
253 Parameters
254 ----------
255 config : `Config` or `~lsst.resources.ResourcePathExpression`
256 A butler repository root, configuration filename, or configuration
257 instance.
258 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
259 Dataset IDs for datasets that can be read from this butler.
260 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
261 Dataset IDs for datasets that can be stored in this butler; must be
262 fully resolved.
263 dimensions : `DimensionUniverse`
264 Object managing all dimension definitions.
265 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
266 Datastore records to import into a datastore.
267 filename : `str`, optional
268 Name for the SQLite database that will back this butler; defaults
269 to an in-memory database.
270 OpaqueManagerClass : `type`, optional
271 A subclass of `OpaqueTableStorageManager` to use for datastore
272 opaque records. Default is a SQL-backed implementation.
273 BridgeManagerClass : `type`, optional
274 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
275 location records. Default is a SQL-backed implementation.
276 search_paths : `list` of `str`, optional
277 Additional search paths for butler configuration.
278 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
279 optional
280 Mapping of the dataset type name to its registry definition.
281 """
282 return cls._initialize(
283 config=config,
284 predicted_inputs=predicted_inputs,
285 predicted_outputs=predicted_outputs,
286 dimensions=dimensions,
287 filename=filename,
288 datastore_records=datastore_records,
289 OpaqueManagerClass=OpaqueManagerClass,
290 BridgeManagerClass=BridgeManagerClass,
291 search_paths=search_paths,
292 dataset_types=dataset_types,
293 )
295 @classmethod
296 def _initialize(
297 cls,
298 *,
299 config: Config | ResourcePathExpression,
300 predicted_inputs: Iterable[DatasetId],
301 predicted_outputs: Iterable[DatasetId],
302 dimensions: DimensionUniverse,
303 filename: str = ":memory:",
304 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
305 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
306 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
307 search_paths: list[str] | None = None,
308 dataset_types: Mapping[str, DatasetType] | None = None,
309 ) -> QuantumBackedButler:
310 """Initialize quantum-backed butler.
312 Internal method with common implementation used by `initialize` and
313 `from_predicted`.
315 Parameters
316 ----------
317 config : `Config` or `~lsst.resources.ResourcePathExpression`
318 A butler repository root, configuration filename, or configuration
319 instance.
320 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
321 Dataset IDs for datasets that can be read from this butler.
322 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
323 Dataset IDs for datasets that can be stored in this butler.
324 dimensions : `DimensionUniverse`
325 Object managing all dimension definitions.
326 filename : `str`, optional
327 Name for the SQLite database that will back this butler; defaults
328 to an in-memory database.
329 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
330 Datastore records to import into a datastore.
331 OpaqueManagerClass : `type`, optional
332 A subclass of `OpaqueTableStorageManager` to use for datastore
333 opaque records. Default is a SQL-backed implementation.
334 BridgeManagerClass : `type`, optional
335 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
336 location records. Default is a SQL-backed implementation.
337 search_paths : `list` of `str`, optional
338 Additional search paths for butler configuration.
339 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
340 Mapping of the dataset type name to its registry definition.
341 """
342 butler_config = ButlerConfig(config, searchPaths=search_paths)
343 butler_root = butler_config.get("root", butler_config.configDir)
344 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
345 with db.declareStaticTables(create=True) as context:
346 opaque_manager = OpaqueManagerClass.initialize(db, context)
347 bridge_manager = BridgeManagerClass.initialize(
348 db,
349 context,
350 opaque=opaque_manager,
351 # MyPy can tell it's a fake, but we know it shouldn't care.
352 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore
353 universe=dimensions,
354 )
355 # TODO: We need to inform `Datastore` here that it needs to support
356 # predictive reads; right now that's a configuration option, but after
357 # execution butler is retired it could just be a kwarg we pass here.
358 # For now just force this option as we cannot work without it.
359 butler_config["datastore", "trust_get_request"] = True
360 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
361 if datastore_records is not None:
362 datastore.import_records(datastore_records)
363 storageClasses = StorageClassFactory()
364 storageClasses.addFromConfig(butler_config)
365 return cls(
366 predicted_inputs,
367 predicted_outputs,
368 dimensions,
369 datastore,
370 storageClasses=storageClasses,
371 dataset_types=dataset_types,
372 )
374 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
375 """Return DatasetType defined in registry given dataset type name."""
376 return self._dataset_types.get(name)
378 def isWriteable(self) -> bool:
379 # Docstring inherited.
380 return True
382 def get(
383 self,
384 ref: DatasetRef,
385 /,
386 *,
387 parameters: dict[str, Any] | None = None,
388 storageClass: StorageClass | str | None = None,
389 ) -> Any:
390 try:
391 obj = super().get(
392 ref,
393 parameters=parameters,
394 storageClass=storageClass,
395 )
396 except (LookupError, FileNotFoundError, OSError):
397 self._unavailable_inputs.add(ref.id)
398 raise
399 if ref.id in self._predicted_inputs:
400 # do this after delegating to super in case that raises.
401 self._actual_inputs.add(ref.id)
402 self._available_inputs.add(ref.id)
403 return obj
405 def getDeferred(
406 self,
407 ref: DatasetRef,
408 /,
409 *,
410 parameters: dict[str, Any] | None = None,
411 storageClass: str | StorageClass | None = None,
412 ) -> DeferredDatasetHandle:
413 if ref.id in self._predicted_inputs:
414 # Unfortunately, we can't do this after the handle succeeds in
415 # loading, so it's conceivable here that we're marking an input
416 # as "actual" even when it's not even available.
417 self._actual_inputs.add(ref.id)
418 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)
420 def stored(self, ref: DatasetRef) -> bool:
421 # Docstring inherited.
422 stored = super().stored(ref)
423 if ref.id in self._predicted_inputs:
424 if stored:
425 self._available_inputs.add(ref.id)
426 else:
427 self._unavailable_inputs.add(ref.id)
428 return stored
430 def stored_many(
431 self,
432 refs: Iterable[DatasetRef],
433 ) -> dict[DatasetRef, bool]:
434 # Docstring inherited.
435 existence = super().stored_many(refs)
437 for ref, stored in existence.items():
438 if ref.id in self._predicted_inputs:
439 if stored:
440 self._available_inputs.add(ref.id)
441 else:
442 self._unavailable_inputs.add(ref.id)
443 return existence
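# A sketch of the harness-side existence check assumed by
# ``extract_provenance_data``: checking all predicted inputs up front keeps the
# "available inputs" provenance complete. ``qbb`` and ``input_refs`` (the
# resolved predicted-input refs) are assumed names.
#
#     existence = qbb.stored_many(input_refs)
#     missing = [ref for ref, stored in existence.items() if not stored]
#     if missing:
#         _LOG.warning("%d predicted inputs are not available", len(missing))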
445 def markInputUnused(self, ref: DatasetRef) -> None:
446 # Docstring inherited.
447 self._actual_inputs.discard(ref.id)
449 @property
450 def dimensions(self) -> DimensionUniverse:
451 # Docstring inherited.
452 return self._dimensions
454 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
455 # Docstring inherited.
456 if ref.id not in self._predicted_outputs:
457 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
458 self._datastore.put(obj, ref)
459 self._actual_output_refs.add(ref)
460 return ref
462 def pruneDatasets(
463 self,
464 refs: Iterable[DatasetRef],
465 *,
466 disassociate: bool = True,
467 unstore: bool = False,
468 tags: Iterable[str] = (),
469 purge: bool = False,
470 ) -> None:
471 # docstring inherited from LimitedButler
473 if purge:
474 if not disassociate:
475 raise TypeError("Cannot pass purge=True without disassociate=True.")
476 if not unstore:
477 raise TypeError("Cannot pass purge=True without unstore=True.")
478 elif disassociate:
479 # No tagged collections for this butler.
480 raise TypeError("Cannot pass disassociate=True without purge=True.")
482 refs = list(refs)
484 # Pruning a component of a DatasetRef makes no sense.
485 for ref in refs:
486 if ref.datasetType.component():
487 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
489 if unstore:
490 self._datastore.trash(refs)
491 if purge:
492 for ref in refs:
493 # We only care about removing them from actual output refs.
494 self._actual_output_refs.discard(ref)
496 if unstore:
497 # Point of no return for removing artifacts
498 self._datastore.emptyTrash()
500 def extract_provenance_data(self) -> QuantumProvenanceData:
501 """Extract provenance information and datastore records from this
502 butler.
504 Returns
505 -------
506 provenance : `QuantumProvenanceData`
507 A serializable struct containing input/output dataset IDs and
508 datastore records. This assumes all dataset IDs are UUIDs (just to
509 make it easier for `pydantic` to reason about the struct's types);
510 the rest of this class makes no such assumption, but the approach
511 to processing in which it's useful effectively requires UUIDs
512 anyway.
514 Notes
515 -----
516 `QuantumBackedButler` records this provenance information when its
517 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
518 authors from having to worry about it while still recording very
519 detailed information. But it has two small weaknesses:
521 - Calling `getDeferred` or `get` is enough to mark a
522 dataset as an "actual input", which may mark some datasets that
523 aren't actually used. We rely on task authors to use
524 `markInputUnused` to address this.
526 - We assume that the execution system will call ``stored``
527 on all predicted inputs prior to execution, in order to populate the
528 "available inputs" set. This is what I envision
529 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
530 to use this class, but it feels fragile for this class to make such
531 a strong assumption about how it will be used, even if I can't think
532 of any other executor behavior that would make sense.
533 """
534 if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
535 _LOG.warning(
536 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) "
537 "was obtained, but did not actually exist. This task should be be using markInputUnused "
538 "directly to clarify its provenance.",
539 self._actual_inputs & self._unavailable_inputs,
540 )
541 self._actual_inputs -= self._unavailable_inputs
542 checked_inputs = self._available_inputs | self._unavailable_inputs
543 if self._predicted_inputs != checked_inputs:
544 _LOG.warning(
545 "Execution harness did not check predicted inputs %s for existence; available inputs "
546 "recorded in provenance may be incomplete.",
547 self._predicted_inputs - checked_inputs,
548 )
549 datastore_records = self._datastore.export_records(self._actual_output_refs)
550 provenance_records = {
551 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
552 }
554 return QuantumProvenanceData(
555 predicted_inputs=self._predicted_inputs,
556 available_inputs=self._available_inputs,
557 actual_inputs=self._actual_inputs,
558 predicted_outputs=self._predicted_outputs,
559 actual_outputs={ref.id for ref in self._actual_output_refs},
560 datastore_records=provenance_records,
561 )
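# A sketch of writing the returned provenance to a per-quantum JSON file, as
# suggested in the class notes; the filename is purely illustrative and
# ``model_dump_json`` is the standard pydantic v2 serializer.
#
#     provenance = qbb.extract_provenance_data()
#     with open("quantum_provenance.json", "w") as stream:
#         stream.write(provenance.model_dump_json())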
564class QuantumProvenanceData(pydantic.BaseModel):
565 """A serializable struct for per-quantum provenance information and
566 datastore records.
568 Notes
569 -----
570 This class slightly duplicates information from the `Quantum` class itself
571 (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the
572 same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
573 assumes the original `Quantum` is also available to reconstruct the
574 complete provenance (e.g. by associating dataset IDs with data IDs,
575 dataset types, and `~CollectionType.RUN` names).
577 Note that the ``pydantic`` method ``parse_raw()`` does not work correctly
578 for this class; use the `direct` method instead.
579 """
581 # This class probably should have information about its execution
582 # environment (anything not controlled and recorded at the
583 `~CollectionType.RUN` level, such as the compute node ID), but adding it
584 # now is out of scope for this prototype.
586 predicted_inputs: set[uuid.UUID]
587 """Unique IDs of datasets that were predicted as inputs to this quantum
588 when the QuantumGraph was built.
589 """
591 available_inputs: set[uuid.UUID]
592 """Unique IDs of input datasets that were actually present in the datastore
593 when this quantum was executed.
595 This is a subset of ``predicted_inputs``, with the difference generally
596 being datasets that were ``predicted_outputs`` but not ``actual_outputs`` of
597 some upstream task.
598 """
600 actual_inputs: set[uuid.UUID]
601 """Unique IDs of datasets that were actually used as inputs by this task.
603 This is a subset of ``available_inputs``.
605 Notes
606 -----
607 The criterion for marking an input as used is that rerunning the quantum
608 with only these ``actual_inputs`` available must yield identical outputs.
609 This means that (for example) even just using an input to help determine
610 an output rejection criterion and then rejecting it as an outlier qualifies
611 that input as actually used.
612 """
614 predicted_outputs: set[uuid.UUID]
615 """Unique IDs of datasets that were predicted as outputs of this quantum
616 when the QuantumGraph was built.
617 """
619 actual_outputs: set[uuid.UUID]
620 """Unique IDs of datasets that were actually written when this quantum
621 was executed.
622 """
624 datastore_records: dict[str, SerializedDatastoreRecordData]
625 """Datastore records indexed by datastore name."""
627 @staticmethod
628 def collect_and_transfer(
629 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
630 ) -> None:
631 """Transfer output datasets from multiple quanta to a more permanent
632 `Butler` repository.
634 Parameters
635 ----------
636 butler : `Butler`
637 Full butler representing the data repository to transfer datasets
638 to.
639 quanta : `~collections.abc.Iterable` [ `Quantum` ]
640 Iterable of `Quantum` objects that carry information about
641 predicted outputs. May be a single-pass iterator.
642 provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ]
643 Provenance and datastore data for each of the given quanta, in the
644 same order. May be a single-pass iterator.
646 Notes
647 -----
648 Input-output provenance data is not actually transferred yet, because
649 `Registry` has no place to store it.
651 This method probably works most efficiently if run on all quanta for a
652 single task label at once, because this will gather all datasets of
653 a particular type together into a single vectorized `Registry` import.
654 It should still behave correctly if run on smaller groups of quanta
655 or even quanta from multiple tasks.
657 Currently this method transfers datastore record data unchanged, with
658 no possibility of actually moving (e.g.) files. Datastores that are
659 present only in execution or only in the more permanent butler are
660 ignored.
661 """
662 grouped_refs = defaultdict(list)
663 summary_records: dict[str, DatastoreRecordData] = {}
664 for quantum, provenance_for_quantum in zip(quanta, provenance, strict=True):
665 quantum_refs_by_id = {
666 ref.id: ref
667 for ref in itertools.chain.from_iterable(quantum.outputs.values())
668 if ref.id in provenance_for_quantum.actual_outputs
669 }
670 for ref in quantum_refs_by_id.values():
671 grouped_refs[ref.datasetType, ref.run].append(ref)
673 # merge datastore records into a summary structure
674 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
675 quantum_records = DatastoreRecordData.from_simple(serialized_records)
676 if (records := summary_records.get(datastore_name)) is not None:
677 records.update(quantum_records)
678 else:
679 summary_records[datastore_name] = quantum_records
681 for refs in grouped_refs.values():
682 butler.registry._importDatasets(refs)
683 butler._datastore.import_records(summary_records)
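# A sketch of the intended post-execution gather step; ``full_butler`` (a
# writeable `Butler`), ``quanta``, and an equally ordered ``provenance_list``
# of `QuantumProvenanceData` objects are assumed to be supplied by the
# execution harness.
#
#     QuantumProvenanceData.collect_and_transfer(full_butler, quanta, provenance_list)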
685 @classmethod
686 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
687 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")
689 @classmethod
690 def direct(
691 cls,
692 *,
693 predicted_inputs: Iterable[str | uuid.UUID],
694 available_inputs: Iterable[str | uuid.UUID],
695 actual_inputs: Iterable[str | uuid.UUID],
696 predicted_outputs: Iterable[str | uuid.UUID],
697 actual_outputs: Iterable[str | uuid.UUID],
698 datastore_records: Mapping[str, Mapping],
699 ) -> QuantumProvenanceData:
700 """Construct an instance directly without validators.
702 Parameters
703 ----------
704 predicted_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
705 The predicted inputs.
706 available_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
707 The available inputs.
708 actual_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
709 The actual inputs.
710 predicted_outputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
711 The predicted outputs.
712 actual_outputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
713 The actual outputs.
714 datastore_records : `~collections.abc.Mapping` [ `str`, \
715 `~collections.abc.Mapping` ]
716 The datastore records.
718 Returns
719 -------
720 provenance : `QuantumProvenanceData`
721 Serializable model of the quantum provenance.
723 Notes
724 -----
725 This differs from the Pydantic "construct" method in that the
726 arguments are explicitly what the model requires, and it will recurse
727 through members, constructing them from their corresponding `direct`
728 methods.
730 This method should only be called when the inputs are trusted.
731 """
733 def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]:
734 """Convert input UUIDs, which could be in string representation to
735 a set of `UUID` instances.
736 """
737 return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}
739 data = cls.model_construct(
740 predicted_inputs=_to_uuid_set(predicted_inputs),
741 available_inputs=_to_uuid_set(available_inputs),
742 actual_inputs=_to_uuid_set(actual_inputs),
743 predicted_outputs=_to_uuid_set(predicted_outputs),
744 actual_outputs=_to_uuid_set(actual_outputs),
745 datastore_records={
746 key: SerializedDatastoreRecordData.direct(**records)
747 for key, records in datastore_records.items()
748 },
749 )
751 return data
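# A sketch of a JSON round trip for this model, assuming ``provenance`` is an
# existing instance: serialize with pydantic's ``model_dump_json`` and rebuild
# with `direct`, since ``parse_raw`` is deliberately disabled above.
#
#     import json
#
#     blob = provenance.model_dump_json()
#     restored = QuantumProvenanceData.direct(**json.loads(blob))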