Coverage for python/lsst/daf/butler/_quantum_backed.py: 26%
183 statements
coverage.py v7.2.6, created at 2023-05-26 02:11 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("QuantumBackedButler", "QuantumProvenanceData")
26import itertools
27import logging
28import uuid
29from collections import defaultdict
30from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union
32from deprecated.sphinx import deprecated
33from pydantic import BaseModel
35from ._butlerConfig import ButlerConfig
36from ._deferredDatasetHandle import DeferredDatasetHandle
37from ._limited_butler import LimitedButler
38from .core import (
39 Config,
40 DatasetId,
41 DatasetRef,
42 DatasetType,
43 Datastore,
44 DatastoreRecordData,
45 DimensionUniverse,
46 Quantum,
47 SerializedDatastoreRecordData,
48 StorageClass,
49 StorageClassFactory,
50 ddl,
51)
52from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
53from .registry.databases.sqlite import SqliteDatabase
54from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
55from .registry.opaque import ByNameOpaqueTableStorageManager
57if TYPE_CHECKING:
58 from ._butler import Butler
60_LOG = logging.getLogger(__name__)
63class _DatasetRecordStorageManagerDatastoreConstructionMimic:
64 """A partial implementation of `DatasetRecordStorageManager` that exists
65 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
66 to be constructed without a full `Registry`.
68 Notes
69 -----
70 The interface implemented by this class should probably be its own ABC,
71 and that ABC should probably be used in the definition of
72 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
73 changes minimal.
74 """
76 @classmethod
77 def getIdColumnType(cls) -> type:
78 # Docstring inherited.
79 return ddl.GUID
81 @classmethod
82 def addDatasetForeignKey(
83 cls,
84 tableSpec: ddl.TableSpec,
85 *,
86 name: str = "dataset",
87 constraint: bool = True,
88 onDelete: Optional[str] = None,
89 **kwargs: Any,
90 ) -> ddl.FieldSpec:
91 # Docstring inherited.
92 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
93 tableSpec.fields.add(idFieldSpec)
94 return idFieldSpec
97class QuantumBackedButler(LimitedButler):
98 """An implementation of `LimitedButler` intended to back execution of a
99 single `Quantum`.
101 Parameters
102 ----------
103 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
104 Dataset IDs for datasets that can be read from this butler.
105 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
106 Dataset IDs for datasets that can be stored in this butler.
107 dimensions : `DimensionUniverse`
108 Object managing all dimension definitions.
109 datastore : `Datastore`
110 Datastore to use for all dataset I/O and existence checks.
111 storageClasses : `StorageClassFactory`
112 Object managing all storage class definitions.
dataset_types : `Mapping` [`str`, `DatasetType`], optional
Mapping of the dataset type name to its registry definition.
114 Notes
115 -----
116 Most callers should use the `initialize` `classmethod` to construct new
117 instances instead of calling the constructor directly.
119 `QuantumBackedButler` uses a SQLite database internally, in order to reuse
120 existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
121 implementations that rely on SQLAlchemy. If implementations are added in the
122 future that don't rely on SQLAlchemy, it should be possible to swap them
123 in by overriding the type arguments to `initialize` (though at present,
124 `QuantumBackedButler` would still create at least an in-memory SQLite
125 database that would then go unused).
127 We imagine `QuantumBackedButler` being used during (at least) batch
128 execution to capture `Datastore` records and save them to per-quantum
129 files, which are also a convenient place to store provenance for eventual
130 upload to a SQL-backed `Registry` (once `Registry` has tables to store
131 provenance, that is).
132 These per-quantum files can be written in two ways:
134 - The SQLite file used internally by `QuantumBackedButler` can be used
135 directly by customizing the ``filename`` argument to ``initialize``, and
136 then transferring that file to the object store after execution completes
137 (or fails; a ``try/finally`` pattern probably makes sense here).
139 - A JSON or YAML file can be written by calling `extract_provenance_data`,
140 and using ``pydantic`` methods to write the returned
141 `QuantumProvenanceData` to a file.
143 Note that at present, the SQLite file only contains datastore records, not
144 provenance, but that should be easy to address (if desired) after we
145 actually design a `Registry` schema for provenance. I also suspect that
146 we'll want to explicitly close the SQLite file somehow before trying to
147 transfer it. But I'm guessing we'd prefer to write the per-quantum files
148 as JSON anyway.
149 """
151 def __init__(
152 self,
153 predicted_inputs: Iterable[DatasetId],
154 predicted_outputs: Iterable[DatasetId],
155 dimensions: DimensionUniverse,
156 datastore: Datastore,
157 storageClasses: StorageClassFactory,
158 dataset_types: Mapping[str, DatasetType] | None = None,
159 ):
160 self._dimensions = dimensions
161 self._predicted_inputs = set(predicted_inputs)
162 self._predicted_outputs = set(predicted_outputs)
163 self._available_inputs: Set[DatasetId] = set()
164 self._unavailable_inputs: Set[DatasetId] = set()
165 self._actual_inputs: Set[DatasetId] = set()
166 self._actual_output_refs: Set[DatasetRef] = set()
167 self.datastore = datastore
168 self.storageClasses = storageClasses
169 self._dataset_types: Mapping[str, DatasetType] = {}
170 if dataset_types is not None:
171 self._dataset_types = dataset_types
172 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
174 @classmethod
175 def initialize(
176 cls,
177 config: Union[Config, str],
178 quantum: Quantum,
179 dimensions: DimensionUniverse,
180 filename: str = ":memory:",
181 OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
182 BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
183 search_paths: Optional[List[str]] = None,
184 dataset_types: Mapping[str, DatasetType] | None = None,
185 ) -> QuantumBackedButler:
186 """Construct a new `QuantumBackedButler` from repository configuration
187 and helper types.
189 Parameters
190 ----------
191 config : `Config` or `str`
192 A butler repository root, configuration filename, or configuration
193 instance.
194 quantum : `Quantum`
195 Object describing the predicted input and output dataset relevant
196 to this butler. This must have resolved `DatasetRef` instances for
197 all inputs and outputs.
198 dimensions : `DimensionUniverse`
199 Object managing all dimension definitions.
200 filename : `str`, optional
201 Name for the SQLite database that will back this butler; defaults
202 to an in-memory database.
203 OpaqueManagerClass : `type`, optional
204 A subclass of `OpaqueTableStorageManager` to use for datastore
205 opaque records. Default is a SQL-backed implementation.
206 BridgeManagerClass : `type`, optional
207 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
208 location records. Default is a SQL-backed implementation.
209 search_paths : `list` of `str`, optional
210 Additional search paths for butler configuration.
211 dataset_types : `Mapping` [`str`, `DatasetType`], optional
212 Mapping of the dataset type name to its registry definition.
213 """
214 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
215 predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
216 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
217 return cls._initialize(
218 config=config,
219 predicted_inputs=predicted_inputs,
220 predicted_outputs=predicted_outputs,
221 dimensions=dimensions,
222 filename=filename,
223 datastore_records=quantum.datastore_records,
224 OpaqueManagerClass=OpaqueManagerClass,
225 BridgeManagerClass=BridgeManagerClass,
226 search_paths=search_paths,
227 dataset_types=dataset_types,
228 )
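# As described in the class Notes, the internal SQLite database can itself
# serve as the per-quantum file by passing a real path for ``filename`` instead
# of the in-memory default; a hedged sketch (the path is illustrative):
#
#     qbb = QuantumBackedButler.initialize(
#         config=repo_config,
#         quantum=quantum,
#         dimensions=universe,
#         filename="/tmp/quantum_records.sqlite3",
#     )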
230 @classmethod
231 def from_predicted(
232 cls,
233 config: Union[Config, str],
234 predicted_inputs: Iterable[DatasetId],
235 predicted_outputs: Iterable[DatasetId],
236 dimensions: DimensionUniverse,
237 datastore_records: Mapping[str, DatastoreRecordData],
238 filename: str = ":memory:",
239 OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
240 BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
241 search_paths: Optional[List[str]] = None,
242 dataset_types: Mapping[str, DatasetType] | None = None,
243 ) -> QuantumBackedButler:
244 """Construct a new `QuantumBackedButler` from sets of input and output
245 dataset IDs.
247 Parameters
248 ----------
249 config : `Config` or `str`
250 A butler repository root, configuration filename, or configuration
251 instance.
252 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
253 Dataset IDs for datasets that can be read from this butler.
254 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
255 Dataset IDs for datasets that can be stored in this butler; these
256 must be fully resolved.
257 dimensions : `DimensionUniverse`
258 Object managing all dimension definitions.
259 filename : `str`, optional
260 Name for the SQLite database that will back this butler; defaults
261 to an in-memory database.
262 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
263 Datastore records to import into a datastore.
264 OpaqueManagerClass : `type`, optional
265 A subclass of `OpaqueTableStorageManager` to use for datastore
266 opaque records. Default is a SQL-backed implementation.
267 BridgeManagerClass : `type`, optional
268 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
269 location records. Default is a SQL-backed implementation.
270 search_paths : `list` of `str`, optional
271 Additional search paths for butler configuration.
272 dataset_types : `Mapping` [`str`, `DatasetType`], optional
273 Mapping of the dataset type name to its registry definition.
274 """
275 return cls._initialize(
276 config=config,
277 predicted_inputs=predicted_inputs,
278 predicted_outputs=predicted_outputs,
279 dimensions=dimensions,
280 filename=filename,
281 datastore_records=datastore_records,
282 OpaqueManagerClass=OpaqueManagerClass,
283 BridgeManagerClass=BridgeManagerClass,
284 search_paths=search_paths,
285 dataset_types=dataset_types,
286 )
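# Hedged sketch of the ``from_predicted`` path for callers that have dataset
# IDs and exported datastore records but no ``Quantum`` in hand; the names
# ``input_ids``, ``output_ids``, and ``records`` are illustrative only.
#
#     qbb = QuantumBackedButler.from_predicted(
#         config=repo_config,
#         predicted_inputs=input_ids,    # Iterable[DatasetId]
#         predicted_outputs=output_ids,  # Iterable[DatasetId]
#         dimensions=universe,
#         datastore_records=records,     # Mapping[str, DatastoreRecordData]
#     )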
288 @classmethod
289 def _initialize(
290 cls,
291 *,
292 config: Union[Config, str],
293 predicted_inputs: Iterable[DatasetId],
294 predicted_outputs: Iterable[DatasetId],
295 dimensions: DimensionUniverse,
296 filename: str = ":memory:",
297 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
298 OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
299 BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
300 search_paths: Optional[List[str]] = None,
301 dataset_types: Mapping[str, DatasetType] | None = None,
302 ) -> QuantumBackedButler:
303 """Internal method with common implementation used by `initialize` and
304 `for_output`.
306 Parameters
307 ----------
308 config : `Config` or `str`
309 A butler repository root, configuration filename, or configuration
310 instance.
311 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
312 Dataset IDs for datasets that can be read from this butler.
313 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
314 Dataset IDs for datasets that can be stored in this butler.
315 dimensions : `DimensionUniverse`
316 Object managing all dimension definitions.
317 filename : `str`, optional
318 Name for the SQLite database that will back this butler; defaults
319 to an in-memory database.
320 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
321 Datastore records to import into a datastore.
322 OpaqueManagerClass : `type`, optional
323 A subclass of `OpaqueTableStorageManager` to use for datastore
324 opaque records. Default is a SQL-backed implementation.
325 BridgeManagerClass : `type`, optional
326 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
327 location records. Default is a SQL-backed implementation.
328 search_paths : `list` of `str`, optional
329 Additional search paths for butler configuration.
330 dataset_types : `Mapping` [`str`, `DatasetType`], optional
331 Mapping of the dataset type name to its registry definition.
332 """
333 butler_config = ButlerConfig(config, searchPaths=search_paths)
334 if "root" in butler_config:
335 butler_root = butler_config["root"]
336 else:
337 butler_root = butler_config.configDir
338 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
339 with db.declareStaticTables(create=True) as context:
340 opaque_manager = OpaqueManagerClass.initialize(db, context)
341 bridge_manager = BridgeManagerClass.initialize(
342 db,
343 context,
344 opaque=opaque_manager,
345 # MyPy can tell it's a fake, but we know it shouldn't care.
346 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore
347 universe=dimensions,
348 )
349 # TODO: We need to inform `Datastore` here that it needs to support
350 # predictive reads; right now that's a configuration option, but after
351 # execution butler is retired it could just be a kwarg we pass here.
352 # For now just force this option as we cannot work without it.
353 butler_config["datastore", "trust_get_request"] = True
354 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
355 if datastore_records is not None:
356 datastore.import_records(datastore_records)
357 storageClasses = StorageClassFactory()
358 storageClasses.addFromConfig(butler_config)
359 return cls(
360 predicted_inputs,
361 predicted_outputs,
362 dimensions,
363 datastore,
364 storageClasses=storageClasses,
365 dataset_types=dataset_types,
366 )
368 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
369 """Return DatasetType defined in registry given dataset type name."""
370 return self._dataset_types.get(name)
372 def isWriteable(self) -> bool:
373 # Docstring inherited.
374 return True
376 @deprecated(
377 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
378 " Please use Butler.get(). Will be removed after v27.0.",
379 version="v26.0",
380 category=FutureWarning,
381 )
382 def getDirect(
383 self,
384 ref: DatasetRef,
385 *,
386 parameters: Optional[Dict[str, Any]] = None,
387 storageClass: str | StorageClass | None = None,
388 ) -> Any:
389 # Docstring inherited.
390 return self.get(ref, parameters=parameters, storageClass=storageClass)
392 def get(
393 self,
394 ref: DatasetRef,
395 /,
396 *,
397 parameters: dict[str, Any] | None = None,
398 storageClass: StorageClass | str | None = None,
399 ) -> Any:
400 try:
401 obj = super().get(
402 ref,
403 parameters=parameters,
404 storageClass=storageClass,
405 )
406 except (LookupError, FileNotFoundError, IOError):
407 self._unavailable_inputs.add(ref.id)
408 raise
409 if ref.id in self._predicted_inputs:
410 # do this after delegating to super in case that raises.
411 self._actual_inputs.add(ref.id)
412 self._available_inputs.add(ref.id)
413 return obj
415 @deprecated(
416 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
417 "Please use Butler.getDeferred(). Will be removed after v27.0.",
418 version="v26.0",
419 category=FutureWarning,
420 )
421 def getDirectDeferred(
422 self,
423 ref: DatasetRef,
424 *,
425 parameters: Union[dict, None] = None,
426 storageClass: str | StorageClass | None = None,
427 ) -> DeferredDatasetHandle:
428 # Docstring inherited.
429 return self.getDeferred(ref, parameters=parameters, storageClass=storageClass)
431 def getDeferred(
432 self,
433 ref: DatasetRef,
434 /,
435 *,
436 parameters: dict[str, Any] | None = None,
437 storageClass: str | StorageClass | None = None,
438 ) -> DeferredDatasetHandle:
439 if ref.id in self._predicted_inputs:
440 # Unfortunately, we can't do this after the handle succeeds in
441 # loading, so it's conceivable here that we're marking an input
442 # as "actual" even when it's not even available.
443 self._actual_inputs.add(ref.id)
444 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)
446 def datasetExistsDirect(self, ref: DatasetRef) -> bool:
447 # Docstring inherited.
448 exists = super().datasetExistsDirect(ref)
449 if ref.id in self._predicted_inputs:
450 if exists:
451 self._available_inputs.add(ref.id)
452 else:
453 self._unavailable_inputs.add(ref.id)
454 return exists
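# The provenance bookkeeping in extract_provenance_data expects the execution
# harness to check every predicted input before running the task; a hedged
# sketch (``predicted_input_refs`` is a hypothetical harness-provided list):
#
#     for ref in predicted_input_refs:
#         qbb.datasetExistsDirect(ref)  # populates available/unavailable sets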
456 def markInputUnused(self, ref: DatasetRef) -> None:
457 # Docstring inherited.
458 self._actual_inputs.discard(ref.id)
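# Because ``getDeferred`` marks a dataset as an actual input before anything is
# read, a task that decides not to use the handle should undo that; a hedged
# sketch (``qbb``, ``ref``, and ``need_this_input`` are assumed):
#
#     handle = qbb.getDeferred(ref)
#     if not need_this_input:
#         qbb.markInputUnused(ref)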
460 @property
461 def dimensions(self) -> DimensionUniverse:
462 # Docstring inherited.
463 return self._dimensions
465 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
466 # Docstring inherited.
467 if ref.id not in self._predicted_outputs:
468 raise RuntimeError("Cannot `put` a dataset that was not predicted as an output.")
469 self.datastore.put(obj, ref)
470 self._actual_output_refs.add(ref)
471 return ref
473 def pruneDatasets(
474 self,
475 refs: Iterable[DatasetRef],
476 *,
477 disassociate: bool = True,
478 unstore: bool = False,
479 tags: Iterable[str] = (),
480 purge: bool = False,
481 ) -> None:
482 # docstring inherited from LimitedButler
484 if purge:
485 if not disassociate:
486 raise TypeError("Cannot pass purge=True without disassociate=True.")
487 if not unstore:
488 raise TypeError("Cannot pass purge=True without unstore=True.")
489 elif disassociate:
490 # No tagged collections for this butler.
491 raise TypeError("Cannot pass disassociate=True without purge=True.")
493 refs = list(refs)
495 # Pruning a component of a DatasetRef makes no sense.
496 for ref in refs:
497 if ref.datasetType.component():
498 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
500 if unstore:
501 self.datastore.trash(refs)
502 if purge:
503 for ref in refs:
504 # We only care about removing them from the actual output refs.
505 self._actual_output_refs.discard(ref)
507 if unstore:
508 # Point of no return for removing artifacts
509 self.datastore.emptyTrash()
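# Hedged examples of the flag combinations this butler accepts (``qbb`` and
# ``refs`` are assumed); there are no tagged collections here, so only
# "unstore only" and "full purge" are meaningful:
#
#     qbb.pruneDatasets(refs, disassociate=False, unstore=True)  # drop artifacts only
#     qbb.pruneDatasets(refs, purge=True, unstore=True)          # forget outputs entirely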
511 def extract_provenance_data(self) -> QuantumProvenanceData:
512 """Extract provenance information and datastore records from this
513 butler.
515 Returns
516 -------
517 provenance : `QuantumProvenanceData`
518 A serializable struct containing input/output dataset IDs and
519 datastore records. This assumes all dataset IDs are UUIDs (just to
520 make it easier for `pydantic` to reason about the struct's types);
521 the rest of this class makes no such assumption, but the processing
522 approach for which this struct is useful effectively requires UUIDs
523 anyway.
525 Notes
526 -----
527 `QuantumBackedButler` records this provenance information when its
528 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
529 authors from having to worry about it, while still recording very
530 detailed information. But it has two small weaknesses:
532 - Calling `getDirectDeferred` or `getDirect` is enough to mark a
533 dataset as an "actual input", which may mark some datasets that
534 aren't actually used. We rely on task authors to use
535 `markInputUnused` to address this.
537 - We assume that the execution system will call ``datasetExistsDirect``
538 on all predicted inputs prior to execution, in order to populate the
539 "available inputs" set. This is what I envision
540 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
541 to use this class, but it feels fragile for this class to make such
542 a strong assumption about how it will be used, even if I can't think
543 of any other executor behavior that would make sense.
544 """
545 if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
546 _LOG.warning(
547 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) "
548 "was obtained, but did not actually exist. This task should be be using markInputUnused "
549 "directly to clarify its provenance.",
550 self._actual_inputs & self._unavailable_inputs,
551 )
552 self._actual_inputs -= self._unavailable_inputs
553 checked_inputs = self._available_inputs | self._unavailable_inputs
554 if self._predicted_inputs != checked_inputs:
555 _LOG.warning(
556 "Execution harness did not check predicted inputs %s for existence; available inputs "
557 "recorded in provenance may be incomplete.",
558 self._predicted_inputs - checked_inputs,
559 )
560 datastore_records = self.datastore.export_records(self._actual_output_refs)
561 provenance_records = {
562 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
563 }
565 return QuantumProvenanceData(
566 predicted_inputs=self._predicted_inputs,
567 available_inputs=self._available_inputs,
568 actual_inputs=self._actual_inputs,
569 predicted_outputs=self._predicted_outputs,
570 actual_outputs={ref.id for ref in self._actual_output_refs},
571 datastore_records=provenance_records,
572 )
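# Hedged sketch of writing the returned struct to the per-quantum JSON file
# mentioned in the class docstring (the filename is illustrative); this uses
# the pydantic v1 ``.json()`` serializer.
#
#     provenance = qbb.extract_provenance_data()
#     with open("quantum_provenance.json", "w") as stream:
#         stream.write(provenance.json())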
575class QuantumProvenanceData(BaseModel):
576 """A serializable struct for per-quantum provenance information and
577 datastore records.
579 Notes
580 -----
581 This class slightly duplicates information from the `Quantum` class itself
582 (the `predicted_inputs` and `predicted_outputs` sets should have the same
583 IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
584 assumes the original `Quantum` is also available to reconstruct the
585 complete provenance (e.g. by associating dataset IDs with data IDs,
586 dataset types, and `~CollectionType.RUN` names).
588 Note that the ``pydantic`` method ``parse_raw()`` does not work
589 correctly for this class; use the `direct` method instead.
590 """
592 # This class probably should have information about its execution
593 # environment (anything not controlled and recorded at the
594 `~CollectionType.RUN` level, such as the compute node ID), but adding it
595 # now is out of scope for this prototype.
597 predicted_inputs: Set[uuid.UUID]
598 """Unique IDs of datasets that were predicted as inputs to this quantum
599 when the QuantumGraph was built.
600 """
602 available_inputs: Set[uuid.UUID]
603 """Unique IDs of input datasets that were actually present in the datastore
604 when this quantum was executed.
606 This is a subset of `predicted_inputs`, with the difference generally being
607 datasets that were `predicted_outputs` but not `actual_outputs` of some
608 upstream task.
609 """
611 actual_inputs: Set[uuid.UUID]
612 """Unique IDs of datasets that were actually used as inputs by this task.
614 This is a subset of `available_inputs`.
616 Notes
617 -----
618 The criterion for marking an input as used is that rerunning the quantum
619 with only these `actual_inputs` available must yield identical outputs.
620 This means that (for example) even just using an input to help determine
621 an output rejection criterion and then rejecting it as an outlier qualifies
622 that input as actually used.
623 """
625 predicted_outputs: Set[uuid.UUID]
626 """Unique IDs of datasets that were predicted as outputs of this quantum
627 when the QuantumGraph was built.
628 """
630 actual_outputs: Set[uuid.UUID]
631 """Unique IDs of datasets that were actually written when this quantum
632 was executed.
633 """
635 datastore_records: Dict[str, SerializedDatastoreRecordData]
636 """Datastore records indexed by datastore name."""
638 @staticmethod
639 def collect_and_transfer(
640 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
641 ) -> None:
642 """Transfer output datasets from multiple quanta to a more permantent
643 `Butler` repository.
645 Parameters
646 ----------
647 butler : `Butler`
648 Full butler representing the data repository to transfer datasets
649 to.
650 quanta : `Iterable` [ `Quantum` ]
651 Iterable of `Quantum` objects that carry information about
652 predicted outputs. May be a single-pass iterator.
653 provenance : `Iterable` [ `QuantumProvenanceData` ]
654 Provenance and datastore data for each of the given quanta, in the
655 same order. May be a single-pass iterator.
657 Notes
658 -----
659 Input-output provenance data is not actually transferred yet, because
660 `Registry` has no place to store it.
662 This method probably works most efficiently if run on all quanta for a
663 single task label at once, because this will gather all datasets of
664 a particular type together into a single vectorized `Registry` import.
665 It should still behave correctly if run on smaller groups of quanta
666 or even quanta from multiple tasks.
668 Currently this method transfers datastore record data unchanged, with
669 no possibility of actually moving (e.g.) files. Datastores that are
670 present only in execution or only in the more permanent butler are
671 ignored.
672 """
673 grouped_refs = defaultdict(list)
674 summary_records: Dict[str, DatastoreRecordData] = {}
675 for quantum, provenance_for_quantum in zip(quanta, provenance):
676 quantum_refs_by_id = {
677 ref.id: ref
678 for ref in itertools.chain.from_iterable(quantum.outputs.values())
679 if ref.id in provenance_for_quantum.actual_outputs
680 }
681 for ref in quantum_refs_by_id.values():
682 grouped_refs[ref.datasetType, ref.run].append(ref)
684 # merge datastore records into a summary structure
685 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
686 quantum_records = DatastoreRecordData.from_simple(serialized_records)
687 if (records := summary_records.get(datastore_name)) is not None:
688 records.update(quantum_records)
689 else:
690 summary_records[datastore_name] = quantum_records
692 for refs in grouped_refs.values():
693 butler.registry._importDatasets(refs)
694 butler.datastore.import_records(summary_records)
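# Hedged sketch of transferring executed outputs back to a full butler;
# ``full_butler``, ``quanta``, and ``provenance_list`` are assumed to come from
# the execution harness, with the two iterables in matching order.
#
#     QuantumProvenanceData.collect_and_transfer(full_butler, quanta, provenance_list)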
696 @classmethod
697 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
698 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")
700 @classmethod
701 def direct(
702 cls,
703 *,
704 predicted_inputs: Iterable[Union[str, uuid.UUID]],
705 available_inputs: Iterable[Union[str, uuid.UUID]],
706 actual_inputs: Iterable[Union[str, uuid.UUID]],
707 predicted_outputs: Iterable[Union[str, uuid.UUID]],
708 actual_outputs: Iterable[Union[str, uuid.UUID]],
709 datastore_records: Mapping[str, Mapping],
710 ) -> QuantumProvenanceData:
711 """Construct an instance directly without validators.
713 This differs from the pydantic "construct" method in that the
714 arguments are explicitly what the model requires, and it will recurse
715 through members, constructing them from their corresponding `direct`
716 methods.
718 This method should only be called when the inputs are trusted.
719 """
721 def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]:
722 """Convert input UUIDs, which could be in string representation to
723 a set of `UUID` instances.
724 """
725 return {uuid.UUID(item) if isinstance(item, str) else item for item in uuids}
727 data = QuantumProvenanceData.__new__(cls)
728 setter = object.__setattr__
729 setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
730 setter(data, "available_inputs", _to_uuid_set(available_inputs))
731 setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
732 setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
733 setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
734 setter(
735 data,
736 "datastore_records",
737 {
738 key: SerializedDatastoreRecordData.direct(**records)
739 for key, records in datastore_records.items()
740 },
741 )
742 return data
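# Since ``parse_raw()`` is disabled for this model, a hedged sketch of reading
# a per-quantum provenance file back parses the JSON itself and calls
# ``direct()`` (the filename is illustrative):
#
#     import json
#
#     with open("quantum_provenance.json") as stream:
#         provenance = QuantumProvenanceData.direct(**json.load(stream))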