Coverage for python/lsst/daf/butler/_quantum_backed.py: 25%
179 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union

from pydantic import BaseModel

from ._butlerConfig import ButlerConfig
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    SerializedDatastoreRecordData,
    StorageClass,
    StorageClassFactory,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)


class _DatasetRecordStorageManagerDatastoreConstructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec


class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be read from this butler.
    predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be stored in this butler.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is).
    These per-quantum files can be written in two ways (see the illustrative
    sketch following this class definition):

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
    """

    def __init__(
        self,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore: Datastore,
        storageClasses: StorageClassFactory,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ):
        self._dimensions = dimensions
        self._predicted_inputs = set(predicted_inputs)
        self._predicted_outputs = set(predicted_outputs)
        self._available_inputs: Set[DatasetId] = set()
        self._unavailable_inputs: Set[DatasetId] = set()
        self._actual_inputs: Set[DatasetId] = set()
        self._actual_output_refs: Set[DatasetRef] = set()
        self.datastore = datastore
        self.storageClasses = storageClasses
        self._dataset_types: Mapping[str, DatasetType] = {}
        if dataset_types is not None:
            self._dataset_types = dataset_types
        self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)

    @classmethod
    def initialize(
        cls,
        config: Union[Config, str],
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        predicted_inputs = [
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.inputs.values())
        ]
        predicted_inputs += [ref.getCheckedId() for ref in quantum.initInputs.values()]
        predicted_outputs = [
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.outputs.values())
        ]
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=quantum.datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def from_predicted(
        cls,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore_records: Mapping[str, DatastoreRecordData],
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from sets of input and output
        dataset IDs.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler; these
            must be fully resolved.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def _initialize(
        cls,
        *,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Internal method with common implementation used by `initialize`
        and `from_predicted`.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        if "root" in butler_config:
            butler_root = butler_config["root"]
        else:
            butler_root = butler_config.configDir
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        if datastore_records is not None:
            datastore.import_records(datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(
            predicted_inputs,
            predicted_outputs,
            dimensions,
            datastore,
            storageClasses=storageClasses,
            dataset_types=dataset_types,
        )

    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return the `DatasetType` defined in the registry for the given
        dataset type name.
        """
        return self._dataset_types.get(name)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    def getDirect(
        self,
        ref: DatasetRef,
        *,
        parameters: Optional[Dict[str, Any]] = None,
        storageClass: str | StorageClass | None = None,
    ) -> Any:
        # Docstring inherited.
        try:
            obj = super().getDirect(ref, parameters=parameters, storageClass=storageClass)
        except (LookupError, FileNotFoundError, IOError):
            self._unavailable_inputs.add(ref.getCheckedId())
            raise
        if ref.id in self._predicted_inputs:
            # do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    def getDirectDeferred(
        self,
        ref: DatasetRef,
        *,
        parameters: Union[dict, None] = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDirectDeferred(ref, parameters=parameters, storageClass=storageClass)

    def datasetExistsDirect(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        exists = super().datasetExistsDirect(ref)
        if ref.id in self._predicted_inputs:
            if exists:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return exists

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.getCheckedId())

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self.datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def pruneDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        disassociate: bool = True,
        unstore: bool = False,
        tags: Iterable[str] = (),
        purge: bool = False,
    ) -> None:
        # docstring inherited from LimitedButler

        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            # No tagged collections for this butler.
            raise TypeError("Cannot pass disassociate=True without purge=True.")

        refs = list(refs)

        # Pruning a component of a DatasetRef makes no sense.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")

        if unstore:
            self.datastore.trash(refs)
        if purge:
            for ref in refs:
                # We only care about removing them from actual output refs.
                self._actual_output_refs.discard(ref)

        if unstore:
            # Point of no return for removing artifacts.
            self.datastore.emptyTrash()

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
        """
        if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
            _LOG.warning(
                "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle "
                "was obtained), but did not actually exist. This task should be using markInputUnused "
                "directly to clarify its provenance.",
                self._actual_inputs & self._unavailable_inputs,
            )
            self._actual_inputs -= self._unavailable_inputs
        checked_inputs = self._available_inputs | self._unavailable_inputs
        if not self._predicted_inputs == checked_inputs:
            _LOG.warning(
                "Execution harness did not check predicted inputs %s for existence; available inputs "
                "recorded in provenance may be incomplete.",
                self._predicted_inputs - checked_inputs,
            )
        datastore_records = self.datastore.export_records(self._actual_output_refs)
        provenance_records = {
            datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
        }

        return QuantumProvenanceData(
            predicted_inputs=self._predicted_inputs,
            available_inputs=self._available_inputs,
            actual_inputs=self._actual_inputs,
            predicted_outputs=self._predicted_outputs,
            actual_outputs={ref.getCheckedId() for ref in self._actual_output_refs},
            datastore_records=provenance_records,
        )
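

# Illustrative sketch (not part of the module API): one way an execution
# harness might use `QuantumBackedButler` to run a single quantum and write
# its provenance to a JSON file, following the Notes in the class docstring
# above. The names ``run_task`` and ``provenance_path`` are hypothetical
# placeholders for harness-supplied pieces; error handling is omitted.
def _example_execute_one_quantum(
    config: str,
    quantum: Quantum,
    dimensions: DimensionUniverse,
    run_task: Any,
    provenance_path: str,
) -> None:
    # Build a butler backed only by this quantum's predicted inputs/outputs.
    qbb = QuantumBackedButler.initialize(config=config, quantum=quantum, dimensions=dimensions)
    # Check all predicted inputs up front so that the "available inputs"
    # provenance is complete (see the Notes in `extract_provenance_data`).
    for ref in itertools.chain.from_iterable(quantum.inputs.values()):
        qbb.datasetExistsDirect(ref)
    # The task reads via `getDirect`/`getDirectDeferred`, writes via
    # `putDirect`, and calls `markInputUnused` for inputs it ends up ignoring.
    run_task(qbb, quantum)
    # Serialize provenance and datastore records to JSON with pydantic.
    provenance = qbb.extract_provenance_data()
    with open(provenance_path, "w") as stream:
        stream.write(provenance.json())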


class QuantumProvenanceData(BaseModel):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the `predicted_inputs` and `predicted_outputs` sets should have the same
    IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` method ``parse_raw()`` is not going to work
    correctly for this class; use the `direct` method instead (see the
    illustrative sketch at the end of this module).
    """

    # This class probably should have information about its execution
    # environment (anything not controlled and recorded at the
    # `~CollectionType.RUN` level, such as the compute node ID), but adding it
    # now is out of scope for this prototype.

    predicted_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as inputs to this quantum
    when the QuantumGraph was built.
    """

    available_inputs: Set[uuid.UUID]
    """Unique IDs of input datasets that were actually present in the datastore
    when this quantum was executed.

    This is a subset of `predicted_inputs`, with the difference generally being
    datasets that were `predicted_outputs` but not `actual_outputs` of some
    upstream task.
    """

    actual_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually used as inputs by this task.

    This is a subset of `available_inputs`.

    Notes
    -----
    The criterion for marking an input as used is that rerunning the quantum
    with only these `actual_inputs` available must yield identical outputs.
    This means that (for example) even just using an input to help determine
    an output rejection criterion and then rejecting it as an outlier
    qualifies that input as actually used.
    """

    predicted_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as outputs of this quantum
    when the QuantumGraph was built.
    """

    actual_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually written when this quantum
    was executed.
    """

    datastore_records: Dict[str, SerializedDatastoreRecordData]
    """Datastore records indexed by datastore name."""

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
        """
        grouped_refs = defaultdict(list)
        summary_records: Dict[str, DatastoreRecordData] = {}
        for quantum, provenance_for_quantum in zip(quanta, provenance):
            quantum_refs_by_id = {
                ref.getCheckedId(): ref
                for ref in itertools.chain.from_iterable(quantum.outputs.values())
                if ref.getCheckedId() in provenance_for_quantum.actual_outputs
            }
            for ref in quantum_refs_by_id.values():
                grouped_refs[ref.datasetType, ref.run].append(ref)

            # merge datastore records into a summary structure
            for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
                quantum_records = DatastoreRecordData.from_simple(serialized_records)
                if (records := summary_records.get(datastore_name)) is not None:
                    records.update(quantum_records)
                else:
                    summary_records[datastore_name] = quantum_records

        for refs in grouped_refs.values():
            butler.registry._importDatasets(refs)
        butler.datastore.import_records(summary_records)

    @classmethod
    def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
        raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")

    @classmethod
    def direct(
        cls,
        *,
        predicted_inputs: Iterable[Union[str, uuid.UUID]],
        available_inputs: Iterable[Union[str, uuid.UUID]],
        actual_inputs: Iterable[Union[str, uuid.UUID]],
        predicted_outputs: Iterable[Union[str, uuid.UUID]],
        actual_outputs: Iterable[Union[str, uuid.UUID]],
        datastore_records: Mapping[str, Mapping],
    ) -> QuantumProvenanceData:
        """Construct an instance directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """

        def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]:
            """Convert input UUIDs, which could be in string representation,
            to a set of `UUID` instances.
            """
            return set(uuid.UUID(id) if isinstance(id, str) else id for id in uuids)

        data = QuantumProvenanceData.__new__(cls)
        setter = object.__setattr__
        setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
        setter(data, "available_inputs", _to_uuid_set(available_inputs))
        setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
        setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
        setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
        setter(
            data,
            "datastore_records",
            {
                key: SerializedDatastoreRecordData.direct(**records)
                for key, records in datastore_records.items()
            },
        )
        return data
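

# Illustrative sketch (not part of the module API): reading per-quantum
# provenance files back and gathering their outputs into a full `Butler`.
# Because `parse_raw()` is disabled for this class, the JSON written by
# ``QuantumProvenanceData.json()`` is re-read here with the `direct`
# constructor, whose keyword arguments are assumed to match the serialized
# keys. The names ``provenance_paths`` and ``quanta`` are hypothetical
# placeholders; ``quanta`` must be in the same order as ``provenance_paths``.
def _example_collect_provenance(
    butler: Butler, quanta: List[Quantum], provenance_paths: List[str]
) -> None:
    import json

    provenance = []
    for path in provenance_paths:
        with open(path) as stream:
            provenance.append(QuantumProvenanceData.direct(**json.load(stream)))
    # Import the actual outputs and their datastore records into the more
    # permanent repository in one pass.
    QuantumProvenanceData.collect_and_transfer(butler, quanta, provenance)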