Coverage for python/lsst/daf/butler/_quantum_backed.py: 32%
185 statements
coverage.py v7.5.1, created at 2024-05-07 02:46 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30from . import ddl
32__all__ = ("QuantumBackedButler", "QuantumProvenanceData")
34import itertools
35import logging
36import uuid
37from collections import defaultdict
38from collections.abc import Iterable, Mapping
39from typing import TYPE_CHECKING, Any
41import pydantic
42from lsst.resources import ResourcePathExpression
44from ._butler_config import ButlerConfig
45from ._config import Config
46from ._dataset_ref import DatasetId, DatasetRef
47from ._dataset_type import DatasetType
48from ._deferredDatasetHandle import DeferredDatasetHandle
49from ._limited_butler import LimitedButler
50from ._quantum import Quantum
51from ._storage_class import StorageClass, StorageClassFactory
52from .datastore import Datastore
53from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData
54from .dimensions import DimensionUniverse
55from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
56from .registry.databases.sqlite import SqliteDatabase
57from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
58from .registry.opaque import ByNameOpaqueTableStorageManager
60if TYPE_CHECKING:
61 from ._butler import Butler
63_LOG = logging.getLogger(__name__)
66class _DatasetRecordStorageManagerDatastoreConstructionMimic:
67 """A partial implementation of `DatasetRecordStorageManager` that exists
68 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
69 to be constructed without a full `Registry`.
71 Notes
72 -----
73 The interface implemented by this class should probably be its own ABC,
74 and that ABC should probably be used in the definition of
75 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
76 changes minimal.
77 """
79 @classmethod
80 def getIdColumnType(cls) -> type:
81 # Docstring inherited.
82 return ddl.GUID
84 @classmethod
85 def addDatasetForeignKey(
86 cls,
87 tableSpec: ddl.TableSpec,
88 *,
89 name: str = "dataset",
90 constraint: bool = True,
91 onDelete: str | None = None,
92 **kwargs: Any,
93 ) -> ddl.FieldSpec:
94 # Docstring inherited.
95 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
96 tableSpec.fields.add(idFieldSpec)
97 return idFieldSpec
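    # A minimal sketch (hypothetical; ``spec`` is assumed to be an existing
    # ``ddl.TableSpec``) of how this mimic stands in for a real
    # ``DatasetRecordStorageManager`` while a bridge manager builds its tables:
    #
    #     id_type = _DatasetRecordStorageManagerDatastoreConstructionMimic.getIdColumnType()
    #     # id_type is ``ddl.GUID``.
    #     fk = _DatasetRecordStorageManagerDatastoreConstructionMimic.addDatasetForeignKey(
    #         spec, name="dataset"
    #     )
    #     # ``fk`` is a ``ddl.FieldSpec`` named "dataset_id" with GUID dtype that has
    #     # already been appended to ``spec.fields``; no real foreign-key constraint
    #     # is created, which is fine in the absence of a full ``Registry``.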
100class QuantumBackedButler(LimitedButler):
101 """An implementation of `LimitedButler` intended to back execution of a
102 single `Quantum`.
104 Parameters
105 ----------
106 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
107 Dataset IDs for datasets that can be read from this butler.
108 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
109 Dataset IDs for datasets that can be stored in this butler.
110 dimensions : `DimensionUniverse`
111 Object managing all dimension definitions.
112 datastore : `Datastore`
113 Datastore to use for all dataset I/O and existence checks.
114 storageClasses : `StorageClassFactory`
115 Object managing all storage class definitions.
116 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
117 The registry dataset type definitions, indexed by name.
119 Notes
120 -----
121 Most callers should use the `initialize` `classmethod` to construct new
122 instances instead of calling the constructor directly.
124 `QuantumBackedButler` uses a SQLite database internally, in order to reuse
125 existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
126 implementations that rely on SQLAlchemy. If implementations are added in the
127 future that don't rely on SQLAlchemy, it should be possible to swap them
128 in by overriding the type arguments to `initialize` (though at present,
129 `QuantumBackedButler` would still create at least an in-memory SQLite
130 database that would then go unused).
132 We imagine `QuantumBackedButler` being used during (at least) batch
133 execution to capture `Datastore` records and save them to per-quantum
134 files, which are also a convenient place to store provenance for eventual
135 upload to a SQL-backed `Registry` (once `Registry` has tables to store
136 provenance, that is).
137 These per-quantum files can be written in two ways:
139 - The SQLite file used internally by `QuantumBackedButler` can be used
140 directly by customizing the ``filename`` argument to ``initialize``, and
141 then transferring that file to the object store after execution completes
142 (or fails; a ``try/finally`` pattern probably makes sense here).
144 - A JSON or YAML file can be written by calling `extract_provenance_data`,
145 and using ``pydantic`` methods to write the returned
146 `QuantumProvenanceData` to a file.
148 Note that at present, the SQLite file only contains datastore records, not
149 provenance, but that should be easy to address (if desired) after we
150 actually design a `Registry` schema for provenance. I also suspect that
151 we'll want to explicitly close the SQLite file somehow before trying to
152 transfer it. But I'm guessing we'd prefer to write the per-quantum files
153 as JSON anyway.
154 """
156 def __init__(
157 self,
158 predicted_inputs: Iterable[DatasetId],
159 predicted_outputs: Iterable[DatasetId],
160 dimensions: DimensionUniverse,
161 datastore: Datastore,
162 storageClasses: StorageClassFactory,
163 dataset_types: Mapping[str, DatasetType] | None = None,
164 ):
165 self._dimensions = dimensions
166 self._predicted_inputs = set(predicted_inputs)
167 self._predicted_outputs = set(predicted_outputs)
168 self._available_inputs: set[DatasetId] = set()
169 self._unavailable_inputs: set[DatasetId] = set()
170 self._actual_inputs: set[DatasetId] = set()
171 self._actual_output_refs: set[DatasetRef] = set()
172 self._datastore = datastore
173 self.storageClasses = storageClasses
174 self._dataset_types: Mapping[str, DatasetType] = {}
175 if dataset_types is not None:
176 self._dataset_types = dataset_types
177 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
179 @classmethod
180 def initialize(
181 cls,
182 config: Config | ResourcePathExpression,
183 quantum: Quantum,
184 dimensions: DimensionUniverse,
185 filename: str = ":memory:",
186 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
187 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
188 search_paths: list[str] | None = None,
189 dataset_types: Mapping[str, DatasetType] | None = None,
190 ) -> QuantumBackedButler:
191 """Construct a new `QuantumBackedButler` from repository configuration
192 and helper types.
194 Parameters
195 ----------
196 config : `Config` or `~lsst.resources.ResourcePathExpression`
197 A butler repository root, configuration filename, or configuration
198 instance.
199 quantum : `Quantum`
200 Object describing the predicted input and output datasets relevant
201 to this butler. This must have resolved `DatasetRef` instances for
202 all inputs and outputs.
203 dimensions : `DimensionUniverse`
204 Object managing all dimension definitions.
205 filename : `str`, optional
206 Name for the SQLite database that will back this butler; defaults
207 to an in-memory database.
208 OpaqueManagerClass : `type`, optional
209 A subclass of `OpaqueTableStorageManager` to use for datastore
210 opaque records. Default is a SQL-backed implementation.
211 BridgeManagerClass : `type`, optional
212 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
213 location records. Default is a SQL-backed implementation.
214 search_paths : `list` of `str`, optional
215 Additional search paths for butler configuration.
216 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
217 optional
218 Mapping of the dataset type name to its registry definition.
219 """
220 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
221 predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
222 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
223 return cls._initialize(
224 config=config,
225 predicted_inputs=predicted_inputs,
226 predicted_outputs=predicted_outputs,
227 dimensions=dimensions,
228 filename=filename,
229 datastore_records=quantum.datastore_records,
230 OpaqueManagerClass=OpaqueManagerClass,
231 BridgeManagerClass=BridgeManagerClass,
232 search_paths=search_paths,
233 dataset_types=dataset_types,
234 )
236 @classmethod
237 def from_predicted(
238 cls,
239 config: Config | ResourcePathExpression,
240 predicted_inputs: Iterable[DatasetId],
241 predicted_outputs: Iterable[DatasetId],
242 dimensions: DimensionUniverse,
243 datastore_records: Mapping[str, DatastoreRecordData],
244 filename: str = ":memory:",
245 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
246 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
247 search_paths: list[str] | None = None,
248 dataset_types: Mapping[str, DatasetType] | None = None,
249 ) -> QuantumBackedButler:
250 """Construct a new `QuantumBackedButler` from sets of input and output
251 dataset IDs.
253 Parameters
254 ----------
255 config : `Config` or `~lsst.resources.ResourcePathExpression`
256 A butler repository root, configuration filename, or configuration
257 instance.
258 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
259 Dataset IDs for datasets that can be read from this butler.
260 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
261 Dataset IDs for datasets that can be stored in this butler; must be
262 fully resolved.
263 dimensions : `DimensionUniverse`
264 Object managing all dimension definitions.
265 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
266 Datastore records to import into a datastore.
267 filename : `str`, optional
268 Name for the SQLite database that will back this butler; defaults
269 to an in-memory database.
270 OpaqueManagerClass : `type`, optional
271 A subclass of `OpaqueTableStorageManager` to use for datastore
272 opaque records. Default is a SQL-backed implementation.
273 BridgeManagerClass : `type`, optional
274 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
275 location records. Default is a SQL-backed implementation.
276 search_paths : `list` of `str`, optional
277 Additional search paths for butler configuration.
278 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
279 optional
280 Mapping of the dataset type name to its registry definition.
281 """
282 return cls._initialize(
283 config=config,
284 predicted_inputs=predicted_inputs,
285 predicted_outputs=predicted_outputs,
286 dimensions=dimensions,
287 filename=filename,
288 datastore_records=datastore_records,
289 OpaqueManagerClass=OpaqueManagerClass,
290 BridgeManagerClass=BridgeManagerClass,
291 search_paths=search_paths,
292 dataset_types=dataset_types,
293 )
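    # A minimal sketch of this alternative constructor, assuming the caller
    # already has flat ID collections rather than a ``Quantum`` (``input_ids``,
    # ``output_ids``, ``exported_records``, ``butler_config``, and ``universe``
    # are illustrative names):
    #
    #     qbb = QuantumBackedButler.from_predicted(
    #         config=butler_config,
    #         predicted_inputs=input_ids,
    #         predicted_outputs=output_ids,
    #         dimensions=universe,
    #         datastore_records=exported_records,  # {datastore name: DatastoreRecordData}
    #     )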
295 @classmethod
296 def _initialize(
297 cls,
298 *,
299 config: Config | ResourcePathExpression,
300 predicted_inputs: Iterable[DatasetId],
301 predicted_outputs: Iterable[DatasetId],
302 dimensions: DimensionUniverse,
303 filename: str = ":memory:",
304 datastore_records: Mapping[str, DatastoreRecordData] | None = None,
305 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
306 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
307 search_paths: list[str] | None = None,
308 dataset_types: Mapping[str, DatasetType] | None = None,
309 ) -> QuantumBackedButler:
310 """Initialize quantum-backed butler.
312 Internal method with common implementation used by `initialize` and
313 `from_predicted`.
315 Parameters
316 ----------
317 config : `Config` or `~lsst.resources.ResourcePathExpression`
318 A butler repository root, configuration filename, or configuration
319 instance.
320 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
321 Dataset IDs for datasets that can be read from this butler.
322 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
323 Dataset IDs for datasets that can be stored in this butler.
324 dimensions : `DimensionUniverse`
325 Object managing all dimension definitions.
326 filename : `str`, optional
327 Name for the SQLite database that will back this butler; defaults
328 to an in-memory database.
329 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
330 Datastore records to import into a datastore.
331 OpaqueManagerClass : `type`, optional
332 A subclass of `OpaqueTableStorageManager` to use for datastore
333 opaque records. Default is a SQL-backed implementation.
334 BridgeManagerClass : `type`, optional
335 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
336 location records. Default is a SQL-backed implementation.
337 search_paths : `list` of `str`, optional
338 Additional search paths for butler configuration.
339 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
340 Mapping of the dataset type name to its registry definition.
341 """
342 butler_config = ButlerConfig(config, searchPaths=search_paths)
343 butler_root = butler_config.get("root", butler_config.configDir)
344 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
345 with db.declareStaticTables(create=True) as context:
346 opaque_manager = OpaqueManagerClass.initialize(db, context)
347 bridge_manager = BridgeManagerClass.initialize(
348 db,
349 context,
350 opaque=opaque_manager,
351 # MyPy can tell it's a fake, but we know it shouldn't care.
352 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore
353 universe=dimensions,
354 )
355 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
357 # TODO: We need to inform `Datastore` here that it needs to support
358 # predictive reads; this only really works for a file datastore, but
359 # we need to try everything in case there is a chained datastore.
360 datastore._set_trust_mode(True)
362 if datastore_records is not None:
363 datastore.import_records(datastore_records)
364 storageClasses = StorageClassFactory()
365 storageClasses.addFromConfig(butler_config)
366 return cls(
367 predicted_inputs,
368 predicted_outputs,
369 dimensions,
370 datastore,
371 storageClasses=storageClasses,
372 dataset_types=dataset_types,
373 )
375 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
376 """Return DatasetType defined in registry given dataset type name."""
377 return self._dataset_types.get(name)
379 def isWriteable(self) -> bool:
380 # Docstring inherited.
381 return True
383 def get(
384 self,
385 ref: DatasetRef,
386 /,
387 *,
388 parameters: dict[str, Any] | None = None,
389 storageClass: StorageClass | str | None = None,
390 ) -> Any:
391 try:
392 obj = super().get(
393 ref,
394 parameters=parameters,
395 storageClass=storageClass,
396 )
397 except (LookupError, FileNotFoundError, OSError):
398 self._unavailable_inputs.add(ref.id)
399 raise
400 if ref.id in self._predicted_inputs:
401 # do this after delegating to super in case that raises.
402 self._actual_inputs.add(ref.id)
403 self._available_inputs.add(ref.id)
404 return obj
406 def getDeferred(
407 self,
408 ref: DatasetRef,
409 /,
410 *,
411 parameters: dict[str, Any] | None = None,
412 storageClass: str | StorageClass | None = None,
413 ) -> DeferredDatasetHandle:
414 if ref.id in self._predicted_inputs:
415 # Unfortunately, we can't do this after the handle succeeds in
416 # loading, so it's conceivable here that we're marking an input
417 # as "actual" even when it's not even available.
418 self._actual_inputs.add(ref.id)
419 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)
421 def stored(self, ref: DatasetRef) -> bool:
422 # Docstring inherited.
423 stored = super().stored(ref)
424 if ref.id in self._predicted_inputs:
425 if stored:
426 self._available_inputs.add(ref.id)
427 else:
428 self._unavailable_inputs.add(ref.id)
429 return stored
431 def stored_many(
432 self,
433 refs: Iterable[DatasetRef],
434 ) -> dict[DatasetRef, bool]:
435 # Docstring inherited.
436 existence = super().stored_many(refs)
438 for ref, stored in existence.items():
439 if ref.id in self._predicted_inputs:
440 if stored:
441 self._available_inputs.add(ref.id)
442 else:
443 self._unavailable_inputs.add(ref.id)
444 return existence
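    # A hedged sketch of the pre-flight check the execution harness is expected
    # to run before executing the quantum (``input_refs`` is a hypothetical
    # iterable of the quantum's resolved input ``DatasetRef`` objects):
    #
    #     existence = qbb.stored_many(input_refs)
    #     missing = [ref for ref, stored in existence.items() if not stored]
    #
    # Besides returning the per-ref existence map, this populates the internal
    # available/unavailable input sets reported by ``extract_provenance_data``.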
446 def markInputUnused(self, ref: DatasetRef) -> None:
447 # Docstring inherited.
448 self._actual_inputs.discard(ref.id)
450 @property
451 def dimensions(self) -> DimensionUniverse:
452 # Docstring inherited.
453 return self._dimensions
455 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
456 # Docstring inherited.
457 if ref.id not in self._predicted_outputs:
458 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
459 self._datastore.put(obj, ref)
460 self._actual_output_refs.add(ref)
461 return ref
463 def pruneDatasets(
464 self,
465 refs: Iterable[DatasetRef],
466 *,
467 disassociate: bool = True,
468 unstore: bool = False,
469 tags: Iterable[str] = (),
470 purge: bool = False,
471 ) -> None:
472 # docstring inherited from LimitedButler
474 if purge:
475 if not disassociate:
476 raise TypeError("Cannot pass purge=True without disassociate=True.")
477 if not unstore:
478 raise TypeError("Cannot pass purge=True without unstore=True.")
479 elif disassociate:
480 # No tagged collections for this butler.
481 raise TypeError("Cannot pass disassociate=True without purge=True.")
483 refs = list(refs)
485 # Pruning a component of a DatasetRef makes no sense.
486 for ref in refs:
487 if ref.datasetType.component():
488 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
490 if unstore:
491 self._datastore.trash(refs)
492 if purge:
493 for ref in refs:
494 # We only care about removing them from actual output refs.
495 self._actual_output_refs.discard(ref)
497 if unstore:
498 # Point of no return for removing artifacts
499 self._datastore.emptyTrash()
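    # A minimal sketch of cleaning up partial outputs after a failed quantum
    # (``written_refs`` is a hypothetical collection of refs already ``put``);
    # note that ``purge=True`` requires both ``disassociate=True`` and
    # ``unstore=True`` for this butler:
    #
    #     qbb.pruneDatasets(written_refs, disassociate=True, unstore=True, purge=True)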
501 def extract_provenance_data(self) -> QuantumProvenanceData:
502 """Extract provenance information and datastore records from this
503 butler.
505 Returns
506 -------
507 provenance : `QuantumProvenanceData`
508 A serializable struct containing input/output dataset IDs and
509 datastore records. This assumes all dataset IDs are UUIDs (just to
510 make it easier for `pydantic` to reason about the struct's types);
511 the rest of this class makes no such assumption, but the approach
512 to processing in which it's useful effectively requires UUIDs
513 anyway.
515 Notes
516 -----
517 `QuantumBackedButler` records this provenance information when its
518 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
519 authors from having to worry about it while still recording very
520 detailed information. But it has two small weaknesses:
522 - Calling `getDeferred` or `get` is enough to mark a
523 dataset as an "actual input", which may mark some datasets that
524 aren't actually used. We rely on task authors to use
525 `markInputUnused` to address this.
527 - We assume that the execution system will call ``stored``
528 on all predicted inputs prior to execution, in order to populate the
529 "available inputs" set. This is what I envision
530 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
531 to use this class, but it feels fragile for this class to make such
532 a strong assumption about how it will be used, even if I can't think
533 of any other executor behavior that would make sense.
534 """
535 if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
536 _LOG.warning(
537 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) "
538 "was obtained, but did not actually exist. This task should be be using markInputUnused "
539 "directly to clarify its provenance.",
540 self._actual_inputs & self._unavailable_inputs,
541 )
542 self._actual_inputs -= self._unavailable_inputs
543 checked_inputs = self._available_inputs | self._unavailable_inputs
544 if self._predicted_inputs != checked_inputs:
545 _LOG.warning(
546 "Execution harness did not check predicted inputs %s for existence; available inputs "
547 "recorded in provenance may be incomplete.",
548 self._predicted_inputs - checked_inputs,
549 )
550 datastore_records = self._datastore.export_records(self._actual_output_refs)
551 provenance_records = {
552 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
553 }
555 return QuantumProvenanceData(
556 predicted_inputs=self._predicted_inputs,
557 available_inputs=self._available_inputs,
558 actual_inputs=self._actual_inputs,
559 predicted_outputs=self._predicted_outputs,
560 actual_outputs={ref.id for ref in self._actual_output_refs},
561 datastore_records=provenance_records,
562 )
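    # A minimal sketch (the file name is illustrative) of persisting the
    # returned struct with standard pydantic v2 serialization, as suggested by
    # the class notes above:
    #
    #     provenance = qbb.extract_provenance_data()
    #     with open("quantum_provenance.json", "w") as stream:
    #         stream.write(provenance.model_dump_json())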
565class QuantumProvenanceData(pydantic.BaseModel):
566 """A serializable struct for per-quantum provenance information and
567 datastore records.
569 Notes
570 -----
571 This class slightly duplicates information from the `Quantum` class itself
572 (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the
573 same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
574 assumes the original `Quantum` is also available to reconstruct the
575 complete provenance (e.g. by associating dataset IDs with data IDs,
576 dataset types, and `~CollectionType.RUN` names).
578 Note that the ``pydantic`` method ``parse_raw()`` does not work
579 correctly for this class; use the `direct` method instead.
580 """
582 # This class probably should have information about its execution
583 # environment (anything not controlled and recorded at the
584 `~CollectionType.RUN` level, such as the compute node ID), but adding it
585 # now is out of scope for this prototype.
587 predicted_inputs: set[uuid.UUID]
588 """Unique IDs of datasets that were predicted as inputs to this quantum
589 when the QuantumGraph was built.
590 """
592 available_inputs: set[uuid.UUID]
593 """Unique IDs of input datasets that were actually present in the datastore
594 when this quantum was executed.
596 This is a subset of ``predicted_inputs``, with the difference generally
597 being datasets that were ``predicted_outputs`` but not ``actual_outputs`` of
598 some upstream task.
599 """
601 actual_inputs: set[uuid.UUID]
602 """Unique IDs of datasets that were actually used as inputs by this task.
604 This is a subset of ``available_inputs``.
606 Notes
607 -----
608 The criterion for marking an input as used is that rerunning the quantum
609 with only these ``actual_inputs`` available must yield identical outputs.
610 This means that (for example) even just using an input to help determine
611 an output rejection criterion and then rejecting it as an outlier qualifies
612 that input as actually used.
613 """
615 predicted_outputs: set[uuid.UUID]
616 """Unique IDs of datasets that were predicted as outputs of this quantum
617 when the QuantumGraph was built.
618 """
620 actual_outputs: set[uuid.UUID]
621 """Unique IDs of datasets that were actually written when this quantum
622 was executed.
623 """
625 datastore_records: dict[str, SerializedDatastoreRecordData]
626 """Datastore records indexed by datastore name."""
628 @staticmethod
629 def collect_and_transfer(
630 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
631 ) -> None:
632 """Transfer output datasets from multiple quanta to a more permanent
633 `Butler` repository.
635 Parameters
636 ----------
637 butler : `Butler`
638 Full butler representing the data repository to transfer datasets
639 to.
640 quanta : `~collections.abc.Iterable` [ `Quantum` ]
641 Iterable of `Quantum` objects that carry information about
642 predicted outputs. May be a single-pass iterator.
643 provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ]
644 Provenance and datastore data for each of the given quanta, in the
645 same order. May be a single-pass iterator.
647 Notes
648 -----
649 Input-output provenance data is not actually transferred yet, because
650 `Registry` has no place to store it.
652 This method probably works most efficiently if run on all quanta for a
653 single task label at once, because this will gather all datasets of
654 a particular type together into a single vectorized `Registry` import.
655 It should still behave correctly if run on smaller groups of quanta
656 or even quanta from multiple tasks.
658 Currently this method transfers datastore record data unchanged, with
659 no possibility of actually moving (e.g.) files. Datastores that are
660 present only in execution or only in the more permanent butler are
661 ignored.
662 """
663 grouped_refs = defaultdict(list)
664 summary_records: dict[str, DatastoreRecordData] = {}
665 for quantum, provenance_for_quantum in zip(quanta, provenance, strict=True):
666 quantum_refs_by_id = {
667 ref.id: ref
668 for ref in itertools.chain.from_iterable(quantum.outputs.values())
669 if ref.id in provenance_for_quantum.actual_outputs
670 }
671 for ref in quantum_refs_by_id.values():
672 grouped_refs[ref.datasetType, ref.run].append(ref)
674 # merge datastore records into a summary structure
675 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
676 quantum_records = DatastoreRecordData.from_simple(serialized_records)
677 if (records := summary_records.get(datastore_name)) is not None:
678 records.update(quantum_records)
679 else:
680 summary_records[datastore_name] = quantum_records
682 for refs in grouped_refs.values():
683 butler.registry._importDatasets(refs)
684 butler._datastore.import_records(summary_records)
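    # A hedged usage sketch: ``full_butler`` is a writeable ``Butler`` for the
    # permanent repository, and ``quanta`` / ``provenance_structs`` are parallel
    # sequences accumulated during execution (names are illustrative):
    #
    #     QuantumProvenanceData.collect_and_transfer(
    #         full_butler, quanta, provenance_structs
    #     )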
686 @classmethod
687 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
688 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")
690 @classmethod
691 def direct(
692 cls,
693 *,
694 predicted_inputs: Iterable[str | uuid.UUID],
695 available_inputs: Iterable[str | uuid.UUID],
696 actual_inputs: Iterable[str | uuid.UUID],
697 predicted_outputs: Iterable[str | uuid.UUID],
698 actual_outputs: Iterable[str | uuid.UUID],
699 datastore_records: Mapping[str, Mapping],
700 ) -> QuantumProvenanceData:
701 """Construct an instance directly without validators.
703 Parameters
704 ----------
705 predicted_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
706 The predicted inputs.
707 available_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
708 The available inputs.
709 actual_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
710 The actual inputs.
711 predicted_outputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
712 The predicted outputs.
713 actual_outputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
714 The actual outputs.
715 datastore_records : `~collections.abc.Mapping` [ `str`, \
716 `~collections.abc.Mapping` ]
717 The datastore records.
719 Returns
720 -------
721 provenance : `QuantumProvenanceData`
722 Serializable model of the quantum provenance.
724 Notes
725 -----
726 This differs from the Pydantic "construct" method in that the
727 arguments are explicitly what the model requires, and it will recurse
728 through members, constructing them from their corresponding `direct`
729 methods.
731 This method should only be called when the inputs are trusted.
732 """
734 def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]:
735 """Convert input UUIDs, which could be in string representation to
736 a set of `UUID` instances.
737 """
738 return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}
740 data = cls.model_construct(
741 predicted_inputs=_to_uuid_set(predicted_inputs),
742 available_inputs=_to_uuid_set(available_inputs),
743 actual_inputs=_to_uuid_set(actual_inputs),
744 predicted_outputs=_to_uuid_set(predicted_outputs),
745 actual_outputs=_to_uuid_set(actual_outputs),
746 datastore_records={
747 key: SerializedDatastoreRecordData.direct(**records)
748 for key, records in datastore_records.items()
749 },
750 )
752 return data
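    # A hedged round-trip sketch: reading a provenance file written with
    # ``model_dump_json`` back through ``direct`` (as the class notes advise,
    # ``parse_raw`` is not usable here); the JSON layout is assumed to match
    # the model's field names:
    #
    #     import json
    #
    #     with open("quantum_provenance.json") as stream:
    #         data = json.load(stream)
    #     provenance = QuantumProvenanceData.direct(**data)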