Coverage for python/lsst/daf/butler/core/datastoreRecordData.py: 28%
85 statements
coverage.py v7.3.2, created at 2023-10-25 15:14 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
22"""Support for generic data stores."""
24from __future__ import annotations
26__all__ = ("DatastoreRecordData", "SerializedDatastoreRecordData")

import dataclasses
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, TypeAlias

from lsst.daf.butler._compat import PYDANTIC_V2, _BaseModelCompat
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

from .datasets import DatasetId
from .dimensions import DimensionUniverse
from .persistenceContext import PersistenceContextVars
from .storedFileInfo import StoredDatastoreItemInfo

if TYPE_CHECKING:
    from ..registry import Registry

# Pydantic 2 requires us to be explicit about the types that are used in
# datastore records.  Without this, UUID cannot be handled.  Pydantic v1
# wants the opposite and does not work unless we use Any.
if PYDANTIC_V2:
    _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
else:
    _Record: TypeAlias = dict[str, Any]  # type: ignore
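
# Illustrative shape of a single serialized ``_Record`` (the column names
# and the UUID below are invented for illustration; real columns come from
# the opaque table definition):
#
#     {"dataset_id": uuid.UUID("5f3a7a33-7bb0-4f0a-8f2b-9d6a0e0e7a11"),
#      "path": "data/file.fits", "component": None}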


class SerializedDatastoreRecordData(_BaseModelCompat):
    """Representation of a `DatastoreRecordData` suitable for serialization."""

    dataset_ids: list[uuid.UUID]
    """List of dataset IDs"""

    records: Mapping[str, Mapping[str, list[_Record]]]
    """List of records indexed by record class name and table name."""

    @classmethod
    def direct(
        cls,
        *,
        dataset_ids: list[str | uuid.UUID],
        records: dict[str, dict[str, list[_Record]]],
    ) -> SerializedDatastoreRecordData:
        """Construct a `SerializedDatastoreRecordData` directly without
        validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        # See also comments in record_ids_to_uuid().
        for table_data in records.values():
            for table_records in table_data.values():
                for record in table_records:
                    # This only checks the dataset_id value; if there were
                    # any other columns that are UUIDs we would need a more
                    # generic approach.
                    if (id := record.get("dataset_id")) is not None:
                        record["dataset_id"] = uuid.UUID(id) if isinstance(id, str) else id

        data = cls.model_construct(
            _fields_set={"dataset_ids", "records"},
            # JSON makes strings out of UUIDs; we need to convert them back.
            dataset_ids=[uuid.UUID(id) if isinstance(id, str) else id for id in dataset_ids],
            records=records,
        )

        return data
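
    # A minimal usage sketch for ``direct`` (the UUID string and the
    # class/table names below are invented for illustration; string IDs are
    # coerced to `uuid.UUID` as shown above):
    #
    #     serialized = SerializedDatastoreRecordData.direct(
    #         dataset_ids=["5f3a7a33-7bb0-4f0a-8f2b-9d6a0e0e7a11"],
    #         records={
    #             "lsst.daf.butler.core.storedFileInfo.StoredFileInfo": {
    #                 "file_datastore_records": [
    #                     {"dataset_id": "5f3a7a33-7bb0-4f0a-8f2b-9d6a0e0e7a11"},
    #                 ],
    #             },
    #         },
    #     )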


@dataclasses.dataclass
class DatastoreRecordData:
    """A struct that represents a tabular data export from a single
    datastore.
    """

    records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = dataclasses.field(
        default_factory=dict
    )
    """Opaque table data, indexed by dataset ID and grouped by opaque table
    name."""
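
    # Illustrative shape of ``records`` (the UUID and the table name are
    # invented; the list elements are `StoredDatastoreItemInfo` subclass
    # instances, e.g. `StoredFileInfo`):
    #
    #     {
    #         UUID("5f3a7a33-7bb0-4f0a-8f2b-9d6a0e0e7a11"): {
    #             "file_datastore_records": [StoredFileInfo(...), ...],
    #         },
    #     }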

    def update(self, other: DatastoreRecordData) -> None:
        """Update contents of this instance with data from another instance.

        Parameters
        ----------
        other : `DatastoreRecordData`
            Records to merge into this instance.

        Notes
        -----
        If a ``(dataset_id, table_name)`` combination has any records in
        ``self``, it is assumed that all records for that combination are
        already present.  This allows duplicates of the same dataset to be
        handled gracefully.
        """
        for dataset_id, table_records in other.records.items():
            this_table_records = self.records.setdefault(dataset_id, {})
            for table_name, records in table_records.items():
                # If this (dataset_id, table_name) combination already has
                # records in `self`, we assume that means all of the records
                # for that combination; we require other code to ensure entire
                # (parent) datasets are exported to these data structures
                # (never components).
                if not (this_records := this_table_records.setdefault(table_name, [])):
                    this_records.extend(records)
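
    # Merge sketch: ``a`` and ``b`` are assumed to be `DatastoreRecordData`
    # instances built elsewhere.  Any ``(dataset_id, table_name)`` pair that
    # already has records in ``a`` keeps them unchanged:
    #
    #     a.update(b)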

    def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
        """Extract a subset of the records that match given dataset IDs.

        Parameters
        ----------
        dataset_ids : `set` [ `DatasetId` ]
            Dataset IDs to match.

        Returns
        -------
        record_data : `DatastoreRecordData` or `None`
            `None` is returned if there are no matching refs.

        Notes
        -----
        Records in the returned instance are shared with this instance;
        clients should not update or extend records in the returned instance.
        """
        matching_records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        for dataset_id in dataset_ids:
            if (id_records := self.records.get(dataset_id)) is not None:
                matching_records[dataset_id] = id_records
        if matching_records:
            return DatastoreRecordData(records=matching_records)
        else:
            return None
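
    # Subset sketch (``data`` is an assumed `DatastoreRecordData` and
    # ``some_id`` an invented `DatasetId`).  The result shares record lists
    # with ``data`` and is `None` when nothing matches:
    #
    #     matched = data.subset({some_id})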

    def to_simple(self, minimal: bool = False) -> SerializedDatastoreRecordData:
        """Make a representation of the object suitable for serialization.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True`, produce a minimal representation; not used by this
            method.

        Returns
        -------
        simple : `SerializedDatastoreRecordData`
            Representation of this instance in a form suitable for
            serialization.
        """

        def _class_name(records: list[StoredDatastoreItemInfo]) -> str:
            """Get the fully qualified class name for the records.

            An empty string is returned if the list is empty; an exception
            is raised if the records are of different classes.
            """
            if not records:
                return ""
            classes = {record.__class__ for record in records}
            assert len(classes) == 1, f"Records have to be of the same class: {classes}"
            return get_full_type_name(classes.pop())

        records: dict[str, dict[str, list[_Record]]] = {}
        for table_data in self.records.values():
            for table_name, table_records in table_data.items():
                class_name = _class_name(table_records)
                class_records = records.setdefault(class_name, {})
                class_records.setdefault(table_name, []).extend(
                    [record.to_record() for record in table_records]
                )
        return SerializedDatastoreRecordData(dataset_ids=list(self.records.keys()), records=records)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatastoreRecordData,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatastoreRecordData:
        """Make an instance of this class from serialized data.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `SerializedDatastoreRecordData`
            Serialized representation returned from the `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe, not used by this method.
        registry : `Registry`, optional
            Registry instance, not used by this method.

        Returns
        -------
        record_data : `DatastoreRecordData`
            De-serialized instance of `DatastoreRecordData`.
        """
        cache = PersistenceContextVars.dataStoreRecords.get()
        key = frozenset(simple.dataset_ids)
        if cache is not None and (cachedRecord := cache.get(key)) is not None:
            return cachedRecord
        records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        # Make sure that all dataset IDs appear in the dict even if they
        # don't have records.
        for dataset_id in simple.dataset_ids:
            records[dataset_id] = {}
        for class_name, table_data in simple.records.items():
            klass = doImportType(class_name)
            if not issubclass(klass, StoredDatastoreItemInfo):
                raise RuntimeError(
                    "The class specified in the SerializedDatastoreRecordData "
                    f"({get_full_type_name(klass)}) is not a StoredDatastoreItemInfo."
                )
            for table_name, table_records in table_data.items():
                for record in table_records:
                    info = klass.from_record(record)
                    dataset_type_records = records.setdefault(info.dataset_id, {})
                    dataset_type_records.setdefault(table_name, []).append(info)
        newRecord = cls(records=records)
        if cache is not None:
            cache[key] = newRecord
        return newRecord
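
    # Round-trip sketch (``data`` is an assumed `DatastoreRecordData`):
    #
    #     simple = data.to_simple()
    #     restored = DatastoreRecordData.from_simple(simple)
    #     assert set(restored.records) == set(data.records)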