Coverage for python/lsst/daf/butler/core/datastoreRecordData.py: 29% (84 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Support for generic data stores."""

from __future__ import annotations

__all__ = ("DatastoreRecordData", "SerializedDatastoreRecordData")

import dataclasses
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, TypeAlias

from lsst.daf.butler._compat import PYDANTIC_V2, _BaseModelCompat
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

from .datasets import DatasetId
from .dimensions import DimensionUniverse
from .persistenceContext import PersistenceContextVars
from .storedFileInfo import StoredDatastoreItemInfo

if TYPE_CHECKING:
    from ..registry import Registry

# Pydantic 2 requires us to be explicit about the types that are used in
# datastore records; without this, UUID can not be handled. Pydantic v1
# wants the opposite and does not work unless we use Any.
if PYDANTIC_V2:
    _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
else:
    _Record: TypeAlias = dict[str, Any]  # type: ignore
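
# For illustration only: a single opaque-table record is a plain mapping
# from column name to value, e.g. (the column names here are hypothetical):
#
#     {"dataset_id": uuid.UUID("..."), "path": "a/b/c.fits", "file_size": 1024}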


class SerializedDatastoreRecordData(_BaseModelCompat):
    """Representation of a `DatastoreRecordData` suitable for serialization."""

    dataset_ids: list[uuid.UUID]
    """List of dataset IDs."""

    records: Mapping[str, Mapping[str, list[_Record]]]
    """Lists of records indexed by record class name and table name."""

    @classmethod
    def direct(
        cls,
        *,
        dataset_ids: list[str | uuid.UUID],
        records: dict[str, dict[str, list[_Record]]],
    ) -> SerializedDatastoreRecordData:
        """Construct a `SerializedDatastoreRecordData` directly without
        validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
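
        Examples
        --------
        A minimal sketch with no per-table records, showing the
        string-to-UUID conversion performed on ``dataset_ids``:

        >>> import uuid
        >>> id_str = "5bc9e9c6-cbd4-4bd5-b4d1-41f1b4f0e8e5"
        >>> serialized = SerializedDatastoreRecordData.direct(
        ...     dataset_ids=[id_str], records={}
        ... )
        >>> serialized.dataset_ids == [uuid.UUID(id_str)]
        True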
79 """
80 # See also comments in record_ids_to_uuid()
81 for table_data in records.values():
82 for table_records in table_data.values():
83 for record in table_records:
84 # This only checks dataset_id value, if there are any other
85 # columns that are UUIDs we'd need more generic approach.
86 if (id := record.get("dataset_id")) is not None:
87 record["dataset_id"] = uuid.UUID(id) if isinstance(id, str) else id
89 data = cls.model_construct(
90 _fields_set={"dataset_ids", "records"},
91 # JSON makes strings out of UUIDs, need to convert them back
92 dataset_ids=[uuid.UUID(id) if isinstance(id, str) else id for id in dataset_ids],
93 records=records,
94 )
96 return data


@dataclasses.dataclass
class DatastoreRecordData:
    """A struct that represents a tabular data export from a single
    datastore.
    """

    records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = dataclasses.field(
        default_factory=dict
    )
    """Opaque table data, indexed by dataset ID and grouped by opaque table
    name."""

    def update(self, other: DatastoreRecordData) -> None:
        """Update contents of this instance with data from another instance.

        Parameters
        ----------
        other : `DatastoreRecordData`
            Records to merge into this instance.

        Notes
        -----
        Merged instances can not have identical records.
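
        Examples
        --------
        A minimal sketch using empty record lists and an arbitrary table
        name; real code would store `StoredDatastoreItemInfo` instances in
        place of the empty lists:

        >>> import uuid
        >>> id1, id2 = uuid.uuid4(), uuid.uuid4()
        >>> a = DatastoreRecordData(records={id1: {"table": []}})
        >>> b = DatastoreRecordData(records={id2: {"table": []}})
        >>> a.update(b)
        >>> set(a.records) == {id1, id2}
        True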
122 """
123 for dataset_id, table_records in other.records.items():
124 this_table_records = self.records.setdefault(dataset_id, {})
125 for table_name, records in table_records.items():
126 this_table_records.setdefault(table_name, []).extend(records)

    def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
        """Extract a subset of the records that match given dataset IDs.

        Parameters
        ----------
        dataset_ids : `set` [ `DatasetId` ]
            Dataset IDs to match.

        Returns
        -------
        record_data : `DatastoreRecordData` or `None`
            `None` is returned if there are no matching refs.

        Notes
        -----
        Records in the returned instance are shared with this instance;
        clients should not update or extend records in the returned instance.
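
        Examples
        --------
        A minimal sketch; the table name and empty record list are
        placeholders:

        >>> import uuid
        >>> id1, id2 = uuid.uuid4(), uuid.uuid4()
        >>> data = DatastoreRecordData(records={id1: {"table": []}})
        >>> subset = data.subset({id1, id2})
        >>> set(subset.records) == {id1}
        True
        >>> data.subset({id2}) is None
        True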
145 """
146 matching_records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
147 for dataset_id in dataset_ids:
148 if (id_records := self.records.get(dataset_id)) is not None:
149 matching_records[dataset_id] = id_records
150 if matching_records:
151 return DatastoreRecordData(records=matching_records)
152 else:
153 return None

    def to_simple(self, minimal: bool = False) -> SerializedDatastoreRecordData:
        """Make a representation of the object for serialization.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True` produce a minimal representation; not used by this
            method.

        Returns
        -------
        simple : `SerializedDatastoreRecordData`
            Representation of this instance suitable for serialization.
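
        Examples
        --------
        A minimal sketch with a dataset that has no records; ``to_simple``
        still includes its ID in the serialized form:

        >>> import uuid
        >>> id1 = uuid.uuid4()
        >>> data = DatastoreRecordData(records={id1: {}})
        >>> simple = data.to_simple()
        >>> simple.dataset_ids == [id1]
        True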
169 """
171 def _class_name(records: list[StoredDatastoreItemInfo]) -> str:
172 """Get fully qualified class name for the records. Empty string
173 returned if list is empty. Exception is raised if records are of
174 different classes.
175 """
176 if not records:
177 return ""
178 classes = {record.__class__ for record in records}
179 assert len(classes) == 1, f"Records have to be of the same class: {classes}"
180 return get_full_type_name(classes.pop())
182 records: dict[str, dict[str, list[_Record]]] = {}
183 for table_data in self.records.values():
184 for table_name, table_records in table_data.items():
185 class_name = _class_name(table_records)
186 class_records = records.setdefault(class_name, {})
187 class_records.setdefault(table_name, []).extend(
188 [record.to_record() for record in table_records]
189 )
190 return SerializedDatastoreRecordData(dataset_ids=list(self.records.keys()), records=records)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatastoreRecordData,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatastoreRecordData:
        """Make an instance of this class from serialized data.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `SerializedDatastoreRecordData`
            Serialized representation returned from the `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe, not used by this method.
        registry : `Registry`, optional
            Registry instance, not used by this method.

        Returns
        -------
        record_data : `DatastoreRecordData`
            De-serialized instance of `DatastoreRecordData`.
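
        Examples
        --------
        A minimal round-trip sketch with a dataset that has no records:

        >>> import uuid
        >>> id1 = uuid.uuid4()
        >>> data = DatastoreRecordData(records={id1: {}})
        >>> restored = DatastoreRecordData.from_simple(data.to_simple())
        >>> restored.records == {id1: {}}
        True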
216 """
217 cache = PersistenceContextVars.dataStoreRecords.get()
218 key = frozenset(simple.dataset_ids)
219 if cache is not None and (cachedRecord := cache.get(key)) is not None:
220 return cachedRecord
221 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
222 # make sure that all dataset IDs appear in the dict even if they don't
223 # have records.
224 for dataset_id in simple.dataset_ids:
225 records[dataset_id] = {}
226 for class_name, table_data in simple.records.items():
227 klass = doImportType(class_name)
228 if not issubclass(klass, StoredDatastoreItemInfo):
229 raise RuntimeError(
230 "The class specified in the SerializedDatastoreRecordData "
231 f"({get_full_type_name(klass)}) is not a StoredDatastoreItemInfo."
232 )
233 for table_name, table_records in table_data.items():
234 for record in table_records:
235 info = klass.from_record(record)
236 dataset_type_records = records.setdefault(info.dataset_id, {})
237 dataset_type_records.setdefault(table_name, []).append(info)
238 newRecord = cls(records=records)
239 if cache is not None:
240 cache[key] = newRecord
241 return newRecord