Coverage for python/lsst/daf/butler/datastore/record_data.py: 32%
71 statements
coverage.py v7.4.4, created at 2024-04-19 10:53 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Support for generic data stores."""
30from __future__ import annotations
32__all__ = ("DatastoreRecordData", "SerializedDatastoreRecordData")
34import dataclasses
35import uuid
36from collections.abc import Mapping
37from typing import TYPE_CHECKING, TypeAlias
39import pydantic
41from .._dataset_ref import DatasetId
42from ..dimensions import DimensionUniverse
43from ..persistence_context import PersistenceContextVars
44from .stored_file_info import StoredDatastoreItemInfo
46if TYPE_CHECKING:
47 from ..registry import Registry
49# Pydantic requires the possible value types to be explicitly enumerated in
50# order for `uuid.UUID` in particular to work. `typing.Any` does not work
51# here.
52_Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
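
# A hedged sketch of what a single _Record holds: one opaque-table row in
# dictionary form. The key names below are hypothetical; the real keys come
# from the opaque-table schema (e.g. the StoredFileInfo columns).
#
#     record: _Record = {
#         "dataset_id": uuid.uuid4(),
#         "path": "relative/path/to/file.fits",
#         "component": None,
#         "file_size": 1024,
#     }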


class SerializedDatastoreRecordData(pydantic.BaseModel):
    """Representation of a `DatastoreRecordData` suitable for serialization."""

    dataset_ids: list[uuid.UUID]
    """List of dataset IDs."""

    records: Mapping[str, Mapping[str, Mapping[str, list[_Record]]]]
    """Records indexed by record class name, dataset ID (encoded as `str`,
    because JSON does not support `uuid.UUID` keys), and opaque table name.
    """

    @classmethod
    def direct(
        cls,
        *,
        dataset_ids: list[str | uuid.UUID],
        records: dict[str, dict[str, dict[str, list[_Record]]]],
    ) -> SerializedDatastoreRecordData:
        """Construct a `SerializedDatastoreRecordData` directly without
        validators.

        Parameters
        ----------
        dataset_ids : `list` [`str` or `uuid.UUID`]
            The dataset UUIDs.
        records : `dict`
            The datastore records.

        Notes
        -----
        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        data = cls.model_construct(
            _fields_set={"dataset_ids", "records"},
            # JSON makes strings out of UUIDs; we need to convert them back.
            dataset_ids=[uuid.UUID(id) if isinstance(id, str) else id for id in dataset_ids],
            records=records,
        )

        return data
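
# A hedged usage sketch for `direct`: constructing the model from trusted,
# already-JSON-decoded data, with UUIDs still encoded as strings. The class
# name key and record contents below are hypothetical placeholders.
#
#     serialized = SerializedDatastoreRecordData.direct(
#         dataset_ids=["00000000-0000-0000-0000-000000000000"],
#         records={
#             "StoredFileInfo": {
#                 "00000000-0000-0000-0000-000000000000": {
#                     "file_datastore_records": [],
#                 },
#             },
#         },
#     )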


@dataclasses.dataclass
class DatastoreRecordData:
    """A struct that represents a tabular data export from a single
    datastore.
    """

    records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = dataclasses.field(
        default_factory=dict
    )
    """Opaque table data, indexed by dataset ID and grouped by opaque table
    name."""
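
    # A hedged sketch of the `records` shape; the UUID and the table name
    # are hypothetical placeholders:
    #
    #     {
    #         uuid.UUID("00000000-0000-0000-0000-000000000000"): {
    #             "file_datastore_records": [stored_item_info, ...],
    #         },
    #     }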

    def update(self, other: DatastoreRecordData) -> None:
        """Update contents of this instance with data from another instance.

        Parameters
        ----------
        other : `DatastoreRecordData`
            Records to merge into this instance.

        Notes
        -----
        If a ``(dataset_id, table_name)`` combination has any records in
        ``self``, it is assumed that all records for that combination are
        already present. This allows duplicates of the same dataset to be
        handled gracefully.
        """
        for dataset_id, table_records in other.records.items():
            this_table_records = self.records.setdefault(dataset_id, {})
            for table_name, records in table_records.items():
                # If this (dataset_id, table_name) combination already has
                # records in `self`, we assume that means all of the records
                # for that combination; we require other code to ensure entire
                # (parent) datasets are exported to these data structures
                # (never components).
                if not (this_records := this_table_records.setdefault(table_name, [])):
                    this_records.extend(records)
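
    # A hedged sketch of the merge semantics: a (dataset_id, table_name)
    # pair that already has records in `self` is left untouched, so merging
    # an export that duplicates a dataset does not duplicate its records.
    # `some_id` and `info` are hypothetical placeholders.
    #
    #     a = DatastoreRecordData(records={some_id: {"table": [info]}})
    #     b = DatastoreRecordData(records={some_id: {"table": [info]}})
    #     a.update(b)
    #     assert a.records[some_id]["table"] == [info]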

    def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
        """Extract a subset of the records that match given dataset IDs.

        Parameters
        ----------
        dataset_ids : `set` [ `DatasetId` ]
            Dataset IDs to match.

        Returns
        -------
        record_data : `DatastoreRecordData` or `None`
            `None` is returned if there are no matching refs.

        Notes
        -----
        Records in the returned instance are shared with this instance;
        clients should not update or extend records in the returned instance.
        """
        matching_records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        for dataset_id in dataset_ids:
            if (id_records := self.records.get(dataset_id)) is not None:
                matching_records[dataset_id] = id_records
        if matching_records:
            return DatastoreRecordData(records=matching_records)
        else:
            return None
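
    # A hedged usage sketch for `subset`; `id1` and `id2` are hypothetical
    # dataset IDs, and only `id1` has records here:
    #
    #     data = DatastoreRecordData(records={id1: {"table": [info]}})
    #     assert data.subset({id1, id2}).records == {id1: {"table": [info]}}
    #     assert data.subset({id2}) is None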

    def to_simple(self, minimal: bool = False) -> SerializedDatastoreRecordData:
        """Make a representation of the object for serialization.

        Implements the `~lsst.daf.butler.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True`, produce a minimal representation; not used by this
            method.

        Returns
        -------
        simple : `SerializedDatastoreRecordData`
            Representation of this instance suitable for serialization.
        """
        records: dict[str, dict[str, dict[str, list[_Record]]]] = {}
        for dataset_id, table_data in self.records.items():
            for table_name, table_records in table_data.items():
                class_name, infos = StoredDatastoreItemInfo.to_records(table_records)
                class_records = records.setdefault(class_name, {})
                dataset_records = class_records.setdefault(dataset_id.hex, {})
                dataset_records.setdefault(table_name, []).extend(dict(info) for info in infos)
        return SerializedDatastoreRecordData(dataset_ids=list(self.records.keys()), records=records)
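
    # A hedged serialization sketch: the pydantic model returned by
    # `to_simple` can be dumped to JSON with the standard pydantic v2 API.
    #
    #     simple = data.to_simple()
    #     json_str = simple.model_dump_json()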

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatastoreRecordData,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatastoreRecordData:
        """Make an instance of this class from serialized data.

        Implements the `~lsst.daf.butler.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `SerializedDatastoreRecordData`
            Serialized representation returned from the `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe; not used by this method.
        registry : `Registry`, optional
            Registry instance; not used by this method.

        Returns
        -------
        record_data : `DatastoreRecordData`
            De-serialized instance of `DatastoreRecordData`.
        """
        cache = PersistenceContextVars.dataStoreRecords.get()
        key = frozenset(simple.dataset_ids)
        if cache is not None and (cachedRecord := cache.get(key)) is not None:
            return cachedRecord
        records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        # Make sure that all dataset IDs appear in the dict even if they
        # don't have records.
        for dataset_id in simple.dataset_ids:
            records[dataset_id] = {}
        for class_name, class_data in simple.records.items():
            for dataset_id_str, dataset_data in class_data.items():
                for table_name, table_records in dataset_data.items():
                    try:
                        infos = StoredDatastoreItemInfo.from_records(class_name, table_records)
                    except TypeError as exc:
                        raise RuntimeError(
                            "The class specified in the SerializedDatastoreRecordData "
                            f"({class_name}) is not a StoredDatastoreItemInfo."
                        ) from exc
                    dataset_records = records.setdefault(uuid.UUID(dataset_id_str), {})
                    dataset_records.setdefault(table_name, []).extend(infos)
        newRecord = cls(records=records)
        if cache is not None:
            cache[key] = newRecord
        return newRecord
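

# A hedged round-trip sketch, assuming `data` is a populated
# DatastoreRecordData as in the examples above:
#
#     simple = data.to_simple()
#     payload = simple.model_dump_json()
#     restored = DatastoreRecordData.from_simple(
#         SerializedDatastoreRecordData.model_validate_json(payload)
#     )
#     assert restored.records.keys() == data.records.keys()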