Coverage for python/lsst/daf/butler/datastore/record_data.py: 33% (73 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Support for generic data stores."""

from __future__ import annotations

__all__ = ("DatastoreRecordData", "SerializedDatastoreRecordData")

import dataclasses
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, TypeAlias

from lsst.daf.butler._compat import PYDANTIC_V2, _BaseModelCompat

from .._dataset_ref import DatasetId
from ..dimensions import DimensionUniverse
from ..persistence_context import PersistenceContextVars
from .stored_file_info import StoredDatastoreItemInfo

if TYPE_CHECKING:
    from ..registry import Registry

# Pydantic 2 requires us to be explicit about the types that are used in
# datastore records. Without this, UUID cannot be handled. Pydantic v1
# wants the opposite and does not work unless we use Any.
if PYDANTIC_V2:
    _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
else:
    _Record: TypeAlias = dict[str, Any]  # type: ignore
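
# For illustration, a single _Record is a plain column-name -> value mapping.
# This is a minimal sketch; the keys shown here are hypothetical, since the
# real column names come from each opaque table's definition:
#
#     {"dataset_id": uuid.UUID("..."), "path": "a/b/c.fits", "checksum": None}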


class SerializedDatastoreRecordData(_BaseModelCompat):
    """Representation of a `DatastoreRecordData` suitable for serialization."""

    dataset_ids: list[uuid.UUID]
    """List of dataset IDs."""

    records: Mapping[str, Mapping[str, Mapping[str, list[_Record]]]]
    """Records indexed by record class name, dataset ID (encoded as `str`,
    because JSON), and opaque table name.
    """

    @classmethod
    def direct(
        cls,
        *,
        dataset_ids: list[str | uuid.UUID],
        records: dict[str, dict[str, dict[str, list[_Record]]]],
    ) -> SerializedDatastoreRecordData:
        """Construct a `SerializedDatastoreRecordData` directly without
        validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
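
        Examples
        --------
        A minimal sketch with a hypothetical dataset ID and no records:

        >>> import uuid
        >>> serialized = SerializedDatastoreRecordData.direct(
        ...     dataset_ids=[str(uuid.uuid4())], records={}
        ... )
        >>> len(serialized.dataset_ids)
        1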
85 """
86 data = cls.model_construct(
87 _fields_set={"dataset_ids", "records"},
88 # JSON makes strings out of UUIDs, need to convert them back
89 dataset_ids=[uuid.UUID(id) if isinstance(id, str) else id for id in dataset_ids],
90 records=records,
91 )
93 return data


@dataclasses.dataclass
class DatastoreRecordData:
    """A struct that represents a tabular data export from a single
    datastore.
    """

    records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = dataclasses.field(
        default_factory=dict
    )
    """Opaque table data, indexed by dataset ID and grouped by opaque table
    name."""

    def update(self, other: DatastoreRecordData) -> None:
        """Update contents of this instance with data from another instance.

        Parameters
        ----------
        other : `DatastoreRecordData`
            Records to merge into this instance.

        Notes
        -----
        If a ``(dataset_id, table_name)`` combination has any records in
        ``self``, it is assumed that all records for that combination are
        already present. This allows duplicates of the same dataset to be
        handled gracefully.
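
        Examples
        --------
        A minimal sketch of the merge behavior; the dataset ID and table
        name are hypothetical, and the empty record list is just for
        illustration:

        >>> import uuid
        >>> dataset_id = uuid.uuid4()
        >>> ours = DatastoreRecordData()
        >>> theirs = DatastoreRecordData(records={dataset_id: {"some_table": []}})
        >>> ours.update(theirs)
        >>> dataset_id in ours.records
        True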
122 """
123 for dataset_id, table_records in other.records.items():
124 this_table_records = self.records.setdefault(dataset_id, {})
125 for table_name, records in table_records.items():
126 # If this (dataset_id, table_name) combination already has
127 # records in `self`, we assume that means all of the records
128 # for that combination; we require other code to ensure entire
129 # (parent) datasets are exported to these data structures
130 # (never components).
131 if not (this_records := this_table_records.setdefault(table_name, [])):
132 this_records.extend(records)

    def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
        """Extract a subset of the records that match the given dataset IDs.

        Parameters
        ----------
        dataset_ids : `set` [ `DatasetId` ]
            Dataset IDs to match.

        Returns
        -------
        record_data : `DatastoreRecordData` or `None`
            `None` is returned if there are no matching refs.

        Notes
        -----
        Records in the returned instance are shared with this instance;
        clients should not update or extend records in the returned instance.
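
        Examples
        --------
        A minimal sketch with a hypothetical dataset ID:

        >>> import uuid
        >>> dataset_id = uuid.uuid4()
        >>> data = DatastoreRecordData(records={dataset_id: {}})
        >>> data.subset({dataset_id}) is not None
        True
        >>> data.subset({uuid.uuid4()}) is None
        True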
151 """
152 matching_records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
153 for dataset_id in dataset_ids:
154 if (id_records := self.records.get(dataset_id)) is not None:
155 matching_records[dataset_id] = id_records
156 if matching_records:
157 return DatastoreRecordData(records=matching_records)
158 else:
159 return None

    def to_simple(self, minimal: bool = False) -> SerializedDatastoreRecordData:
        """Make a representation of the object suitable for serialization.

        Implements the `~lsst.daf.butler.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True`, produce a minimal representation; not used by this
            method.

        Returns
        -------
        simple : `SerializedDatastoreRecordData`
            Representation of this instance as a simple serializable object.
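
        Examples
        --------
        A minimal sketch serializing an instance that holds one
        (hypothetical) dataset ID and no table records:

        >>> import uuid
        >>> data = DatastoreRecordData(records={uuid.uuid4(): {}})
        >>> simple = data.to_simple()
        >>> len(simple.dataset_ids)
        1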
175 """
176 records: dict[str, dict[str, dict[str, list[_Record]]]] = {}
177 for dataset_id, table_data in self.records.items():
178 for table_name, table_records in table_data.items():
179 class_name, infos = StoredDatastoreItemInfo.to_records(table_records)
180 class_records = records.setdefault(class_name, {})
181 dataset_records = class_records.setdefault(dataset_id.hex, {})
182 dataset_records.setdefault(table_name, []).extend(dict(info) for info in infos)
183 return SerializedDatastoreRecordData(dataset_ids=list(self.records.keys()), records=records)

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatastoreRecordData,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatastoreRecordData:
        """Make an instance of this class from serialized data.

        Implements the `~lsst.daf.butler.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `SerializedDatastoreRecordData`
            Serialized representation returned from the `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe; not used by this method.
        registry : `Registry`, optional
            Registry instance; not used by this method.

        Returns
        -------
        record_data : `DatastoreRecordData`
            Deserialized instance of `DatastoreRecordData`.
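
        Examples
        --------
        A minimal round trip through the serialized form, with a
        hypothetical dataset ID:

        >>> import uuid
        >>> original = DatastoreRecordData(records={uuid.uuid4(): {}})
        >>> restored = DatastoreRecordData.from_simple(original.to_simple())
        >>> restored.records.keys() == original.records.keys()
        True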
209 """
210 cache = PersistenceContextVars.dataStoreRecords.get()
211 key = frozenset(simple.dataset_ids)
212 if cache is not None and (cachedRecord := cache.get(key)) is not None:
213 return cachedRecord
214 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
215 # make sure that all dataset IDs appear in the dict even if they don't
216 # have records.
217 for dataset_id in simple.dataset_ids:
218 records[dataset_id] = {}
219 for class_name, class_data in simple.records.items():
220 for dataset_id_str, dataset_data in class_data.items():
221 for table_name, table_records in dataset_data.items():
222 try:
223 infos = StoredDatastoreItemInfo.from_records(class_name, table_records)
224 except TypeError as exc:
225 raise RuntimeError(
226 "The class specified in the SerializedDatastoreRecordData "
227 f"({class_name}) is not a StoredDatastoreItemInfo."
228 ) from exc
229 dataset_records = records.setdefault(uuid.UUID(dataset_id_str), {})
230 dataset_records.setdefault(table_name, []).extend(infos)
231 newRecord = cls(records=records)
232 if cache is not None:
233 cache[key] = newRecord
234 return newRecord