# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Support for generic data stores."""
30from __future__ import annotations
32__all__ = ("DatastoreRecordData", "SerializedDatastoreRecordData")
34import dataclasses
35import uuid
36from collections.abc import Mapping
37from typing import TYPE_CHECKING, Any, TypeAlias
39from lsst.daf.butler._compat import PYDANTIC_V2, _BaseModelCompat
40from lsst.utils import doImportType
41from lsst.utils.introspection import get_full_type_name
43from .datasets import DatasetId
44from .dimensions import DimensionUniverse
45from .persistenceContext import PersistenceContextVars
46from .storedFileInfo import StoredDatastoreItemInfo
48if TYPE_CHECKING:
49 from ..registry import Registry
51# Pydantic 2 requires we be explicit about the types that are used in
52# datastore records. Without this UUID can not be handled. Pydantic v1
53# wants the opposite and does not work unless we use Any.
54if PYDANTIC_V2: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true
55 _Record: TypeAlias = dict[str, int | str | uuid.UUID | None]
56else:
57 _Record: TypeAlias = dict[str, Any] # type: ignore
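
# For illustration only (a hedged sketch; the column names below are
# assumptions, since the real columns come from each opaque table
# definition): a single ``_Record`` row, as produced by
# `StoredDatastoreItemInfo.to_record`, is a plain column-to-value mapping,
# e.g.
#
#     {"dataset_id": UUID("..."), "path": "a/b.fits", "checksum": None}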


class SerializedDatastoreRecordData(_BaseModelCompat):
    """Representation of a `DatastoreRecordData` suitable for serialization."""

    dataset_ids: list[uuid.UUID]
    """List of dataset IDs."""

    records: Mapping[str, Mapping[str, list[_Record]]]
    """Lists of records indexed by record class name and table name."""

    @classmethod
    def direct(
        cls,
        *,
        dataset_ids: list[str | uuid.UUID],
        records: dict[str, dict[str, list[_Record]]],
    ) -> SerializedDatastoreRecordData:
        """Construct a `SerializedDatastoreRecordData` directly without
        validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        # See also comments in record_ids_to_uuid()
        for table_data in records.values():
            for table_records in table_data.values():
                for record in table_records:
                    # This only checks the dataset_id value; if there are any
                    # other columns that are UUIDs we would need a more
                    # generic approach.
                    if (id := record.get("dataset_id")) is not None:
                        record["dataset_id"] = uuid.UUID(id) if isinstance(id, str) else id

        data = cls.model_construct(
            _fields_set={"dataset_ids", "records"},
            # JSON makes strings out of UUIDs; we need to convert them back.
            dataset_ids=[uuid.UUID(id) if isinstance(id, str) else id for id in dataset_ids],
            records=records,
        )

        return data
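
# Illustrative sketch (hedged; the UUID strings and the class/table names
# below are made up for the example): `direct` accepts the raw JSON-decoded
# payload and normalizes string UUIDs without running validators, e.g.
#
#     SerializedDatastoreRecordData.direct(
#         dataset_ids=["5c9c6b4c-...-..."],
#         records={
#             "some.package.RecordClass": {
#                 "some_table": [{"dataset_id": "5c9c6b4c-...-...", ...}],
#             },
#         },
#     )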


@dataclasses.dataclass
class DatastoreRecordData:
    """A struct that represents a tabular data export from a single
    datastore.
    """

    records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = dataclasses.field(
        default_factory=dict
    )
    """Opaque table data, indexed by dataset ID and grouped by opaque table
    name."""

    def update(self, other: DatastoreRecordData) -> None:
        """Update contents of this instance with data from another instance.

        Parameters
        ----------
        other : `DatastoreRecordData`
            Records to merge into this instance.

        Notes
        -----
        If a ``(dataset_id, table_name)`` combination has any records in
        ``self``, it is assumed that all records for that combination are
        already present. This allows duplicates of the same dataset to be
        handled gracefully.
        """
        for dataset_id, table_records in other.records.items():
            this_table_records = self.records.setdefault(dataset_id, {})
            for table_name, records in table_records.items():
                # If this (dataset_id, table_name) combination already has
                # records in `self`, we assume that means all of the records
                # for that combination; we require other code to ensure entire
                # (parent) datasets are exported to these data structures
                # (never components).
                if not (this_records := this_table_records.setdefault(table_name, [])):
                    this_records.extend(records)
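
    # Hedged sketch of the merge semantics above (``info_a``/``info_b`` stand
    # for arbitrary StoredDatastoreItemInfo instances; the IDs are made up):
    #
    #     a.records = {id1: {"tbl": [info_a]}}
    #     b.records = {id1: {"tbl": [info_b]}, id2: {"tbl": [info_b]}}
    #     a.update(b)
    #     # id1's "tbl" records are kept as-is (assumed complete); id2 is
    #     # added, so:
    #     # a.records == {id1: {"tbl": [info_a]}, id2: {"tbl": [info_b]}}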

    def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
        """Extract a subset of the records that match given dataset IDs.

        Parameters
        ----------
        dataset_ids : `set` [ `DatasetId` ]
            Dataset IDs to match.

        Returns
        -------
        record_data : `DatastoreRecordData` or `None`
            `None` is returned if there are no matching refs.

        Notes
        -----
        Records in the returned instance are shared with this instance;
        clients should not update or extend records in the returned instance.
        """
        matching_records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        for dataset_id in dataset_ids:
            if (id_records := self.records.get(dataset_id)) is not None:
                matching_records[dataset_id] = id_records
        if matching_records:
            return DatastoreRecordData(records=matching_records)
        else:
            return None
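
    # Hedged usage sketch (``data`` is a populated DatastoreRecordData and
    # the IDs are hypothetical): the per-dataset dicts are shared, so
    # mutating the subset would mutate ``data`` as well.
    #
    #     wanted = data.subset({id1, id2})
    #     if wanted is not None:
    #         ...  # read-only use of wanted.records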

    def to_simple(self, minimal: bool = False) -> SerializedDatastoreRecordData:
        """Make representation of the object for serialization.

        Implements `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True` produce a minimal representation; not used by this
            method.

        Returns
        -------
        simple : `SerializedDatastoreRecordData`
            Representation of this instance suitable for serialization.
        """

        def _class_name(records: list[StoredDatastoreItemInfo]) -> str:
            """Get the fully qualified class name for the records. An empty
            string is returned if the list is empty; an exception is raised
            if the records are of different classes.
            """
            if not records:
                return ""
            classes = {record.__class__ for record in records}
            assert len(classes) == 1, f"Records have to be of the same class: {classes}"
            return get_full_type_name(classes.pop())

        records: dict[str, dict[str, list[_Record]]] = {}
        for table_data in self.records.values():
            for table_name, table_records in table_data.items():
                class_name = _class_name(table_records)
                class_records = records.setdefault(class_name, {})
                class_records.setdefault(table_name, []).extend(
                    [record.to_record() for record in table_records]
                )
        return SerializedDatastoreRecordData(dataset_ids=list(self.records.keys()), records=records)
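
    # Hedged sketch of to_simple's output shape (the class and table names
    # are illustrative, not from the source): records are regrouped from
    # per-dataset to per-class/per-table form, e.g.
    #
    #     SerializedDatastoreRecordData(
    #         dataset_ids=[UUID("...")],
    #         records={"package.module.RecordClass": {"table_name": [{...}]}},
    #     )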

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatastoreRecordData,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
    ) -> DatastoreRecordData:
        """Make an instance of this class from serialized data.

        Implements `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `SerializedDatastoreRecordData`
            Serialized representation returned from the `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe, not used by this method.
        registry : `Registry`, optional
            Registry instance, not used by this method.

        Returns
        -------
        record_data : `DatastoreRecordData`
            De-serialized instance of `DatastoreRecordData`.
        """
        cache = PersistenceContextVars.dataStoreRecords.get()
        key = frozenset(simple.dataset_ids)
        if cache is not None and (cachedRecord := cache.get(key)) is not None:
            return cachedRecord
        records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {}
        # Make sure that all dataset IDs appear in the dict even if they
        # don't have records.
        for dataset_id in simple.dataset_ids:
            records[dataset_id] = {}
        for class_name, table_data in simple.records.items():
            klass = doImportType(class_name)
            if not issubclass(klass, StoredDatastoreItemInfo):
                raise RuntimeError(
                    "The class specified in the SerializedDatastoreRecordData "
                    f"({get_full_type_name(klass)}) is not a StoredDatastoreItemInfo."
                )
            for table_name, table_records in table_data.items():
                for record in table_records:
                    info = klass.from_record(record)
                    dataset_type_records = records.setdefault(info.dataset_id, {})
                    dataset_type_records.setdefault(table_name, []).append(info)
        newRecord = cls(records=records)
        if cache is not None:
            cache[key] = newRecord
        return newRecord
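

# Hedged round-trip sketch (not part of the module; the serialization method
# names depend on the Pydantic version behind _BaseModelCompat and are
# assumptions here, shown with the v1-style names):
#
#     simple = data.to_simple()          # DatastoreRecordData -> model
#     payload = simple.json()            # model -> JSON string
#     restored = DatastoreRecordData.from_simple(
#         SerializedDatastoreRecordData.parse_raw(payload)
#     )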