Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 23%
98 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-06 02:34 -0700
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-06 02:34 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")
25import copy
26from collections import namedtuple
27from contextlib import contextmanager
28from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Type, cast
30import sqlalchemy
32from ...core import NamedValueSet, StoredDatastoreItemInfo, ddl
33from ..interfaces import (
34 DatasetIdRef,
35 DatastoreRegistryBridge,
36 DatastoreRegistryBridgeManager,
37 FakeDatasetRef,
38 OpaqueTableStorage,
39 VersionTuple,
40)
41from ..opaque import ByNameOpaqueTableStorage
42from .ephemeral import EphemeralDatastoreRegistryBridge
44if TYPE_CHECKING:
45 from ...core import DimensionUniverse
46 from ...core.datastore import DatastoreTransaction
47 from ..interfaces import (
48 Database,
49 DatasetRecordStorageManager,
50 OpaqueTableStorageManager,
51 StaticTablesContext,
52 )
# Named tuple bundling the two location tables shared by all non-ephemeral
# datastores: the "live" table and its trash counterpart.
_TablesTuple = namedtuple(
    "_TablesTuple",
    (
        "dataset_location",
        "dataset_location_trash",
    ),
)

# Schema version for the bridge tables; this has to be updated on every
# schema change.
_VERSION = VersionTuple(0, 2, 0)
def _makeTableSpecs(datasets: Type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for tables used by the monolithic datastore
    bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore. "
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    # Deep-copy the shared spec so each table gets independent field objects
    # before the dataset foreign key is attached.
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    # The trash table drops the constraint (constraint=False) so its rows can
    # outlive the dataset rows they refer to during artifact cleanup.
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )
class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> List[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects to
        a list of dictionaries that match the schema of the tables used by this
        class.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id" keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.id} for ref in refs]

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # Only delete rows that actually exist for this datastore; check()
        # filters the input down to refs present in dataset_location.
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef], transaction: Optional[DatastoreTransaction]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid self.check() call via queries like
        #     INSERT INTO dataset_location_trash
        #         SELECT datastore_name, dataset_id FROM dataset_location
        #             WHERE datastore_name=? AND dataset_id IN (?);
        #     DELETE FROM dataset_location
        #         WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of queries
        # right now.
        rows = self._refsToRows(self.check(refs))
        # Delete from the live table and insert into trash atomically, so a
        # failure cannot leave a ref present in neither (or both) tables.
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        # Map IDs back to the original ref objects so we can yield the same
        # objects the caller passed in, not reconstructed ones.
        byId = {ref.id: ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        # Fetch all rows before yielding so the database cursor is closed
        # even if the caller does not exhaust this generator.
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.fetchall()
        for row in sql_rows:
            yield byId[row.dataset_id]

    @contextmanager
    def emptyTrash(
        self,
        records_table: Optional[OpaqueTableStorage] = None,
        record_class: Optional[Type[StoredDatastoreItemInfo]] = None,
        record_column: Optional[str] = None,
    ) -> Iterator[
        Tuple[Iterable[Tuple[DatasetIdRef, Optional[StoredDatastoreItemInfo]]], Optional[Set[str]]]
    ]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.Select:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # SELECT records.dataset_id, records.path FROM records
        #    JOIN records on dataset_location.dataset_id == records.dataset_id
        #    WHERE dataset_location.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run query, transform results into a list of dicts that we can later
        # use to delete.
        with self._db.query(info_in_trash) as sql_result:
            rows = [dict(row, datastore_name=self.datastoreName) for row in sql_result.mappings()]

        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. Can only do this check
        # if the caller provides a column name that can map to multiple
        # refs.
        preserved: Optional[Set[str]] = None
        if record_column is not None:
            # Some helper subqueries
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced by datasets in the trash
            # and datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            with self._db.query(items_to_preserve) as sql_result:
                preserved = {row[record_column] for row in sql_result.mappings()}

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start contextmanager, return results
        yield ((id_info, preserved))

        # No exception raised in context manager block.
        # NOTE: the table cleanup below is deliberately skipped when the
        # caller's block raises, leaving the trash rows in place for a retry.
        if not rows:
            return

        # Delete the rows from the records table
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )
class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the same
    two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for dataset ID column.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasetIdColumnType,
            registry_schema_version=registry_schema_version,
        )
        self._db = db
        self._tables = tables
        # Ephemeral bridges exist only in Python memory, keyed by name.
        self._ephemeral: Dict[str, EphemeralDatastoreRegistryBridge] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: Type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridge
        specs = _makeTableSpecs(datasets)
        tables = cast(_TablesTuple, context.addTableTuple(specs))
        return cls(
            db=db,
            tables=tables,
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
            registry_schema_version=registry_schema_version,
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # Nothing is cached in Python for the shared tables, so there is
        # no state to refresh.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridge
        if not ephemeral:
            return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)
        # Reuse an existing ephemeral bridge for this name if one was
        # already registered.
        return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))

    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridge
        location = self._tables.dataset_location
        name_column = location.columns.datastore_name
        sql = (
            sqlalchemy.sql.select(name_column)
            .select_from(location)
            .where(location.columns.dataset_id == ref.id)
        )
        # Materialize results before yielding so the cursor is closed even
        # if the caller abandons the generator early.
        with self._db.query(sql) as sql_result:
            matches = sql_result.mappings().fetchall()
        for match in matches:
            yield match[name_column]
        # Ephemeral datastores are not in the database; consult them directly.
        for ephemeral_name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield ephemeral_name

    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return [_VERSION]