Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 28% of 105 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

from ... import ddl

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

import copy
from collections import namedtuple
from collections.abc import Iterable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, cast

import sqlalchemy

from ..._named import NamedValueSet
from ...datastore.stored_file_info import StoredDatastoreItemInfo
from ..interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from ..opaque import ByNameOpaqueTableStorage
from .ephemeral import EphemeralDatastoreRegistryBridge

if TYPE_CHECKING:
    from ...datastore import DatastoreTransaction
    from ...dimensions import DimensionUniverse
    from ..interfaces import (
        Database,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        StaticTablesContext,
    )

_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ],
)

# This has to be updated on every schema change.
_VERSION = VersionTuple(0, 2, 0)


def _makeTableSpecs(datasets: type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for tables used by the monolithic datastore
    bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore."
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )
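

# For reference, the two tables specified above look roughly like the
# following (an illustrative sketch only; the actual dataset_id column type
# and the referenced dataset table come from the DatasetRecordStorageManager,
# and dialect details from the Database backend):
#
#     CREATE TABLE dataset_location (
#         datastore_name VARCHAR(256) NOT NULL,
#         dataset_id ... NOT NULL REFERENCES ...,
#         PRIMARY KEY (datastore_name, dataset_id)
#     );
#
# dataset_location_trash has the same columns and primary key but no foreign
# key constraint on dataset_id, so trashed entries can refer to datasets that
# no longer exist in the dataset table.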


class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> list[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        to a list of dictionaries that match the schema of the tables used by
        this class.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id" keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.id} for ref in refs]
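
    # A row produced by _refsToRows looks like the following (sketch; the
    # datastore name is whatever this bridge was constructed with, and the
    # ID type is determined by the dataset manager):
    #     {"datastore_name": "someDatastore", "dataset_id": <dataset UUID>}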

    def ensure(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.ensure(self._tables.dataset_location, *self._refsToRows(refs))

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef], transaction: DatastoreTransaction | None) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid self.check() call via queries like
        #     INSERT INTO dataset_location_trash
        #     SELECT datastore_name, dataset_id FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        #     DELETE FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of queries
        # right now.
        rows = self._refsToRows(self.check(refs))
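        # Perform the move inside a single database transaction so that a
        # failure part-way through cannot leave a ref deleted from
        # dataset_location but missing from the trash table (or present in
        # both).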
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        byId = {ref.id: ref for ref in refs}
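        # The query below is equivalent to (sketch):
        #     SELECT dataset_id FROM dataset_location
        #     WHERE datastore_name = :datastoreName AND dataset_id IN (:ids)
        # i.e. it returns the subset of the given refs that this datastore
        # actually holds.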
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.fetchall()
        for row in sql_rows:
            yield byId[row.dataset_id]

    @contextmanager
    def emptyTrash(
        self,
        records_table: OpaqueTableStorage | None = None,
        record_class: type[StoredDatastoreItemInfo] | None = None,
        record_column: str | None = None,
    ) -> Iterator[tuple[Iterable[tuple[DatasetIdRef, StoredDatastoreItemInfo | None]], set[str] | None]]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.Select:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # SELECT records.dataset_id, records.path FROM records
        # JOIN dataset_location_trash
        #     ON records.dataset_id = dataset_location_trash.dataset_id
        # WHERE dataset_location_trash.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run the query and transform the results into a list of dicts that
        # we can later use to delete.
        with self._db.query(info_in_trash) as sql_result:
            rows = [dict(row, datastore_name=self.datastoreName) for row in sql_result.mappings()]

        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. We can only do this check
        # if the caller provides a column name that can map to multiple
        # refs.
        preserved: set[str] | None = None
        if record_column is not None:
            # Some helper subqueries.
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced both by datasets in the
            # trash and by datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            with self._db.query(items_to_preserve) as sql_result:
                preserved = {row[record_column] for row in sql_result.mappings()}

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from the datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start the context manager and return the results.
        yield (id_info, preserved)

        # No exception was raised in the context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )
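

# A sketch of how a datastore might drive emptyTrash (illustrative only:
# `bridge` is a MonolithicDatastoreRegistryBridge, `records_table` is the
# datastore's opaque records table, and StoredFileInfo stands in for an
# appropriate StoredDatastoreItemInfo subclass):
#
#     with bridge.emptyTrash(
#         records_table, record_class=StoredFileInfo, record_column="path"
#     ) as (trashed, preserved):
#         for ref, info in trashed:
#             if preserved is None or info.path not in preserved:
#                 ...  # delete the artifact backing `ref`
#
# On a clean exit from the block the trash and records rows are deleted; if
# an exception propagates out of the block, they are left in place.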


class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for the dataset ID column.
    registry_schema_version : `VersionTuple` or `None`, optional
        The version of the registry schema.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasetIdColumnType,
            registry_schema_version=registry_schema_version,
        )
        self._db = db
        self._tables = tables
        self._ephemeral: dict[str, EphemeralDatastoreRegistryBridge] = {}
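
    # Bridges created via register(..., ephemeral=True) live only in the
    # in-memory dict above; they are not backed by the database tables.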

    def clone(self, *, db: Database, opaque: OpaqueTableStorageManager) -> DatastoreRegistryBridgeManager:
        return MonolithicDatastoreRegistryBridgeManager(
            db=db,
            tables=self._tables,
            opaque=opaque,
            universe=self.universe,
            datasetIdColumnType=self.datasetIdColumnType,
            registry_schema_version=self._registry_schema_version,
        )

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(
            db=db,
            tables=cast(_TablesTuple, tables),
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
            registry_schema_version=registry_schema_version,
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)
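
    # Note: registering the same ephemeral name twice returns the same
    # in-memory bridge (via dict.setdefault), while non-ephemeral names get a
    # fresh wrapper around the shared tables each time, e.g. (sketch, with
    # `manager` an instance of this class):
    #     assert manager.register("a", ephemeral=True) is manager.register("a", ephemeral=True)
    #     assert manager.register("b") is not manager.register("b")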

    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
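        # The table lookup below is equivalent to (sketch):
        #     SELECT datastore_name FROM dataset_location WHERE dataset_id = :id
        # followed by a scan of the in-memory ephemeral bridges.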
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.datastore_name)
            .select_from(self._tables.dataset_location)
            .where(self._tables.dataset_location.columns.dataset_id == ref.id)
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            yield row[self._tables.dataset_location.columns.datastore_name]
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name

    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return [_VERSION]