Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 27%
101 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-25 15:14 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")
25import copy
26from collections import namedtuple
27from collections.abc import Iterable, Iterator
28from contextlib import contextmanager
29from typing import TYPE_CHECKING, cast
31import sqlalchemy
33from ...core import NamedValueSet, StoredDatastoreItemInfo, ddl
34from ..interfaces import (
35 DatasetIdRef,
36 DatastoreRegistryBridge,
37 DatastoreRegistryBridgeManager,
38 FakeDatasetRef,
39 OpaqueTableStorage,
40 VersionTuple,
41)
42from ..opaque import ByNameOpaqueTableStorage
43from .ephemeral import EphemeralDatastoreRegistryBridge
45if TYPE_CHECKING:
46 from ...core import DimensionUniverse
47 from ...core.datastore import DatastoreTransaction
48 from ..interfaces import (
49 Database,
50 DatasetRecordStorageManager,
51 OpaqueTableStorageManager,
52 StaticTablesContext,
53 )
55_TablesTuple = namedtuple(
56 "_TablesTuple",
57 [
58 "dataset_location",
59 "dataset_location_trash",
60 ],
61)
# This has to be updated on every schema change.  It is the version reported
# by MonolithicDatastoreRegistryBridgeManager.currentVersions() below.
_VERSION = VersionTuple(0, 2, 0)
def _makeTableSpecs(datasets: type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for tables used by the monolithic datastore
    bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # The dataset_location and dataset_location_trash tables are identical
    # except for the behavior of their link to the dataset table: only the
    # former carries a foreign key constraint, so trash rows can outlive
    # their dataset records.
    common_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore. "
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    # Each table gets its own deep copy of the shared spec before the
    # dataset_id foreign key field is appended to it.
    location_spec = copy.deepcopy(common_spec)
    trash_spec = copy.deepcopy(common_spec)
    datasets.addDatasetForeignKey(location_spec, primaryKey=True)
    datasets.addDatasetForeignKey(trash_spec, primaryKey=True, constraint=False)
    return _TablesTuple(dataset_location=location_spec, dataset_location_trash=trash_spec)
class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> list[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects to
        a list of dictionaries that match the schema of the tables used by this
        class.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id" keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.id} for ref in refs]

    def ensure(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.ensure(self._tables.dataset_location, *self._refsToRows(refs))

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # Filter through self.check() first so we only issue deletes for rows
        # that actually exist for this datastore.
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef], transaction: DatastoreTransaction | None) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid self.check() call via queries like
        #     INSERT INTO dataset_location_trash
        #         SELECT datastore_name, dataset_id FROM dataset_location
        #             WHERE datastore_name=? AND dataset_id IN (?);
        #     DELETE FROM dataset_location
        #         WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of queries
        # right now.
        rows = self._refsToRows(self.check(refs))
        # Delete-then-insert must be atomic so a ref is never absent from
        # both tables (or present in both) if something fails partway.
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        # Map IDs back to the original ref objects so we can yield the same
        # objects the caller passed in, not bare IDs.
        byId = {ref.id: ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        # Fetch all rows before yielding so the DB cursor is closed even if
        # the caller does not exhaust this generator.
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.fetchall()
        for row in sql_rows:
            yield byId[row.dataset_id]

    @contextmanager
    def emptyTrash(
        self,
        records_table: OpaqueTableStorage | None = None,
        record_class: type[StoredDatastoreItemInfo] | None = None,
        record_column: str | None = None,
    ) -> Iterator[tuple[Iterable[tuple[DatasetIdRef, StoredDatastoreItemInfo | None]], set[str] | None]]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        # NOTE(review): this reaches into records_table._table (a private
        # attribute), hence the isinstance requirement below.
        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.Select:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # SELECT records.dataset_id, records.path FROM records
        #    JOIN records on dataset_location.dataset_id == records.dataset_id
        #    WHERE dataset_location.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run query, transform results into a list of dicts that we can later
        # use to delete.
        with self._db.query(info_in_trash) as sql_result:
            rows = [dict(row, datastore_name=self.datastoreName) for row in sql_result.mappings()]

        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. Can only do this check
        # if the caller provides a column name that can map to multiple
        # refs.
        preserved: set[str] | None = None
        if record_column is not None:
            # Some helper subqueries
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced by datasets in the trash
            # and datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            with self._db.query(items_to_preserve) as sql_result:
                preserved = {row[record_column] for row in sql_result.mappings()}

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start contextmanager, return results
        yield ((id_info, preserved))

        # No exception raised in context manager block.
        # Rows are only removed after the caller's block finished cleanly;
        # if it raised, both tables are left untouched for a retry.
        if not rows:
            return

        # Delete the rows from the records table
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )
class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the same
    two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for dataset ID column.
    registry_schema_version : `VersionTuple` or `None`, optional
        Schema version of the registry this manager was loaded from.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasetIdColumnType,
            registry_schema_version=registry_schema_version,
        )
        self._db = db
        self._tables = tables
        # Ephemeral bridges are kept purely in Python, keyed by datastore
        # name; they have no backing database tables.
        self._ephemeral: dict[str, EphemeralDatastoreRegistryBridge] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(
            db=db,
            tables=cast(_TablesTuple, tables),
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
            registry_schema_version=registry_schema_version,
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)

    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.datastore_name)
            .select_from(self._tables.dataset_location)
            .where(self._tables.dataset_location.columns.dataset_id == ref.id)
        )
        # Fetch all rows before yielding so the DB cursor is closed even if
        # the caller does not exhaust this generator.
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            yield row[self._tables.dataset_location.columns.datastore_name]
        # Ephemeral datastores are tracked in memory, not in the table above.
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name

    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return [_VERSION]