Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 27%
101 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")
31import copy
32from collections import namedtuple
33from collections.abc import Iterable, Iterator
34from contextlib import contextmanager
35from typing import TYPE_CHECKING, cast
37import sqlalchemy
39from ...core import NamedValueSet, StoredDatastoreItemInfo, ddl
40from ..interfaces import (
41 DatasetIdRef,
42 DatastoreRegistryBridge,
43 DatastoreRegistryBridgeManager,
44 FakeDatasetRef,
45 OpaqueTableStorage,
46 VersionTuple,
47)
48from ..opaque import ByNameOpaqueTableStorage
49from .ephemeral import EphemeralDatastoreRegistryBridge
51if TYPE_CHECKING:
52 from ...core import DimensionUniverse
53 from ...core.datastore import DatastoreTransaction
54 from ..interfaces import (
55 Database,
56 DatasetRecordStorageManager,
57 OpaqueTableStorageManager,
58 StaticTablesContext,
59 )
# Named tuple holding the two SQLAlchemy tables shared by every
# non-ephemeral datastore: one recording where datasets currently live,
# and one recording locations that have been moved to the trash but not
# yet emptied.
_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ],
)

# This has to be updated on every schema change
_VERSION = VersionTuple(0, 2, 0)
def _makeTableSpecs(datasets: type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for tables used by the monolithic datastore
    bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore. "
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    # Deep-copy the shared spec so the two tables get independent field sets
    # before each has its own dataset foreign-key field appended.
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    # constraint=False: a trashed row may outlive the dataset row it refers
    # to, so the trash table must not enforce referential integrity.
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )
class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> list[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects to
        a list of dictionaries that match the schema of the tables used by this
        class.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id" keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.id} for ref in refs]

    def ensure(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.ensure(self._tables.dataset_location, *self._refsToRows(refs))

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # Only delete rows that actually exist for this datastore; check()
        # filters the given refs down to those present in dataset_location.
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef], transaction: DatastoreTransaction | None) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid self.check() call via queries like
        #    INSERT INTO dataset_location_trash
        #        SELECT datastore_name, dataset_id FROM dataset_location
        #            WHERE datastore_name=? AND dataset_id IN (?);
        #    DELETE FROM dataset_location
        #        WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of queries
        # right now.
        rows = self._refsToRows(self.check(refs))
        # Wrap the delete+insert pair in one transaction so a failure cannot
        # leave a ref absent from both the location and trash tables.
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        # Map dataset ID back to the original ref object so we can yield the
        # caller's own objects rather than reconstructing them.
        byId = {ref.id: ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        # Fetch all rows inside the query context before yielding, so the
        # database cursor is released even if the caller abandons the
        # generator early.
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.fetchall()
        for row in sql_rows:
            yield byId[row.dataset_id]

    @contextmanager
    def emptyTrash(
        self,
        records_table: OpaqueTableStorage | None = None,
        record_class: type[StoredDatastoreItemInfo] | None = None,
        record_column: str | None = None,
    ) -> Iterator[tuple[Iterable[tuple[DatasetIdRef, StoredDatastoreItemInfo | None]], set[str] | None]]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.Select:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # SELECT records.dataset_id, records.path FROM records
        #   JOIN records on dataset_location.dataset_id == records.dataset_id
        #   WHERE dataset_location.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run query, transform results into a list of dicts that we can later
        # use to delete.
        with self._db.query(info_in_trash) as sql_result:
            rows = [dict(row, datastore_name=self.datastoreName) for row in sql_result.mappings()]

        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. Can only do this check
        # if the caller provides a column name that can map to multiple
        # refs.
        preserved: set[str] | None = None
        if record_column is not None:
            # Some helper subqueries
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced by datasets in the trash
            # and datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            with self._db.query(items_to_preserve) as sql_result:
                preserved = {row[record_column] for row in sql_result.mappings()}

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start contextmanager, return results
        yield ((id_info, preserved))

        # No exception raised in context manager block.
        # NOTE(review): if the caller's block raises, we deliberately skip the
        # deletes below, leaving the trash rows in place for a later attempt.
        if not rows:
            return

        # Delete the rows from the records table
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )
class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the same
    two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for dataset ID column.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasetIdColumnType,
            registry_schema_version=registry_schema_version,
        )
        self._db = db
        self._tables = tables
        # Ephemeral bridges are kept purely in memory, one per datastore name.
        self._ephemeral: dict[str, EphemeralDatastoreRegistryBridge] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridge
        specs = _makeTableSpecs(datasets)
        # addTableTuple returns a generic tuple; narrow it for mypy.
        tables = cast(_TablesTuple, context.addTableTuple(specs))
        return cls(
            db=db,
            tables=tables,
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
            registry_schema_version=registry_schema_version,
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridge
        if not ephemeral:
            return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)
        return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))

    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridge
        location = self._tables.dataset_location
        query = (
            sqlalchemy.sql.select(location.columns.datastore_name)
            .select_from(location)
            .where(location.columns.dataset_id == ref.id)
        )
        # Materialize results before yielding so the cursor is closed even if
        # the caller does not exhaust this generator.
        with self._db.query(query) as result:
            records = result.mappings().fetchall()
        for record in records:
            yield record[location.columns.datastore_name]
        # Ephemeral datastores are not recorded in the database; ask each
        # in-memory bridge directly.
        for ephemeral_name, ephemeral_bridge in self._ephemeral.items():
            if ref in ephemeral_bridge:
                yield ephemeral_name

    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return [_VERSION]