Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 29%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

import copy
from collections import namedtuple
from contextlib import contextmanager
from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Type, cast

import sqlalchemy

from ...core import NamedValueSet, StoredDatastoreItemInfo, ddl
from ..interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from ..opaque import ByNameOpaqueTableStorage
from .ephemeral import EphemeralDatastoreRegistryBridge

if TYPE_CHECKING:
    from ...core import DimensionUniverse
    from ..interfaces import (
        Database,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        StaticTablesContext,
    )

_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ],
)

# This has to be updated on every schema change.
_VERSION = VersionTuple(0, 2, 0)


def _makeTableSpecs(datasets: Type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for the tables used by the monolithic
    datastore bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table indicates whether the dataset is present in the corresponding "
            "Datastore."
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )
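

# A minimal usage sketch (assuming ``datasets`` is a concrete
# `DatasetRecordStorageManager` subclass; `initialize` below is the real
# consumer, via `StaticTablesContext.addTableTuple`):
#
#     specs = _makeTableSpecs(datasets)
#     # specs.dataset_location and specs.dataset_location_trash differ only in
#     # that the trash table has no foreign key constraint back to the dataset
#     # table, so trash rows can outlive the datasets they reference.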


class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions for
        working with it.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> List[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        into a list of dictionaries that match the schema of the tables used
        by this class.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id" keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]
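
    # A sketch of the row shape produced above (the values are hypothetical;
    # the dataset ID may be an `int` or a UUID depending on the configured
    # dataset manager):
    #
    #     bridge._refsToRows([ref])
    #     # -> [{"datastore_name": "PosixDatastore@<root>", "dataset_id": 42}]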

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid self.check() call via queries like
        #   INSERT INTO dataset_location_trash
        #     SELECT datastore_name, dataset_id FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        #   DELETE FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of queries
        # right now.
        rows = self._refsToRows(self.check(refs))
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)
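
    # A hedged sketch of what the INSERT ... SELECT from the TODO above could
    # look like in plain SQLAlchemy, should `Database` grow support for
    # insert-from-select:
    #
    #     select = sqlalchemy.sql.select(
    #         self._tables.dataset_location.columns.datastore_name,
    #         self._tables.dataset_location.columns.dataset_id,
    #     ).where(...)
    #     insert = self._tables.dataset_location_trash.insert().from_select(
    #         ["datastore_name", "dataset_id"], select
    #     )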

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        byId = {ref.getCheckedId(): ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        for row in self._db.query(sql).fetchall():
            yield byId[row.dataset_id]
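
    # A short usage sketch: `check` filters an iterable of refs down to those
    # that have a location record for this datastore, yielding the original
    # objects.
    #
    #     stored = set(bridge.check(refs))
    #     missing = [ref for ref in refs if ref not in stored]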

    @contextmanager
    def emptyTrash(
        self,
        records_table: Optional[OpaqueTableStorage] = None,
        record_class: Optional[Type[StoredDatastoreItemInfo]] = None,
        record_column: Optional[str] = None,
    ) -> Iterator[
        Tuple[Iterable[Tuple[DatasetIdRef, Optional[StoredDatastoreItemInfo]]], Optional[Set[str]]]
    ]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.FromClause:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # SELECT records.dataset_id, records.path FROM records
        #   JOIN dataset_location_trash
        #     ON records.dataset_id == dataset_location_trash.dataset_id
        #   WHERE dataset_location_trash.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run the query and transform the results into a list of dicts that
        # we can later use to delete these rows.
        rows = [
            dict(**row, datastore_name=self.datastoreName) for row in self._db.query(info_in_trash).mappings()
        ]

        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. We can only do this check
        # if the caller provides a column name that can map to multiple
        # refs.
        preserved: Optional[Set[str]] = None
        if record_column is not None:
            # Some helper subqueries
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced both by datasets in the
            # trash and by datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            preserved = {row[record_column] for row in self._db.query(items_to_preserve).mappings()}

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from the datastore. The id+info tuple is
        # there solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start the context manager block, returning the results.
        yield id_info, preserved

        # No exception was raised in the context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )
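
    # A hedged usage sketch of the context manager above (``records_table``
    # and ``StoredFileInfo`` stand in for a datastore's opaque-table storage
    # and record class; actual artifact deletion is datastore-specific):
    #
    #     with bridge.emptyTrash(
    #         records_table, record_class=StoredFileInfo, record_column="path"
    #     ) as (trashed, preserved):
    #         for ref, info in trashed:
    #             if preserved is None or info.path not in preserved:
    #                 ...  # delete the artifact associated with this ref
    #     # On a clean exit, the records and trash rows are deleted.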


class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions for
        working with it.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type of the dataset ID column.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
    ):
        super().__init__(opaque=opaque, universe=universe, datasetIdColumnType=datasetIdColumnType)
        self._db = db
        self._tables = tables
        self._ephemeral: Dict[str, EphemeralDatastoreRegistryBridge] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: Type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(
            db=db,
            tables=cast(_TablesTuple, tables),
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)
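
    # A short usage sketch (the datastore names are hypothetical): persistent
    # datastores share the monolithic tables, while ephemeral ones get
    # independent in-memory bridges.
    #
    #     bridge = manager.register("PosixDatastore@<root>")
    #     mem_bridge = manager.register("InMemoryDatastore@", ephemeral=True)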

    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.datastore_name)
            .select_from(self._tables.dataset_location)
            .where(self._tables.dataset_location.columns.dataset_id == ref.getCheckedId())
        )
        for row in self._db.query(sql).mappings():
            yield row[self._tables.dataset_location.columns.datastore_name]
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name
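
    # Usage sketch: yields the names of every datastore (persistent first,
    # then ephemeral) that claims to hold ``ref``.
    #
    #     names = list(manager.findDatastores(ref))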

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return _VERSION

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._tables, self._db.dialect)