Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 84%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

import copy
from collections import namedtuple
from contextlib import contextmanager
from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Type, cast

import sqlalchemy
from lsst.daf.butler import DatasetRef, NamedValueSet, StoredDatastoreItemInfo, ddl
from lsst.daf.butler.registry.bridge.ephemeral import EphemeralDatastoreRegistryBridge
from lsst.daf.butler.registry.interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from lsst.daf.butler.registry.opaque import ByNameOpaqueTableStorage

43if TYPE_CHECKING: 43 ↛ 44line 43 didn't jump to line 44, because the condition on line 43 was never true
44 from lsst.daf.butler import DimensionUniverse
45 from lsst.daf.butler.registry.interfaces import (
46 Database,
47 DatasetRecordStorageManager,
48 OpaqueTableStorageManager,
49 StaticTablesContext,
50 )
_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ],
)

# This has to be updated on every schema change
_VERSION = VersionTuple(0, 2, 0)


def _makeTableSpecs(datasets: Type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for tables used by the monolithic datastore
    bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore."
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
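    # Both tables now share the composite primary key (datastore_name,
    # dataset_id); only dataset_location carries the foreign-key constraint
    # back to the dataset table, since the trash copy is built with
    # constraint=False.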
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )


class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> List[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        to a list of dictionaries that match the schema of the tables used by
        this class.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id" keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
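        # Filter through self.check() first so we only issue deletes for rows
        # that actually exist for this datastore.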
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid self.check() call via queries like
        #     INSERT INTO dataset_location_trash
        #         SELECT datastore_name, dataset_id FROM dataset_location
        #         WHERE datastore_name=? AND dataset_id IN (?);
        #     DELETE FROM dataset_location
        #         WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of queries
        # right now.
        rows = self._refsToRows(self.check(refs))
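        # Do the delete and insert inside one transaction so a failure cannot
        # leave a ref in both tables or in neither.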
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        byId = {ref.getCheckedId(): ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
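        # The query returns only the IDs actually present for this datastore;
        # yield the caller's original ref objects for those IDs.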
        for row in self._db.query(sql).fetchall():
            yield byId[row.dataset_id]

    @contextmanager
    def emptyTrash(
        self,
        records_table: Optional[OpaqueTableStorage] = None,
        record_class: Optional[Type[StoredDatastoreItemInfo]] = None,
        record_column: Optional[str] = None,
    ) -> Iterator[
        Tuple[Iterable[Tuple[DatasetIdRef, Optional[StoredDatastoreItemInfo]]], Optional[Set[str]]]
    ]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:  # coverage: branch never taken in tests
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:  # coverage: branch never taken in tests
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.FromClause:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # SELECT records.dataset_id, records.path FROM records
        #     JOIN dataset_location_trash
        #         ON records.dataset_id == dataset_location_trash.dataset_id
        #     WHERE dataset_location_trash.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run query, transform results into a list of dicts that we can later
        # use to delete.
        rows = [
            dict(**row, datastore_name=self.datastoreName) for row in self._db.query(info_in_trash).mappings()
        ]

        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. We can only do this check
        # if the caller provides a column name that can map to multiple
        # refs.
        preserved: Optional[Set[str]] = None
        if record_column is not None:  # coverage: condition never false in tests
            # Some helper subqueries.
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced both by datasets in the
            # trash and by datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            preserved = {row[record_column] for row in self._db.query(items_to_preserve).mappings()}

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from the datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start the context manager, return results.
        yield (id_info, preserved)

        # No exception raised in context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )


class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for the dataset ID column.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
    ):
        super().__init__(opaque=opaque, universe=universe, datasetIdColumnType=datasetIdColumnType)
        self._db = db
        self._tables = tables
        self._ephemeral: Dict[str, EphemeralDatastoreRegistryBridge] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: Type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(
            db=db,
            tables=cast(_TablesTuple, tables),
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
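        # Ephemeral bridges hold per-process state, so reuse one instance per
        # name; the monolithic bridges are stateless wrappers over the shared
        # tables and can be constructed on demand.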
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)

    def findDatastores(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.datastore_name)
            .select_from(self._tables.dataset_location)
            .where(self._tables.dataset_location.columns.dataset_id == ref.getCheckedId())
        )
        for row in self._db.query(sql).mappings():
            yield row[self._tables.dataset_location.columns.datastore_name]
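        # Ephemeral datastore contents are tracked only in memory, so check
        # the registered ephemeral bridges directly.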
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return _VERSION

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._tables, self._db.dialect)
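

# A minimal usage sketch (illustrative only; `db`, `context`, `opaque`,
# `datasets`, `universe`, `refs`, `records_table`, and `record_class` are
# assumed to come from an existing `Registry` setup):
#
#     manager = MonolithicDatastoreRegistryBridgeManager.initialize(
#         db, context, opaque=opaque, datasets=datasets, universe=universe
#     )
#     bridge = manager.register("FileDatastore")
#     bridge.insert(refs)       # record that refs now live in this datastore
#     bridge.moveToTrash(refs)  # stage the same refs for deletion
#     with bridge.emptyTrash(
#         records_table=records_table, record_class=record_class, record_column="path"
#     ) as (trashed, preserved):
#         ...  # caller deletes artifacts for `trashed`, skipping any in `preserved`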