Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 84%
97 statements
coverage.py v6.5.0, created at 2022-12-01 19:54 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

from collections import namedtuple
from contextlib import contextmanager
import copy
from typing import cast, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Type, TYPE_CHECKING

import sqlalchemy

from lsst.daf.butler import DatasetRef, ddl, NamedValueSet, StoredDatastoreItemInfo
from lsst.daf.butler.registry.interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from lsst.daf.butler.registry.opaque import ByNameOpaqueTableStorage
from lsst.daf.butler.registry.bridge.ephemeral import EphemeralDatastoreRegistryBridge

if TYPE_CHECKING:
    from lsst.daf.butler import DimensionUniverse
    from lsst.daf.butler.registry.interfaces import (
        Database,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        StaticTablesContext,
    )

_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ]
)

# This has to be updated on every schema change.
_VERSION = VersionTuple(0, 2, 0)


def _makeTableSpecs(datasets: Type[DatasetRecordStorageManager]) -> _TablesTuple:
66 """Construct specifications for tables used by the monolithic datastore
67 bridge classes.
69 Parameters
70 ----------
71 universe : `DimensionUniverse`
72 All dimensions known to the `Registry`.
73 datasets : subclass of `DatasetRecordStorageManager`
74 Manager class for datasets; used only to create foreign key fields.
76 Returns
77 -------
78 specs : `_TablesTuple`
79 A named tuple containing `ddl.TableSpec` instances.
80 """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore. "
        ),
        fields=NamedValueSet([
            ddl.FieldSpec(
                name="datastore_name",
                dtype=sqlalchemy.String,
                length=256,
                primaryKey=True,
                nullable=False,
                doc="Name of the Datastore this entry corresponds to.",
            ),
        ]),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )
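
# For orientation, the resulting DDL is roughly the following (a hypothetical
# SQL sketch; the actual dataset_id column type and foreign-key target come
# from the datasets manager, not from this module):
#
#   CREATE TABLE dataset_location (
#       datastore_name VARCHAR(256) NOT NULL,
#       dataset_id <id-type> NOT NULL REFERENCES dataset,
#       PRIMARY KEY (datastore_name, dataset_id)
#   );
#
# dataset_location_trash has the same columns but no foreign-key constraint,
# so trash rows can outlive the dataset rows they refer to.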


class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """
    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> List[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        into a list of dictionaries that match the schema of the tables used
        by this class.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id" keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid the self.check() call via queries like
        #   INSERT INTO dataset_location_trash
        #     SELECT datastore_name, dataset_id FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        #   DELETE FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of queries
        # right now.
        rows = self._refsToRows(self.check(refs))
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        byId = {ref.getCheckedId(): ref for ref in refs}
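        # Roughly: SELECT dataset_id FROM dataset_location
        #   WHERE datastore_name = :name AND dataset_id IN (:ids)
        # so only refs actually recorded for this datastore are yielded.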
        sql = sqlalchemy.sql.select(
            self._tables.dataset_location.columns.dataset_id
        ).select_from(
            self._tables.dataset_location
        ).where(
            sqlalchemy.sql.and_(
                self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                self._tables.dataset_location.columns.dataset_id.in_(byId.keys())
            )
        )
        for row in self._db.query(sql).fetchall():
            yield byId[row.dataset_id]

    @contextmanager
    def emptyTrash(self, records_table: Optional[OpaqueTableStorage] = None,
                   record_class: Optional[Type[StoredDatastoreItemInfo]] = None,
                   record_column: Optional[str] = None,
                   ) -> Iterator[Tuple[Iterable[Tuple[DatasetIdRef,
                                                      Optional[StoredDatastoreItemInfo]]],
                                       Optional[Set[str]]]]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(records_table, ByNameOpaqueTableStorage), \
            f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
                         ) -> sqlalchemy.sql.FromClause:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(
                location_table.columns.datastore_name == self.datastoreName
            )

        # The query below is roughly:
        #   SELECT records.* FROM records
        #     JOIN dataset_location_trash
        #       ON records.dataset_id == dataset_location_trash.dataset_id
        #     WHERE dataset_location_trash.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run the query and transform the results into a list of dicts that
        # we can later use to delete.
        rows = [dict(**row, datastore_name=self.datastoreName)
                for row in self._db.query(info_in_trash).mappings()]

        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. We can only do this check
        # if the caller provides a column name that can map to multiple
        # refs.
        preserved: Optional[Set[str]] = None
        if record_column is not None:
            # Some helper subqueries
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced both by datasets in the
            # trash and by datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(
                items_in_trash.columns[record_column]
            ).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column]
                )
            )
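
            # The combined query is roughly (a hypothetical sketch, with
            # "path" standing in for record_column):
            #   SELECT items_in_trash.path FROM items_not_in_trash
            #     JOIN items_in_trash
            #       ON items_in_trash.path = items_not_in_trash.path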
            preserved = {row[record_column]
                         for row in self._db.query(items_to_preserve).mappings()}

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from the datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row))
                   for row in rows)

        # Start the context manager, return results.
        yield id_info, preserved

        # No exception raised in the context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"],
                             *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(self._tables.dataset_location_trash, ["dataset_id", "datastore_name"],
                        *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]}
                          for row in rows])
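
    # Typical datastore-side use of emptyTrash, as a hypothetical sketch
    # (StoredFileInfo is daf_butler's file-record class; remove_artifact is
    # a made-up helper standing in for actual artifact deletion):
    #
    #   with bridge.emptyTrash(records_table, record_class=StoredFileInfo,
    #                          record_column="path") as (trashed, preserved):
    #       for ref, info in trashed:
    #           if info.path not in (preserved or ()):
    #               remove_artifact(info.path)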


class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for the dataset ID column.
    """
    def __init__(self, *, db: Database, tables: _TablesTuple,
                 opaque: OpaqueTableStorageManager, universe: DimensionUniverse,
                 datasetIdColumnType: type):
        super().__init__(opaque=opaque, universe=universe, datasetIdColumnType=datasetIdColumnType)
        self._db = db
        self._tables = tables
        self._ephemeral: Dict[str, EphemeralDatastoreRegistryBridge] = {}

    @classmethod
    def initialize(cls, db: Database, context: StaticTablesContext, *,
                   opaque: OpaqueTableStorageManager,
                   datasets: Type[DatasetRecordStorageManager],
                   universe: DimensionUniverse,
                   ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(db=db, tables=cast(_TablesTuple, tables), opaque=opaque, universe=universe,
                   datasetIdColumnType=datasets.getIdColumnType())

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
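        # Ephemeral bridges keep their records in memory, so a cached
        # instance is reused per name; monolithic bridges are stateless
        # wrappers over the shared tables and can be constructed on demand.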
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)

    def findDatastores(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = sqlalchemy.sql.select(
            self._tables.dataset_location.columns.datastore_name
        ).select_from(
            self._tables.dataset_location
        ).where(
            self._tables.dataset_location.columns.dataset_id == ref.getCheckedId()
        )
        for row in self._db.query(sql).mappings():
            yield row[self._tables.dataset_location.columns.datastore_name]
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return _VERSION

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._tables, self._db.dialect)