Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 92% (149 statements)
coverage.py v6.4.4, created at 2022-09-27 01:59 -0700

from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

import copy
from collections.abc import Iterator
from typing import TYPE_CHECKING, Any

import sqlalchemy

from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, OrphanedRecordError
from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple

# This has to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table
    for each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least)
    to try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign
      key directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever
      forms of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but
    it is parametrized by a few class attributes that have to be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
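
    Notes
    -----
    A sketch of the resulting layout (table names are illustrative,
    assuming the default naming scheme in this package): two dataset types
    with the same dimensions, e.g. ::

        DatasetType("bias", {"instrument", "detector"}, "ExposureF")
        DatasetType("dark", {"instrument", "detector"}, "ExposureF")

    share a single ``dataset_tags_*`` table (and, for calibration types, a
    single ``dataset_calibs_*`` table) keyed by the saved dimensions key.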
89 """

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)
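
    # A sketch of how a registry typically constructs this manager during
    # schema declaration (illustrative; ``db``, ``collections``, and
    # ``dimensions`` are assumed to be set up elsewhere):
    #
    #     with db.declareStaticTables(create=True) as context:
    #         manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
    #             db, context, collections=collections, dimensions=dimensions
    #         )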

    @classmethod
    def currentVersion(cls) -> VersionTuple | None:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(
        cls, collections: type[CollectionManager], universe: DimensionUniverse
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do
        not depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `type` [ `CollectionManager` ]
            Manager class for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this
            `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
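
        Examples
        --------
        A minimal sketch (``collections_manager`` and ``universe`` are
        assumed to exist already; names are illustrative)::

            specs = ByDimensionsDatasetRecordStorageManagerUUID.makeStaticTableSpecs(
                type(collections_manager), universe=universe
            )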
153 """
154 return makeStaticTableSpecs(
155 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
156 )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )
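
    # Sketch of how another manager's table spec would add a foreign key to
    # the dataset table (illustrative; the field list is elided):
    #
    #     spec = ddl.TableSpec(fields=[...])
    #     ByDimensionsDatasetRecordStorageManagerUUID.addDatasetForeignKey(
    #         spec, onDelete="CASCADE"
    #     )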

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).mappings():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            # The tags table must already exist for every registered
            # dataset type; a missing table means the schema is corrupted.
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        # Swap in the new mappings, then refresh the collection summaries.
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} cannot be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()
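
    # Sketch (illustrative): removing a dataset type that still has
    # datasets fails, so callers are expected to remove the datasets first:
    #
    #     manager.remove("bias")  # OrphanedRecordError if datasets remain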

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
        if storage is not None and componentName is not None:
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage
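
    # Sketch (illustrative): a component lookup such as
    #
    #     storage = manager.find("calexp.psf")
    #
    # returns a shallow copy of the "calexp" storage whose datasetType is
    # the component dataset type, so components share the parent's tables.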

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError(
                f"Component dataset types cannot be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here: we want to create the tables
            # first and only register them if that operation succeeds. We
            # cannot wrap this in a transaction because the database class
            # assumes that DDL is not transaction-safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)
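
    # Sketch (illustrative): registration is idempotent for an identical
    # definition, while a conflicting one raises:
    #
    #     storage, inserted = manager.register(datasetType)  # inserted is True
    #     storage, inserted = manager.register(datasetType)  # inserted is False
    #     manager.register(conflictingType)  # ConflictingDefinitionError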

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        row = self._db.query(sql).mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:
            # The dataset type may have been registered by another client
            # since our last refresh; try again after refreshing.
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )
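
    # Sketch (illustrative): resolving a dataset ID back to a ref.
    #
    #     ref = manager.getDatasetRef(dataset_id)
    #     if ref is not None:
    #         print(ref.datasetType.name, ref.run)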

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> str | None:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If `True`, the primary key column of the dataset table is
    auto-increment."""

    _idColumnType: type
    """Type of the dataset column used to store the dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses an auto-increment integer for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_INT
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses a UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
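
# Sketch (illustrative): the integer manager only supports UNIQUE dataset
# IDs, while the UUID manager supports every generation mode, e.g.:
#
#     ByDimensionsDatasetRecordStorageManager.supportsIdGenerationMode(
#         DatasetIdGenEnum.DATAID_TYPE
#     )  # -> False
#     ByDimensionsDatasetRecordStorageManagerUUID.supportsIdGenerationMode(
#         DatasetIdGenEnum.DATAID_TYPE
#     )  # -> True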