Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 91%
from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

from typing import (
    Any,
    Dict,
    Iterator,
    Optional,
    Tuple,
    Type,
    TYPE_CHECKING,
)

import copy
import sqlalchemy

from lsst.daf.butler import (
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    DimensionUniverse,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError
from lsst.daf.butler.registry.interfaces import (
    DatasetIdGenEnum,
    DatasetRecordStorage,
    DatasetRecordStorageManager,
    VersionTuple,
)

from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)
from .summaries import CollectionSummaryManager
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from ...summaries import CollectionSummary


if TYPE_CHECKING:
    from lsst.daf.butler.registry.interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """

    def __init__(
        self, *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(cls, collections: Type[CollectionManager],
                             universe: DimensionUniverse) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `type` [`CollectionManager`]
            Manager class (not instance) for the collections in this
            `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(collections, universe=universe,
                                    dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement)

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *, name: str = "dataset",
                             constraint: bool = True, onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete,
                                    constraint=constraint, **kwargs)

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName = {}
        byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).mappings():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(name, dimensions, row[c.storage_class],
                                      isCalibration=(calibTableName is not None))
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()))
            if tags is None:
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(row[c.calibration_association_table],
                                                   makeCalibTableSpec(datasetType, type(self._collections),
                                                                      self._db.getTimespanRepresentation(),
                                                                      self.getIdColumnType()))
                if calibs is None:
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(f"Dataset type {name} cannot be removed."
                                      " It is associated with datasets that must be removed first.") from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
        if storage is not None and componentName is not None:
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage
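
    # Hypothetical usage sketch for find(): a component dataset type resolves
    # to a shallow copy of its parent's storage.  "deepCoadd" and its "psf"
    # component below are assumed names, not ones defined by this module:
    #
    #     parent = manager.find("deepCoadd")
    #     component = manager.find("deepCoadd.psf")
    #     # component reuses parent's tables; only its datasetType differs.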

    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError("Component dataset types cannot be stored in registry."
                             f" Rejecting {datasetType.name}")
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (makeCalibTableName(datasetType, dimensionsKey)
                              if datasetType.isCalibration() else None)
            # The order is important here: we want to create the tables first
            # and only register them if that operation succeeds.  We cannot
            # wrap this in a transaction because the database class assumes
            # that DDL is not transaction-safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(datasetType, type(self._collections),
                                       self._db.getTimespanRepresentation(), self.getIdColumnType()),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(f"Given dataset type {datasetType} is inconsistent "
                                                 f"with database definition {storage.datasetType}.")
            inserted = False
        return storage, bool(inserted)
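
    # Hypothetical usage sketch for register(); `manager` and `datasetType`
    # are assumed to already exist.  Registration is idempotent: a repeated
    # call with an identical definition returns the cached storage with
    # inserted=False, while a conflicting definition raises
    # ConflictingDefinitionError:
    #
    #     storage, inserted = manager.register(datasetType)
    #     storage2, inserted2 = manager.register(datasetType)
    #     assert storage2.datasetType == storage.datasetType and not inserted2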

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = sqlalchemy.sql.select(
            self._static.dataset.columns.dataset_type_id,
            self._static.dataset.columns[self._collections.getRunForeignKeyName()],
        ).select_from(
            self._static.dataset
        ).where(
            self._static.dataset.columns.id == id
        )
        row = self._db.query(sql).mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name
        )
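
    # Hypothetical usage sketch for getDatasetRef(); `manager` and `knownId`
    # are assumed.  A miss on the in-memory dataset-type cache triggers a
    # refresh() before giving up, so a None result means the dataset ID
    # genuinely does not exist:
    #
    #     ref = manager.getDatasetRef(knownId)
    #     if ref is None:
    #         ...  # no dataset with this ID in the registry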

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: Type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True then the PK column of the dataset table is auto-incremented."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses an auto-incremented integer for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_INT
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses a UUID for the dataset primary key.
    """
    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
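

# A minimal usage sketch (illustrative, not part of the module) contrasting
# the two managers' ID-generation support: the integer-keyed manager accepts
# only DatasetIdGenEnum.UNIQUE, while the UUID manager accepts every mode:
#
#     ByDimensionsDatasetRecordStorageManager.supportsIdGenerationMode(
#         DatasetIdGenEnum.UNIQUE)    # True; any other mode returns False
#     ByDimensionsDatasetRecordStorageManagerUUID.supportsIdGenerationMode(
#         DatasetIdGenEnum.UNIQUE)    # True, as for every other mode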