Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%

from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

from typing import (
    Any,
    Dict,
    Iterator,
    Optional,
    Tuple,
    Type,
    TYPE_CHECKING,
)

import copy
import sqlalchemy

from lsst.daf.butler import (
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    DimensionUniverse,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError
from lsst.daf.butler.registry.interfaces import (
    DatasetIdGenEnum,
    DatasetRecordStorage,
    DatasetRecordStorageManager,
    VersionTuple,
)

from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)
from .summaries import CollectionSummaryManager
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from ...summaries import CollectionSummary


if TYPE_CHECKING:
    from lsst.daf.butler.registry.interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table
    for each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least)
    to try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign
      key directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever
      forms of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but
    it is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """

    def __init__(
        self, *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static,
                   summaries=summaries)

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(cls, collections: Type[CollectionManager],
                             universe: DimensionUniverse) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do
        not depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this
            `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(collections, universe=universe,
                                    dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement)
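    # Illustrative call (``collections`` and ``dimensions`` assumed, as in
    # ``initialize`` above): the UUID subclass builds its specs with a GUID
    # ID column and no autoincrement, while the integer subclass uses
    # ``sqlalchemy.BigInteger`` with ``autoincrement=True``:
    #
    #     specs = ByDimensionsDatasetRecordStorageManagerUUID.makeStaticTableSpecs(
    #         type(collections), universe=dimensions.universe
    #     )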

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *, name: str = "dataset",
                             constraint: bool = True, onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete,
                                    constraint=constraint, **kwargs)

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName = {}
        byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).fetchall():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(name, dimensions, row[c.storage_class],
                                      isCalibration=(calibTableName is not None))
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()))
            if calibTableName is not None:
                calibs = self._db.getExistingTable(row[c.calibration_association_table],
                                                   makeCalibTableSpec(datasetType, type(self._collections),
                                                                      self._db.getTimespanRepresentation(),
                                                                      self.getIdColumnType()))
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(f"Dataset type {name} cannot be removed."
                                      " It is associated with datasets that must be removed first.") from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
        if storage is not None and componentName is not None:
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage
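    # For illustration (the dataset type names here are assumed, not defined
    # in this module): ``find("calexp")`` returns the storage for the
    # composite dataset type, while ``find("calexp.psf")`` splits the name
    # into ("calexp", "psf") and returns a shallow copy whose dataset type
    # describes just the component.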

    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError("Component dataset types cannot be stored in registry."
                             f" Rejecting {datasetType.name}")
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (makeCalibTableName(datasetType, dimensionsKey)
                              if datasetType.isCalibration() else None)
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(datasetType, type(self._collections),
                                       self._db.getTimespanRepresentation(), self.getIdColumnType()),
                )
            else:
                calibs = None
            storage = self._recordStorageType(db=self._db, datasetType=datasetType,
                                              static=self._static, summaries=self._summaries,
                                              tags=tags, calibs=calibs,
                                              dataset_type_id=row["id"],
                                              collections=self._collections)
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(f"Given dataset type {datasetType} is inconsistent "
                                                 f"with database definition {storage.datasetType}.")
            inserted = False
        return storage, bool(inserted)
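    # Hypothetical round trip (``datasetType`` assumed to be a complete,
    # non-component DatasetType): the first call creates the tag/calib
    # tables and reports ``inserted=True``; repeating it with an identical
    # definition returns the cached storage and ``inserted=False``:
    #
    #     storage, inserted = manager.register(datasetType)
    #     again, insertedAgain = manager.register(datasetType)
    #     assert again is storage and not insertedAgain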

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = sqlalchemy.sql.select(
            [
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            ]
        ).select_from(
            self._static.dataset
        ).where(
            self._static.dataset.columns.id == id
        )
        row = self._db.query(sql).fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name
        )
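    # Sketch of the reverse lookup (``ref`` assumed to be a previously
    # inserted DatasetRef): ``manager.getDatasetRef(ref.id)`` reconstructs
    # an equivalent ref from the static dataset table, or returns ``None``
    # for an unknown ID.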

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: Type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If `True`, the primary key column of the dataset table is
    auto-increment."""

    _idColumnType: type
    """Type of the dataset column used to store the dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that
    uses an auto-increment integer for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_INT
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that
    uses a UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
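
# Illustrative comparison of the two managers (this snippet is not part of
# the original module; the mode names are assumed from DatasetIdGenEnum):
# the integer manager only supports UNIQUE ID generation, while the UUID
# manager supports every mode.
#
#     ByDimensionsDatasetRecordStorageManager.supportsIdGenerationMode(
#         DatasetIdGenEnum.UNIQUE)        # True
#     ByDimensionsDatasetRecordStorageManager.supportsIdGenerationMode(
#         DatasetIdGenEnum.DATAID_TYPE)   # False
#     ByDimensionsDatasetRecordStorageManagerUUID.supportsIdGenerationMode(
#         DatasetIdGenEnum.DATAID_TYPE)   # True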