Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 92% of 148 statements

from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

import copy
from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type

import sqlalchemy
from lsst.daf.butler import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
from lsst.daf.butler.registry import ConflictingDefinitionError, OrphanedRecordError
from lsst.daf.butler.registry.interfaces import (
    DatasetIdGenEnum,
    DatasetRecordStorage,
    DatasetRecordStorageManager,
    VersionTuple,
)

from ...summaries import CollectionSummary
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:
    from lsst.daf.butler.registry.interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )

    from .tables import StaticDatasetTablesTuple


# These have to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table
    for each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least)
    to try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign
      key directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever
      forms of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but
    it is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
    """

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: Dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(
        cls, collections: Type[CollectionManager], universe: DimensionUniverse
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do
        not depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `type` [ `CollectionManager` ]
            Type of the manager object for the collections in this
            `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this
            `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(
            collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
        )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
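        # Rebuild the in-memory caches from scratch by scanning every row in
        # the static dataset_type table; each row also records the names of
        # the dynamic tag/calib tables that hold its dataset-collection
        # associations.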
        byName = {}
        byId: Dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).mappings():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} cannot be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        # Docstring inherited from DatasetRecordStorageManager.
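        # Component dataset types are not stored separately: look up the
        # parent (composite) storage and, when a component was requested,
        # return a shallow copy with the component dataset type swapped in.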
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        storage = self._byName.get(compositeName)
        if storage is not None and componentName is not None:
            componentStorage = copy.copy(storage)
            componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
            return componentStorage
        else:
            return storage

    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError(
                f"Component dataset types cannot be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
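            # Dynamic table names are derived from the saved dimension-graph
            # key, so dataset types sharing the same dimensions map to the
            # same tag (and calib) tables, matching the class docstring.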
            # The order is important here: we want to create tables first
            # and only register them if this operation is successful. We
            # cannot wrap it into a transaction because the database class
            # assumes that DDL is not transaction safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
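            # `Database.sync` inserts the row if it is missing and otherwise
            # checks that the `compared` fields match the existing row, so
            # repeating a registration of the same dataset type is safe.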
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)

    def __iter__(self) -> Iterator[DatasetType]:
        for storage in self._byName.values():
            yield storage.datasetType

    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorageManager.
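        # Query the static dataset table for the dataset type ID and run
        # collection; the per-type storage object then reconstructs the
        # data ID from its own tables.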
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        row = self._db.query(sql).mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
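        # A cache miss here means another client registered this dataset
        # type after our last refresh; reload the caches and retry once.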
        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: Type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True then the PK column of the dataset table is auto-increment."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase which
    uses an auto-incremented integer for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_INT
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase which
    uses a UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: Type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True