Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%
199 statements
coverage.py v6.5.0, created at 2022-10-04 02:18 -0700
1from __future__ import annotations
3__all__ = (
4 "ByDimensionsDatasetRecordStorageManager",
5 "ByDimensionsDatasetRecordStorageManagerUUID",
6)
8import copy
9import logging
10import warnings
11from collections import defaultdict
12from typing import TYPE_CHECKING, Any
14import sqlalchemy
15from lsst.utils.ellipsis import Ellipsis
17from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
18from ..._collection_summary import CollectionSummary
19from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
20from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
21from ...wildcards import DatasetTypeWildcard
22from ._storage import (
23 ByDimensionsDatasetRecordStorage,
24 ByDimensionsDatasetRecordStorageInt,
25 ByDimensionsDatasetRecordStorageUUID,
26)
27from .summaries import CollectionSummaryManager
28from .tables import (
29 addDatasetForeignKey,
30 makeCalibTableName,
31 makeCalibTableSpec,
32 makeStaticTableSpecs,
33 makeTagTableName,
34 makeTagTableSpec,
35)
37if TYPE_CHECKING:  [coverage: 37 ↛ 38, condition on line 37 was never true]
38 from ...interfaces import (
39 CollectionManager,
40 CollectionRecord,
41 Database,
42 DimensionRecordStorageManager,
43 StaticTablesContext,
44 )
45 from .tables import StaticDatasetTablesTuple
48# This has to be updated on every schema change
49_VERSION_INT = VersionTuple(1, 0, 0)
50_VERSION_UUID = VersionTuple(1, 0, 0)
52_LOG = logging.getLogger(__name__)
55class MissingDatabaseTableError(RuntimeError):
56 """Exception raised when a table is not found in a database."""
59class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
60 """A manager class for datasets that uses one dataset-collection table for
61 each group of dataset types that share the same dimensions.
63 In addition to the table organization, this class makes a number of
64 other design choices that would have been cumbersome (to say the least) to
65 try to pack into its name:
67 - It uses a private surrogate integer autoincrement field to identify
68 dataset types, instead of using the name as the primary and foreign key
69 directly.
71 - It aggressively loads all DatasetTypes into memory instead of fetching
72 them from the database only when needed or attempting more clever forms
73 of caching.
75 Alternative implementations that make different choices for these while
76 keeping the same general table organization might be reasonable as well.
78 This class provides a complete implementation of the manager logic, but
79 it is parametrized by a few class attributes that must be defined by
80 subclasses.
82 Parameters
83 ----------
84 db : `Database`
85 Interface to the underlying database engine and namespace.
86 collections : `CollectionManager`
87 Manager object for the collections in this `Registry`.
88 dimensions : `DimensionRecordStorageManager`
89 Manager object for the dimensions in this `Registry`.
90 static : `StaticDatasetTablesTuple`
91 Named tuple of `sqlalchemy.schema.Table` instances for all static
92 tables used by this class.
93 summaries : `CollectionSummaryManager`
94 Structure containing tables that summarize the contents of collections.
95 """
97 def __init__(
98 self,
99 *,
100 db: Database,
101 collections: CollectionManager,
102 dimensions: DimensionRecordStorageManager,
103 static: StaticDatasetTablesTuple,
104 summaries: CollectionSummaryManager,
105 ):
106 self._db = db
107 self._collections = collections
108 self._dimensions = dimensions
109 self._static = static
110 self._summaries = summaries
111 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
112 self._byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
114 @classmethod
115 def initialize(
116 cls,
117 db: Database,
118 context: StaticTablesContext,
119 *,
120 collections: CollectionManager,
121 dimensions: DimensionRecordStorageManager,
122 ) -> DatasetRecordStorageManager:
123 # Docstring inherited from DatasetRecordStorageManager.
124 specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
125 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore
126 summaries = CollectionSummaryManager.initialize(
127 db,
128 context,
129 collections=collections,
130 dimensions=dimensions,
131 )
132 return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)
134 @classmethod
135 def currentVersion(cls) -> VersionTuple | None:
136 # Docstring inherited from VersionedExtension.
137 return cls._version
139 @classmethod
140 def makeStaticTableSpecs(
141 cls, collections: type[CollectionManager], universe: DimensionUniverse
142 ) -> StaticDatasetTablesTuple:
143 """Construct all static tables used by the classes in this package.
145 Static tables are those that are present in all Registries and do not
146 depend on what DatasetTypes have been registered.
148 Parameters
149 ----------
150 collections : `CollectionManager`
151 Manager object for the collections in this `Registry`.
152 universe : `DimensionUniverse`
153 Universe graph containing all dimensions known to this `Registry`.
155 Returns
156 -------
157 specs : `StaticDatasetTablesTuple`
158 A named tuple containing `ddl.TableSpec` instances.
159 """
160 return makeStaticTableSpecs(
161 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
162 )
164 @classmethod
165 def getIdColumnType(cls) -> type:
166 # Docstring inherited from base class.
167 return cls._idColumnType
169 @classmethod
170 def addDatasetForeignKey(
171 cls,
172 tableSpec: ddl.TableSpec,
173 *,
174 name: str = "dataset",
175 constraint: bool = True,
176 onDelete: str | None = None,
177 **kwargs: Any,
178 ) -> ddl.FieldSpec:
179 # Docstring inherited from DatasetRecordStorageManager.
180 return addDatasetForeignKey(
181 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
182 )
184 def refresh(self) -> None:
185 # Docstring inherited from DatasetRecordStorageManager.
186 byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
187 byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
188 c = self._static.dataset_type.columns
189 for row in self._db.query(self._static.dataset_type.select()).mappings():
190 name = row[c.name]
191 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
192 calibTableName = row[c.calibration_association_table]
193 datasetType = DatasetType(
194 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
195 )
196 tags = self._db.getExistingTable(
197 row[c.tag_association_table],
198 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
199 )
200 if tags is None:  [coverage: 200 ↛ 201, condition on line 200 was never true]
201 raise MissingDatabaseTableError(
202 f"Table {row[c.tag_association_table]} is missing from database schema."
203 )
204 if calibTableName is not None:
205 calibs = self._db.getExistingTable(
206 row[c.calibration_association_table],
207 makeCalibTableSpec(
208 datasetType,
209 type(self._collections),
210 self._db.getTimespanRepresentation(),
211 self.getIdColumnType(),
212 ),
213 )
214 if calibs is None:  [coverage: 214 ↛ 215, condition on line 214 was never true]
215 raise MissingDatabaseTableError(
216 f"Table {row[c.calibration_association_table]} is missing from database schema."
217 )
218 else:
219 calibs = None
220 storage = self._recordStorageType(
221 db=self._db,
222 datasetType=datasetType,
223 static=self._static,
224 summaries=self._summaries,
225 tags=tags,
226 calibs=calibs,
227 dataset_type_id=row["id"],
228 collections=self._collections,
229 )
230 byName[datasetType.name] = storage
231 byId[storage._dataset_type_id] = storage
232 self._byName = byName
233 self._byId = byId
234 self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)
236 def remove(self, name: str) -> None:
237 # Docstring inherited from DatasetRecordStorageManager.
238 compositeName, componentName = DatasetType.splitDatasetTypeName(name)
239 if componentName is not None:
240 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")
242 # Delete the row
243 try:
244 self._db.delete(self._static.dataset_type, ["name"], {"name": name})
245 except sqlalchemy.exc.IntegrityError as e:
246 raise OrphanedRecordError(
247 f"Dataset type {name} can not be removed."
248 " It is associated with datasets that must be removed first."
249 ) from e
251 # Now refresh everything -- removal is rare enough that this does
252 # not need to be fast.
253 self.refresh()
255 def find(self, name: str) -> DatasetRecordStorage | None:
256 # Docstring inherited from DatasetRecordStorageManager.
257 compositeName, componentName = DatasetType.splitDatasetTypeName(name)
258 storage = self._byName.get(compositeName)
259 if storage is not None and componentName is not None:
260 componentStorage = copy.copy(storage)
261 componentStorage.datasetType = storage.datasetType.makeComponentDatasetType(componentName)
262 return componentStorage
263 else:
264 return storage
266 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
267 # Docstring inherited from DatasetRecordStorageManager.
268 if datasetType.isComponent():  [coverage: 268 ↛ 269, condition on line 268 was never true]
269 raise ValueError(
270 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
271 )
272 storage = self._byName.get(datasetType.name)
273 if storage is None:
274 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
275 tagTableName = makeTagTableName(datasetType, dimensionsKey)
276 calibTableName = (
277 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
278 )
279 # The order is important here: we want to create the tables first and
280 # only register them if that operation is successful. We cannot
281 # wrap this in a transaction because the database class assumes that
282 # DDL is not transaction-safe in general.
283 tags = self._db.ensureTableExists(
284 tagTableName,
285 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
286 )
287 if calibTableName is not None:
288 calibs = self._db.ensureTableExists(
289 calibTableName,
290 makeCalibTableSpec(
291 datasetType,
292 type(self._collections),
293 self._db.getTimespanRepresentation(),
294 self.getIdColumnType(),
295 ),
296 )
297 else:
298 calibs = None
299 row, inserted = self._db.sync(
300 self._static.dataset_type,
301 keys={"name": datasetType.name},
302 compared={
303 "dimensions_key": dimensionsKey,
304 # Force the storage class to be loaded to ensure it
305 # exists and there is no typo in the name.
306 "storage_class": datasetType.storageClass.name,
307 },
308 extra={
309 "tag_association_table": tagTableName,
310 "calibration_association_table": calibTableName,
311 },
312 returning=["id", "tag_association_table"],
313 )
314 assert row is not None
315 storage = self._recordStorageType(
316 db=self._db,
317 datasetType=datasetType,
318 static=self._static,
319 summaries=self._summaries,
320 tags=tags,
321 calibs=calibs,
322 dataset_type_id=row["id"],
323 collections=self._collections,
324 )
325 self._byName[datasetType.name] = storage
326 self._byId[storage._dataset_type_id] = storage
327 else:
328 if datasetType != storage.datasetType:
329 raise ConflictingDefinitionError(
330 f"Given dataset type {datasetType} is inconsistent "
331 f"with database definition {storage.datasetType}."
332 )
333 inserted = False
334 return storage, bool(inserted)
336 def resolve_wildcard(
337 self,
338 expression: Any,
339 components: bool | None = None,
340 missing: list[str] | None = None,
341 explicit_only: bool = False,
342 ) -> dict[DatasetType, list[str | None]]:
343 wildcard = DatasetTypeWildcard.from_expression(expression)
344 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
345 for name, dataset_type in wildcard.values.items():
346 parent_name, component_name = DatasetType.splitDatasetTypeName(name)
347 if (found_storage := self.find(parent_name)) is not None:
348 found_parent = found_storage.datasetType
349 if component_name is not None:
350 found = found_parent.makeComponentDatasetType(component_name)
351 else:
352 found = found_parent
353 if dataset_type is not None:
354 if dataset_type.is_compatible_with(found):  [coverage: 354 ↛ 362, condition on line 354 was never false]
355 # Prefer the given dataset type to enable storage class
356 # conversions.
357 if component_name is not None:
358 found_parent = dataset_type.makeCompositeDatasetType()
359 else:
360 found_parent = dataset_type
361 else:
362 raise DatasetTypeError(
363 f"Dataset type definition in query expression {dataset_type} is "
364 f"not compatible with the registered type {found}."
365 )
366 result[found_parent].add(component_name)
367 elif missing is not None:  [coverage: 367 ↛ 345, condition on line 367 was never false]
368 missing.append(name)
369 if wildcard.patterns is Ellipsis:
370 if explicit_only:
371 raise TypeError(
372 "Universal wildcard '...' is not permitted for dataset types in this context."
373 )
374 for storage in self._byName.values():
375 result[storage.datasetType].add(None)
376 if components:
377 try:
378 result[storage.datasetType].update(
379 storage.datasetType.storageClass.allComponents().keys()
380 )
381 except KeyError as err:
382 _LOG.warning(
383 f"Could not load storage class {err} for {storage.datasetType.name}; "
384 "if it has components they will not be included in query results.",
385 )
386 elif wildcard.patterns:
387 if explicit_only:
388 # After v26 this should raise DatasetTypeExpressionError, to
389 # be implemented on DM-36303.
390 warnings.warn(
391 "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
392 FutureWarning,
393 )
394 for storage in self._byName.values():
395 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
396 result[storage.datasetType].add(None)
397 if components is not False:
398 for storage in self._byName.values():
399 if components is None and storage.datasetType in result:
400 continue
401 try:
402 components_for_parent = storage.datasetType.storageClass.allComponents().keys()
403 except KeyError as err:
404 _LOG.warning(
405 f"Could not load storage class {err} for {storage.datasetType.name}; "
406 "if it has components they will not be included in query results."
407 )
408 continue
409 for component_name in components_for_parent:
410 if any(
411 p.fullmatch(
412 DatasetType.nameWithComponent(storage.datasetType.name, component_name)
413 )
414 for p in wildcard.patterns
415 ):
416 result[storage.datasetType].add(component_name)
417 return {k: list(v) for k, v in result.items()}
419 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
420 # Docstring inherited from DatasetRecordStorageManager.
421 sql = (
422 sqlalchemy.sql.select(
423 self._static.dataset.columns.dataset_type_id,
424 self._static.dataset.columns[self._collections.getRunForeignKeyName()],
425 )
426 .select_from(self._static.dataset)
427 .where(self._static.dataset.columns.id == id)
428 )
429 row = self._db.query(sql).mappings().fetchone()
430 if row is None:
431 return None
432 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
433 if recordsForType is None:  [coverage: 433 ↛ 434, condition on line 433 was never true]
434 self.refresh()
435 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
436 assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
437 return DatasetRef(
438 recordsForType.datasetType,
439 dataId=recordsForType.getDataId(id=id),
440 id=id,
441 run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
442 )
444 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
445 # Docstring inherited from DatasetRecordStorageManager.
446 return self._summaries.get(collection)
448 def schemaDigest(self) -> str | None:
449 # Docstring inherited from VersionedExtension.
450 return self._defaultSchemaDigest(self._static, self._db.dialect)
452 _version: VersionTuple
453 """Schema version for this class."""
455 _recordStorageType: type[ByDimensionsDatasetRecordStorage]
456 """Type of the dataset record storage class instantiated by this manager."""
458 _autoincrement: bool
459 """If `True`, the primary key column of the dataset table is auto-incrementing."""
461 _idColumnType: type
462 """Type of the column used to store the dataset ID."""
465class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
466 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses
467 auto-incremental integer for dataset primary key.
468 """
470 _version: VersionTuple = _VERSION_INT
471 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
472 _autoincrement: bool = True
473 _idColumnType: type = sqlalchemy.BigInteger
475 @classmethod
476 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
477 # Docstring inherited from DatasetRecordStorageManager.
478 # MyPy seems confused about enum value types here.
479 return mode is mode.UNIQUE # type: ignore
482class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
483 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses
484 a UUID for the dataset primary key.
485 """
487 _version: VersionTuple = _VERSION_UUID
488 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
489 _autoincrement: bool = False
490 _idColumnType: type = ddl.GUID
492 @classmethod
493 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
494 # Docstring inherited from DatasetRecordStorageManager.
495 return True
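
For orientation, below is a minimal, hypothetical usage sketch of the manager covered above. It assumes already-initialized `Database`, `StaticTablesContext`, `CollectionManager`, and `DimensionRecordStorageManager` objects (passed in as `db`, `context`, `collections`, and `dimensions`); the dataset type name, dimensions, and storage class are purely illustrative. Only the manager methods themselves come from this module.

    from lsst.daf.butler import DatasetType
    from lsst.daf.butler.registry.datasets.byDimensions._manager import (
        ByDimensionsDatasetRecordStorageManagerUUID,
    )

    def register_and_find(db, context, collections, dimensions):
        """Sketch of the manager's lifecycle: initialize, register, look up."""
        # Build the static table specs and construct the manager.
        manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
            db, context, collections=collections, dimensions=dimensions
        )
        # Register a hypothetical dataset type; the first registration for a new
        # dimension group creates the corresponding tag (and calib) tables.
        flat = DatasetType(
            "flat",
            dimensions=["instrument", "detector"],
            storageClass="ExposureF",
            universe=dimensions.universe,
            isCalibration=True,
        )
        storage, inserted = manager.register(flat)
        # Subsequent lookups by name are served from the in-memory cache.
        assert manager.find("flat") is storage
        # Expand the universal wildcard into all registered dataset types.
        all_types = manager.resolve_wildcard(..., components=False)
        return inserted, all_types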