Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%
207 statements
coverage.py v6.5.0, created at 2023-01-04 02:04 -0800
from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

import logging
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Any

import sqlalchemy
from deprecated.sphinx import deprecated
from lsst.utils.ellipsis import Ellipsis

from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ...wildcards import DatasetTypeWildcard
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)
if TYPE_CHECKING:  # coverage: 37 ↛ 38, condition never true
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple

# This has to be updated on every schema change
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)

_LOG = logging.getLogger(__name__)

class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""

class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of collections.
    """
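
    # Dataset types themselves are recorded in the static ``dataset_type``
    # table; their tag and calibration association rows live in dynamic
    # tables (one per dimensions group) whose names come from
    # makeTagTableName() and makeCalibTableName() in .tables.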

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
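        # In-memory caches of per-dataset-type storage objects, keyed by
        # dataset type name and by surrogate dataset type ID; rebuilt by
        # refresh() and extended by register().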
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> VersionTuple | None:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(
        cls, collections: type[CollectionManager], universe: DimensionUniverse
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `type` [ `CollectionManager` ]
            Manager class for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(
            collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
        )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        with self._db.query(self._static.dataset_type.select()) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
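        # Build fresh caches from the dataset_type table and swap them in
        # only after the loop completes, so a failure partway through leaves
        # the previous caches intact.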
        for row in sql_rows:
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:  # coverage: 202 ↛ 203, condition never true
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:  # coverage: 216 ↛ 217, condition never true
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} can not be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._byName.get(name)

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():  # coverage: 263 ↛ 264, condition never true
            raise ValueError(
                f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here: we want to create tables first and
            # only register them if this operation is successful. We cannot
            # wrap it into a transaction because the database class assumes
            # that DDL is not transaction-safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
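            # sync() is an idempotent insert-or-verify: it inserts the
            # dataset_type row if it is missing, otherwise checks that the
            # compared columns match the existing row, and reports whether an
            # insert actually happened.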
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)
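
    # Usage sketch (hypothetical caller, assuming a fully initialized
    # manager):
    #
    #     storage, inserted = manager.register(dataset_type)
    #     # ``inserted`` is True only when the dataset_type row was newly
    #     # added; re-registering an identical definition returns the cached
    #     # storage with inserted=False.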

    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> dict[DatasetType, list[str | None]]:
        wildcard = DatasetTypeWildcard.from_expression(expression)
        result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
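        # ``result`` maps each resolved parent dataset type to the set of
        # requested component names, with None standing for the parent
        # dataset type itself.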
        # This message can be transformed into an error on DM-36303 after v26,
        # and the components argument here (and in all callers) can be removed
        # entirely on DM-36457 after v27.
        deprecation_message = (
            "Querying for component datasets via Registry query methods is deprecated in favor of using "
            "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
            "after v26, and the components argument will be removed after v27."
        )
        for name, dataset_type in wildcard.values.items():
            parent_name, component_name = DatasetType.splitDatasetTypeName(name)
            if component_name is not None:
                warnings.warn(deprecation_message, FutureWarning)
            if (found_storage := self.find(parent_name)) is not None:
                found_parent = found_storage.datasetType
                if component_name is not None:
                    found = found_parent.makeComponentDatasetType(component_name)
                else:
                    found = found_parent
                if dataset_type is not None:
                    if dataset_type.is_compatible_with(found):  # coverage: 359 ↛ 367, condition never false
                        # Prefer the given dataset type to enable storage
                        # class conversions.
                        if component_name is not None:  # coverage: 362 ↛ 363, condition never true
                            found_parent = dataset_type.makeCompositeDatasetType()
                        else:
                            found_parent = dataset_type
                    else:
                        raise DatasetTypeError(
                            f"Dataset type definition in query expression {dataset_type} is "
                            f"not compatible with the registered type {found}."
                        )
                result[found_parent].add(component_name)
            elif missing is not None:  # coverage: 372 ↛ 348, condition never false
                missing.append(name)
        already_warned = False
        if wildcard.patterns is Ellipsis:
            if explicit_only:
                raise TypeError(
                    "Universal wildcard '...' is not permitted for dataset types in this context."
                )
            for storage in self._byName.values():
                result[storage.datasetType].add(None)
                if components:
                    try:
                        result[storage.datasetType].update(
                            storage.datasetType.storageClass.allComponents().keys()
                        )
                        if storage.datasetType.storageClass.allComponents() and not already_warned:
                            warnings.warn(deprecation_message, FutureWarning)
                            already_warned = True
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
        elif wildcard.patterns:
            if explicit_only:
                # After v26 this should raise DatasetTypeExpressionError, to
                # be implemented on DM-36303.
                warnings.warn(
                    "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
                    FutureWarning,
                )
            for storage in self._byName.values():
                if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
                    result[storage.datasetType].add(None)
            if components is not False:
                for storage in self._byName.values():
                    if components is None and storage.datasetType in result:
                        continue
                    try:
                        components_for_parent = storage.datasetType.storageClass.allComponents().keys()
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
                        continue
                    for component_name in components_for_parent:
                        if any(
                            p.fullmatch(
                                DatasetType.nameWithComponent(storage.datasetType.name, component_name)
                            )
                            for p in wildcard.patterns
                        ):
                            result[storage.datasetType].add(component_name)
                            if not already_warned:
                                warnings.warn(deprecation_message, FutureWarning)
                                already_warned = True
        return {k: list(v) for k, v in result.items()}
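
    # Usage sketch (hypothetical dataset type names):
    #
    #     missing: list[str] = []
    #     matched = manager.resolve_wildcard(["raw", "calexp"], missing=missing)
    #     # ``matched`` maps each registered parent DatasetType to [None];
    #     # names with no registered dataset type are appended to ``missing``.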

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
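        # Look up the dataset's type ID and run-collection key in the static
        # dataset table, then delegate data ID expansion to the cached
        # per-dataset-type storage object.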
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        with self._db.query(sql) as sql_result:
            row = sql_result.mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:  # coverage: 446 ↛ 447, condition never true
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> str | None:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True then PK column of the dataset table is auto-increment."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""

@deprecated(
    "Integer dataset IDs are deprecated in favor of UUIDs; support will be removed after v26. "
    "Please migrate or re-create this data repository.",
    version="v25.0",
    category=FutureWarning,
)
class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase which
    uses an auto-increment integer for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_INT
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore

class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase which
    uses a UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
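
# Configuration note (a hedged sketch; the exact keys are assumed from the
# default daf_butler configuration): a Butler registry configuration
# typically selects one of these managers by fully qualified name, e.g.
#
#     registry:
#       managers:
#         datasets: lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID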