Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%
230 statements
coverage.py v7.4.1, created at 2024-02-13 10:56 +0000
1from __future__ import annotations
3from .... import ddl
5__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)
7import dataclasses
8import logging
9from collections.abc import Iterable, Mapping
10from typing import TYPE_CHECKING, Any
12import sqlalchemy
14from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType
15from ....dimensions import DimensionUniverse
16from ..._collection_summary import CollectionSummary
17from ..._exceptions import (
18 ConflictingDefinitionError,
19 DatasetTypeError,
20 DatasetTypeExpressionError,
21 OrphanedRecordError,
22)
23from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
24from ...wildcards import DatasetTypeWildcard
25from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
26from .summaries import CollectionSummaryManager
27from .tables import (
28 addDatasetForeignKey,
29 makeCalibTableName,
30 makeCalibTableSpec,
31 makeStaticTableSpecs,
32 makeTagTableName,
33 makeTagTableSpec,
34)
36if TYPE_CHECKING:
37 from ..._caching_context import CachingContext
38 from ...interfaces import (
39 CollectionManager,
40 CollectionRecord,
41 Database,
42 DimensionRecordStorageManager,
43 StaticTablesContext,
44 )
45 from .tables import StaticDatasetTablesTuple
48# This has to be updated on every schema change
49_VERSION_UUID = VersionTuple(1, 0, 0)
50# Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead
51# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of
52# the client migration period.
53_VERSION_UUID_NS = VersionTuple(2, 0, 0)
55_LOG = logging.getLogger(__name__)
58class MissingDatabaseTableError(RuntimeError):
59 """Exception raised when a table is not found in a database."""
62@dataclasses.dataclass
63class _DatasetTypeRecord:
64 """Contents of a single dataset type record."""
66 dataset_type: DatasetType
67 dataset_type_id: int
68 tag_table_name: str
69 calib_table_name: str | None
72class _SpecTableFactory:
73 """Factory for `sqlalchemy.schema.Table` instances that builds table
74 instances from a provided `ddl.TableSpec` definition and verifies that
75 the table exists in the database.
76 """
78 def __init__(self, db: Database, name: str, spec: ddl.TableSpec):
79 self._db = db
80 self._name = name
81 self._spec = spec
83 def __call__(self) -> sqlalchemy.schema.Table:
84 table = self._db.getExistingTable(self._name, self._spec)
85 if table is None: 85 ↛ 86 (line 85 didn't jump to line 86, because the condition on line 85 was never true)
86 raise MissingDatabaseTableError(f"Table {self._name} is missing from database schema.")
87 return table
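# Illustrative only: a minimal, self-contained sketch of the deferred-lookup
# pattern used by _SpecTableFactory above, with a plain dict standing in for
# the Database interface (the dict contents and names here are hypothetical).

from collections.abc import Callable, Mapping


def make_table_factory(schema: Mapping[str, object], name: str) -> Callable[[], object]:
    """Return a callable that resolves ``name`` lazily and fails loudly if it is absent."""

    def factory() -> object:
        table = schema.get(name)
        if table is None:
            # Mirrors MissingDatabaseTableError: the table is expected to exist already.
            raise RuntimeError(f"Table {name} is missing from database schema.")
        return table

    return factory


# The lookup happens only when the factory is called, not when it is created.
tables = {"dataset_tags_00000001": "tag-table-placeholder"}
get_tags_table = make_table_factory(tables, "dataset_tags_00000001")
assert get_tags_table() == "tag-table-placeholder"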
90class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
91 """A manager class for datasets that uses one dataset-collection table for
92 each group of dataset types that share the same dimensions.
94 In addition to the table organization, this class makes a number of
95 other design choices that would have been cumbersome (to say the least) to
96 try to pack into its name:
98 - It uses a private surrogate integer autoincrement field to identify
99 dataset types, instead of using the name as the primary and foreign key
100 directly.
102 - It aggressively loads all DatasetTypes into memory instead of fetching
103 them from the database only when needed or attempting more clever forms
104 of caching.
106 Alternative implementations that make different choices for these while
107 keeping the same general table organization might be reasonable as well.
109 This class provides a complete implementation of the manager logic, but it is
110 parametrized by a few class attributes that have to be defined by
111 sub-classes.
113 Parameters
114 ----------
115 db : `Database`
116 Interface to the underlying database engine and namespace.
117 collections : `CollectionManager`
118 Manager object for the collections in this `Registry`.
119 dimensions : `DimensionRecordStorageManager`
120 Manager object for the dimensions in this `Registry`.
121 static : `StaticDatasetTablesTuple`
122 Named tuple of `sqlalchemy.schema.Table` instances for all static
123 tables used by this class.
124 summaries : `CollectionSummaryManager`
125 Structure containing tables that summarize the contents of collections.
126 caching_context : `CachingContext`
127 Object controlling caching of information returned by managers.
128 registry_schema_version : `VersionTuple` or `None`, optional
129 Version of the registry schema.
130 """
132 def __init__(
133 self,
134 *,
135 db: Database,
136 collections: CollectionManager,
137 dimensions: DimensionRecordStorageManager,
138 static: StaticDatasetTablesTuple,
139 summaries: CollectionSummaryManager,
140 caching_context: CachingContext,
141 registry_schema_version: VersionTuple | None = None,
142 ):
143 super().__init__(registry_schema_version=registry_schema_version)
144 self._db = db
145 self._collections = collections
146 self._dimensions = dimensions
147 self._static = static
148 self._summaries = summaries
149 self._caching_context = caching_context
151 @classmethod
152 def initialize(
153 cls,
154 db: Database,
155 context: StaticTablesContext,
156 *,
157 collections: CollectionManager,
158 dimensions: DimensionRecordStorageManager,
159 caching_context: CachingContext,
160 registry_schema_version: VersionTuple | None = None,
161 ) -> DatasetRecordStorageManager:
162 # Docstring inherited from DatasetRecordStorageManager.
163 specs = cls.makeStaticTableSpecs(
164 type(collections), universe=dimensions.universe, schema_version=registry_schema_version
165 )
166 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore
167 summaries = CollectionSummaryManager.initialize(
168 db,
169 context,
170 collections=collections,
171 dimensions=dimensions,
172 dataset_type_table=static.dataset_type,
173 caching_context=caching_context,
174 )
175 return cls(
176 db=db,
177 collections=collections,
178 dimensions=dimensions,
179 static=static,
180 summaries=summaries,
181 caching_context=caching_context,
182 registry_schema_version=registry_schema_version,
183 )
185 @classmethod
186 def currentVersions(cls) -> list[VersionTuple]:
187 # Docstring inherited from VersionedExtension.
188 return cls._versions
190 @classmethod
191 def makeStaticTableSpecs(
192 cls,
193 collections: type[CollectionManager],
194 universe: DimensionUniverse,
195 schema_version: VersionTuple | None,
196 ) -> StaticDatasetTablesTuple:
197 """Construct all static tables used by the classes in this package.
199 Static tables are those that are present in all Registries and do not
200 depend on what DatasetTypes have been registered.
202 Parameters
203 ----------
204 collections : `type` [ `CollectionManager` ]
205 Type of the manager object for the collections in this `Registry`.
206 universe : `DimensionUniverse`
207 Universe graph containing all dimensions known to this `Registry`.
208 schema_version : `VersionTuple` or `None`
209 Version of the schema that should be created; if `None`, the
210 default schema should be used.
212 Returns
213 -------
214 specs : `StaticDatasetTablesTuple`
215 A named tuple containing `ddl.TableSpec` instances.
216 """
217 schema_version = cls.clsNewSchemaVersion(schema_version)
218 assert schema_version is not None, "New schema version cannot be None"
219 return makeStaticTableSpecs(
220 collections,
221 universe=universe,
222 dtype=cls.getIdColumnType(),
223 autoincrement=cls._autoincrement,
224 schema_version=schema_version,
225 )
227 @classmethod
228 def getIdColumnType(cls) -> type:
229 # Docstring inherited from base class.
230 return cls._idColumnType
232 @classmethod
233 def addDatasetForeignKey(
234 cls,
235 tableSpec: ddl.TableSpec,
236 *,
237 name: str = "dataset",
238 constraint: bool = True,
239 onDelete: str | None = None,
240 **kwargs: Any,
241 ) -> ddl.FieldSpec:
242 # Docstring inherited from DatasetRecordStorageManager.
243 return addDatasetForeignKey(
244 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
245 )
247 def refresh(self) -> None:
248 # Docstring inherited from DatasetRecordStorageManager.
249 if self._caching_context.dataset_types is not None: 249 ↛ exit (line 249 didn't return from function 'refresh', because the condition on line 249 was never false)
250 self._caching_context.dataset_types.clear()
252 def _make_storage(self, record: _DatasetTypeRecord) -> ByDimensionsDatasetRecordStorage:
253 """Create storage instance for a dataset type record."""
254 tags_spec = makeTagTableSpec(record.dataset_type, type(self._collections), self.getIdColumnType())
255 tags_table_factory = _SpecTableFactory(self._db, record.tag_table_name, tags_spec)
256 calibs_table_factory = None
257 if record.calib_table_name is not None:
258 calibs_spec = makeCalibTableSpec(
259 record.dataset_type,
260 type(self._collections),
261 self._db.getTimespanRepresentation(),
262 self.getIdColumnType(),
263 )
264 calibs_table_factory = _SpecTableFactory(self._db, record.calib_table_name, calibs_spec)
265 storage = self._recordStorageType(
266 db=self._db,
267 datasetType=record.dataset_type,
268 static=self._static,
269 summaries=self._summaries,
270 tags_table_factory=tags_table_factory,
271 calibs_table_factory=calibs_table_factory,
272 dataset_type_id=record.dataset_type_id,
273 collections=self._collections,
274 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
275 )
276 return storage
278 def remove(self, name: str) -> None:
279 # Docstring inherited from DatasetRecordStorageManager.
280 compositeName, componentName = DatasetType.splitDatasetTypeName(name)
281 if componentName is not None: 281 ↛ 282 (line 281 didn't jump to line 282, because the condition on line 281 was never true)
282 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")
284 # Delete the row
285 try:
286 self._db.delete(self._static.dataset_type, ["name"], {"name": name})
287 except sqlalchemy.exc.IntegrityError as e:
288 raise OrphanedRecordError(
289 f"Dataset type {name} can not be removed."
290 " It is associated with datasets that must be removed first."
291 ) from e
293 # Now refresh everything -- removal is rare enough that this does
294 # not need to be fast.
295 self.refresh()
297 def find(self, name: str) -> DatasetRecordStorage | None:
298 # Docstring inherited from DatasetRecordStorageManager.
299 if self._caching_context.dataset_types is not None: 299 ↛ 312 (line 299 didn't jump to line 312, because the condition on line 299 was never false)
300 _, storage = self._caching_context.dataset_types.get(name)
301 if storage is not None:
302 return storage
303 else:
304 # On the first cache miss populate the cache with complete list
305 # of dataset types (if it was not done yet).
306 if not self._caching_context.dataset_types.full:
307 self._fetch_dataset_types()
308 # Try again
309 _, storage = self._caching_context.dataset_types.get(name)
310 if storage is not None:
311 return storage
312 record = self._fetch_dataset_type_record(name)
313 if record is not None: 313 ↛ 314 (line 313 didn't jump to line 314, because the condition on line 313 was never true)
314 storage = self._make_storage(record)
315 if self._caching_context.dataset_types is not None:
316 self._caching_context.dataset_types.add(storage.datasetType, storage)
317 return storage
318 else:
319 return None
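# Illustrative only: the caching strategy in find() above — check the cache,
# populate the full dataset-type list on the first miss, then retry — sketched
# in isolation. The dict-backed cache and the fetch callable are hypothetical
# stand-ins for the dataset-type cache and the database query.

class _ReadThroughCache:
    def __init__(self, fetch_all):
        self._fetch_all = fetch_all  # callable returning a {name: value} mapping
        self._data: dict[str, object] = {}
        self.full = False

    def get(self, name: str):
        value = self._data.get(name)
        if value is None and not self.full:
            # First miss: load everything once, mark the cache complete, retry.
            self._data = dict(self._fetch_all())
            self.full = True
            value = self._data.get(name)
        return value


_cache = _ReadThroughCache(lambda: {"calexp": "storage-for-calexp"})
assert _cache.get("calexp") == "storage-for-calexp"
assert _cache.get("unknown") is None  # cache is now full; no second fetch happens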
321 def register(self, datasetType: DatasetType) -> bool:
322 # Docstring inherited from DatasetRecordStorageManager.
323 if datasetType.isComponent(): 323 ↛ 324 (line 323 didn't jump to line 324, because the condition on line 323 was never true)
324 raise ValueError(
325 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
326 )
327 record = self._fetch_dataset_type_record(datasetType.name)
328 if record is None:
329 dimensionsKey = self._dimensions.save_dimension_group(datasetType.dimensions.as_group())
330 tagTableName = makeTagTableName(datasetType, dimensionsKey)
331 self._db.ensureTableExists(
332 tagTableName,
333 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
334 )
335 calibTableName = (
336 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
337 )
338 if calibTableName is not None:
339 self._db.ensureTableExists(
340 calibTableName,
341 makeCalibTableSpec(
342 datasetType,
343 type(self._collections),
344 self._db.getTimespanRepresentation(),
345 self.getIdColumnType(),
346 ),
347 )
348 row, inserted = self._db.sync(
349 self._static.dataset_type,
350 keys={"name": datasetType.name},
351 compared={
352 "dimensions_key": dimensionsKey,
353 # Force the storage class to be loaded to ensure it
354 # exists and there is no typo in the name.
355 "storage_class": datasetType.storageClass.name,
356 },
357 extra={
358 "tag_association_table": tagTableName,
359 "calibration_association_table": calibTableName,
360 },
361 returning=["id", "tag_association_table"],
362 )
363 # Make sure that cache is updated
364 if self._caching_context.dataset_types is not None and row is not None: 364 ↛ 381 (line 364 didn't jump to line 381, because the condition on line 364 was never false)
365 record = _DatasetTypeRecord(
366 dataset_type=datasetType,
367 dataset_type_id=row["id"],
368 tag_table_name=tagTableName,
369 calib_table_name=calibTableName,
370 )
371 storage = self._make_storage(record)
372 self._caching_context.dataset_types.add(datasetType, storage)
373 else:
374 if datasetType != record.dataset_type:
375 raise ConflictingDefinitionError(
376 f"Given dataset type {datasetType} is inconsistent "
377 f"with database definition {record.dataset_type}."
378 )
379 inserted = False
381 return bool(inserted)
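# Illustrative only: register() above relies on Database.sync() to make
# registration idempotent — insert the row if it is new, otherwise verify the
# stored definition matches. A toy analogue over a plain dict (all names and
# values hypothetical):

def _sync_row(table: dict[tuple, dict], *, keys: dict, compared: dict) -> tuple[dict, bool]:
    key = tuple(sorted(keys.items()))
    existing = table.get(key)
    if existing is None:
        row = {**keys, **compared}
        table[key] = row
        return row, True  # inserted
    if any(existing.get(k) != v for k, v in compared.items()):
        raise ValueError(f"Existing definition {existing} conflicts with {compared}.")
    return existing, False  # already present and consistent


_registry: dict[tuple, dict] = {}
_, inserted = _sync_row(_registry, keys={"name": "calexp"}, compared={"storage_class": "ExposureF"})
assert inserted
_, inserted = _sync_row(_registry, keys={"name": "calexp"}, compared={"storage_class": "ExposureF"})
assert not inserted  # registering the same definition twice is a no-op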
383 def resolve_wildcard(
384 self,
385 expression: Any,
386 missing: list[str] | None = None,
387 explicit_only: bool = False,
388 ) -> list[DatasetType]:
389 wildcard = DatasetTypeWildcard.from_expression(expression)
390 result: list[DatasetType] = []
391 for name, dataset_type in wildcard.values.items():
392 parent_name, component_name = DatasetType.splitDatasetTypeName(name)
393 if component_name is not None:
394 raise DatasetTypeError(
395 "Component dataset types are not supported in Registry methods; use DatasetRef or "
396 "DatasetType methods to obtain components from parents instead."
397 )
398 if (found_storage := self.find(parent_name)) is not None:
399 resolved_dataset_type = found_storage.datasetType
400 if dataset_type is not None:
401 if dataset_type.is_compatible_with(resolved_dataset_type): 401 ↛ 406 (line 401 didn't jump to line 406, because the condition on line 401 was never false)
402 # Prefer the given dataset type to enable storage class
403 # conversions.
404 resolved_dataset_type = dataset_type
405 else:
406 raise DatasetTypeError(
407 f"Dataset type definition in query expression {dataset_type} is "
408 f"not compatible with the registered type {resolved_dataset_type}."
409 )
410 result.append(resolved_dataset_type)
411 elif missing is not None:
412 missing.append(name)
413 if wildcard.patterns is ...:
414 if explicit_only:
415 raise TypeError(
416 "Universal wildcard '...' is not permitted for dataset types in this context."
417 )
418 for datasetType in self._fetch_dataset_types():
419 result.append(datasetType)
420 elif wildcard.patterns:
421 if explicit_only:
422 raise DatasetTypeExpressionError(
423 "Dataset type wildcard expressions are not supported in this context."
424 )
425 dataset_types = self._fetch_dataset_types()
426 for datasetType in dataset_types:
427 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
428 result.append(datasetType)
430 return result
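# Illustrative only: the pattern branch of resolve_wildcard() above reduces to
# matching compiled regular expressions against full dataset type names with
# re.fullmatch. The registered names and pattern below are hypothetical.

import re

registered_names = ["calexp", "calexp_background", "deepCoadd_calexp", "src"]
patterns = [re.compile(r"calexp.*")]

# fullmatch, not search: the pattern must cover the entire name.
matched = [name for name in registered_names if any(p.fullmatch(name) for p in patterns)]
assert matched == ["calexp", "calexp_background"]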
432 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
433 # Docstring inherited from DatasetRecordStorageManager.
434 sql = (
435 sqlalchemy.sql.select(
436 self._static.dataset.columns.dataset_type_id,
437 self._static.dataset.columns[self._collections.getRunForeignKeyName()],
438 *self._static.dataset_type.columns,
439 )
440 .select_from(self._static.dataset)
441 .join(self._static.dataset_type)
442 .where(self._static.dataset.columns.id == id)
443 )
444 with self._db.query(sql) as sql_result:
445 row = sql_result.mappings().fetchone()
446 if row is None:
447 return None
448 record = self._record_from_row(row)
449 storage: DatasetRecordStorage | None = None
450 if self._caching_context.dataset_types is not None: 450 ↛ 452 (line 450 didn't jump to line 452, because the condition on line 450 was never false)
451 _, storage = self._caching_context.dataset_types.get(record.dataset_type.name)
452 if storage is None:
453 storage = self._make_storage(record)
454 if self._caching_context.dataset_types is not None: 454 ↛ 456 (line 454 didn't jump to line 456, because the condition on line 454 was never false)
455 self._caching_context.dataset_types.add(storage.datasetType, storage)
456 assert isinstance(storage, ByDimensionsDatasetRecordStorage), "Not expected storage class"
457 return DatasetRef(
458 storage.datasetType,
459 dataId=storage.getDataId(id=id),
460 id=id,
461 run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
462 )
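# Illustrative only: getDatasetRef() above builds a SELECT that joins the
# static dataset table to the dataset-type table and filters on the dataset id.
# A minimal, self-contained SQLAlchemy sketch of the same select/join/where
# shape, using a toy in-memory schema rather than the real registry tables:

import sqlalchemy

_engine = sqlalchemy.create_engine("sqlite://")
_metadata = sqlalchemy.MetaData()
_dataset_type = sqlalchemy.Table(
    "dataset_type", _metadata,
    sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column("name", sqlalchemy.String),
)
_dataset = sqlalchemy.Table(
    "dataset", _metadata,
    sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column("dataset_type_id", sqlalchemy.ForeignKey("dataset_type.id")),
)
_metadata.create_all(_engine)
with _engine.begin() as conn:
    conn.execute(_dataset_type.insert().values(id=1, name="calexp"))
    conn.execute(_dataset.insert().values(id=10, dataset_type_id=1))

_sql = (
    sqlalchemy.select(_dataset.columns.dataset_type_id, *_dataset_type.columns)
    .select_from(_dataset)
    .join(_dataset_type)  # join condition inferred from the foreign key
    .where(_dataset.columns.id == 10)
)
with _engine.connect() as conn:
    _row = conn.execute(_sql).mappings().fetchone()
assert _row is not None and _row["name"] == "calexp"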
464 def _fetch_dataset_type_record(self, name: str) -> _DatasetTypeRecord | None:
465 """Retrieve all dataset types defined in database.
467 Yields
468 ------
469 dataset_types : `_DatasetTypeRecord`
470 Information from a single database record.
471 """
472 c = self._static.dataset_type.columns
473 stmt = self._static.dataset_type.select().where(c.name == name)
474 with self._db.query(stmt) as sql_result:
475 row = sql_result.mappings().one_or_none()
476 if row is None:
477 return None
478 else:
479 return self._record_from_row(row)
481 def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord:
482 name = row["name"]
483 dimensions = self._dimensions.load_dimension_group(row["dimensions_key"])
484 calibTableName = row["calibration_association_table"]
485 datasetType = DatasetType(
486 name, dimensions, row["storage_class"], isCalibration=(calibTableName is not None)
487 )
488 return _DatasetTypeRecord(
489 dataset_type=datasetType,
490 dataset_type_id=row["id"],
491 tag_table_name=row["tag_association_table"],
492 calib_table_name=calibTableName,
493 )
495 def _dataset_type_from_row(self, row: Mapping) -> DatasetType:
496 return self._record_from_row(row).dataset_type
498 def _fetch_dataset_types(self) -> list[DatasetType]:
499 """Fetch list of all defined dataset types."""
500 if self._caching_context.dataset_types is not None: 500 ↛ 503 (line 500 didn't jump to line 503, because the condition on line 500 was never false)
501 if self._caching_context.dataset_types.full:
502 return [dataset_type for dataset_type, _ in self._caching_context.dataset_types.items()]
503 with self._db.query(self._static.dataset_type.select()) as sql_result:
504 sql_rows = sql_result.mappings().fetchall()
505 records = [self._record_from_row(row) for row in sql_rows]
506 # Cache everything and specify that cache is complete.
507 if self._caching_context.dataset_types is not None: 507 ↛ 510 (line 507 didn't jump to line 510, because the condition on line 507 was never false)
508 cache_data = [(record.dataset_type, self._make_storage(record)) for record in records]
509 self._caching_context.dataset_types.set(cache_data, full=True)
510 return [record.dataset_type for record in records]
512 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
513 # Docstring inherited from DatasetRecordStorageManager.
514 summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_from_row)
515 return summaries[collection.key]
517 def fetch_summaries(
518 self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
519 ) -> Mapping[Any, CollectionSummary]:
520 # Docstring inherited from DatasetRecordStorageManager.
521 dataset_type_names: Iterable[str] | None = None
522 if dataset_types is not None: 522 ↛ 524 (line 522 didn't jump to line 524, because the condition on line 522 was never false)
523 dataset_type_names = set(dataset_type.name for dataset_type in dataset_types)
524 return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row)
526 _versions: list[VersionTuple]
527 """Schema version for this class."""
529 _recordStorageType: type[ByDimensionsDatasetRecordStorage]
530 """Type of the storage class returned by this manager."""
532 _autoincrement: bool
533 """If True then PK column of the dataset table is auto-increment."""
535 _idColumnType: type
536 """Type of dataset column used to store dataset ID."""
539class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
540 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses
541 a UUID for the dataset primary key.
542 """
544 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS]
545 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
546 _autoincrement: bool = False
547 _idColumnType: type = ddl.GUID
549 def clone(
550 self,
551 *,
552 db: Database,
553 collections: CollectionManager,
554 dimensions: DimensionRecordStorageManager,
555 caching_context: CachingContext,
556 ) -> ByDimensionsDatasetRecordStorageManagerUUID:
557 return ByDimensionsDatasetRecordStorageManagerUUID(
558 db=db,
559 collections=collections,
560 dimensions=dimensions,
561 static=self._static,
562 summaries=self._summaries.clone(db=db, collections=collections, caching_context=caching_context),
563 caching_context=caching_context,
564 registry_schema_version=self._registry_schema_version,
565 )
567 @classmethod
568 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
569 # Docstring inherited from DatasetRecordStorageManager.
570 return True
572 @classmethod
573 def _newDefaultSchemaVersion(cls) -> VersionTuple:
574 # Docstring inherited from VersionedExtension.
576 # By default return 1.0.0 so that older clients can still access new
577 # registries created with a default config.
578 return _VERSION_UUID
580 def ingest_date_dtype(self) -> type:
581 """Return type of the ``ingest_date`` column."""
582 schema_version = self.newSchemaVersion()
583 if schema_version is not None and schema_version.major > 1:
584 return ddl.AstropyTimeNsecTai
585 else:
586 return sqlalchemy.TIMESTAMP
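# Illustrative only: ingest_date_dtype() above gates the column type on the
# major schema version (_VERSION_UUID vs _VERSION_UUID_NS near the top of the
# module). The selection amounts to a simple major-version check; plain tuples
# stand in for VersionTuple in this sketch.

def _ingest_date_column_kind(schema_version: tuple[int, int, int] | None) -> str:
    if schema_version is not None and schema_version[0] > 1:
        return "nanoseconds (ddl.AstropyTimeNsecTai)"
    return "TIMESTAMP (sqlalchemy.TIMESTAMP)"


assert _ingest_date_column_kind((1, 0, 0)).startswith("TIMESTAMP")
assert _ingest_date_column_kind((2, 0, 0)).startswith("nanoseconds")
assert _ingest_date_column_kind(None).startswith("TIMESTAMP")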