Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 90% (228 statements)
from __future__ import annotations

from .... import ddl

__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)

import dataclasses
import logging
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any

import sqlalchemy

from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType
from ....dimensions import DimensionUniverse
from ..._collection_summary import CollectionSummary
from ..._exceptions import (
    ConflictingDefinitionError,
    DatasetTypeError,
    DatasetTypeExpressionError,
    OrphanedRecordError,
)
from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ...wildcards import DatasetTypeWildcard
from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:
    from ..._caching_context import CachingContext
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple
# This has to be updated on every schema change.
_VERSION_UUID = VersionTuple(1, 0, 0)
# Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead
# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of
# the client migration period.
_VERSION_UUID_NS = VersionTuple(2, 0, 0)
_LOG = logging.getLogger(__name__)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


@dataclasses.dataclass
class _DatasetTypeRecord:
    """Contents of a single dataset type record."""

    dataset_type: DatasetType
    dataset_type_id: int
    tag_table_name: str
    calib_table_name: str | None


class _SpecTableFactory:
    """Factory for `sqlalchemy.schema.Table` instances that builds tables
    from a provided `ddl.TableSpec` definition and verifies that each table
    exists in the database.
    """

    def __init__(self, db: Database, name: str, spec: ddl.TableSpec):
        self._db = db
        self._name = name
        self._spec = spec

    def __call__(self) -> sqlalchemy.schema.Table:
        table = self._db.getExistingTable(self._name, self._spec)
        if table is None:
            raise MissingDatabaseTableError(f"Table {self._name} is missing from database schema.")
        return table
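# Illustrative sketch of how the factory is used (the table name and spec
# below are placeholders, not real schema objects): the manager keeps one
# factory per dynamic table so the `sqlalchemy.schema.Table` is only resolved
# when a storage object first needs it.
#
#     spec = makeTagTableSpec(dataset_type, collections_cls, ddl.GUID)
#     factory = _SpecTableFactory(db, "dataset_tags_00000001", spec)
#     table = factory()  # raises MissingDatabaseTableError if the table is absent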
class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that have to be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of collections.
    caching_context : `CachingContext`
        Object controlling caching of information returned by managers.
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of the registry schema.
    """
    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(registry_schema_version=registry_schema_version)
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._caching_context = caching_context

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(
            type(collections), universe=dimensions.universe, schema_version=registry_schema_version
        )
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
            dataset_type_table=static.dataset_type,
            caching_context=caching_context,
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            static=static,
            summaries=summaries,
            caching_context=caching_context,
            registry_schema_version=registry_schema_version,
        )
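    # A minimal sketch of how this manager is constructed during schema
    # declaration; `db`, `context`, `collections`, `dimensions`, and
    # `caching_context` are assumed to come from the surrounding Registry
    # bootstrap code and are not defined in this module:
    #
    #     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
    #         db,
    #         context,
    #         collections=collections,
    #         dimensions=dimensions,
    #         caching_context=caching_context,
    #     )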
    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._versions

    @classmethod
    def makeStaticTableSpecs(
        cls,
        collections: type[CollectionManager],
        universe: DimensionUniverse,
        schema_version: VersionTuple | None,
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `type` [ `CollectionManager` ]
            Manager class used for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.
        schema_version : `VersionTuple` or `None`
            Version of the schema that should be created; if `None` the
            default schema version is used.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        schema_version = cls.clsNewSchemaVersion(schema_version)
        assert schema_version is not None, "New schema version cannot be None"
        return makeStaticTableSpecs(
            collections,
            universe=universe,
            dtype=cls.getIdColumnType(),
            autoincrement=cls._autoincrement,
            schema_version=schema_version,
        )
    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )
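    # Sketch of adding a foreign key into the static dataset table from some
    # other table spec (`my_spec` is a hypothetical `ddl.TableSpec`, not
    # something defined in this module):
    #
    #     field = ByDimensionsDatasetRecordStorageManagerUUID.addDatasetForeignKey(
    #         my_spec, name="dataset", onDelete="CASCADE"
    #     )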
    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        if self._caching_context.dataset_types is not None:
            self._caching_context.dataset_types.clear()

    def _make_storage(self, record: _DatasetTypeRecord) -> ByDimensionsDatasetRecordStorage:
        """Create storage instance for a dataset type record."""
        tags_spec = makeTagTableSpec(record.dataset_type, type(self._collections), self.getIdColumnType())
        tags_table_factory = _SpecTableFactory(self._db, record.tag_table_name, tags_spec)
        calibs_table_factory = None
        if record.calib_table_name is not None:
            calibs_spec = makeCalibTableSpec(
                record.dataset_type,
                type(self._collections),
                self._db.getTimespanRepresentation(),
                self.getIdColumnType(),
            )
            calibs_table_factory = _SpecTableFactory(self._db, record.calib_table_name, calibs_spec)
        storage = self._recordStorageType(
            db=self._db,
            datasetType=record.dataset_type,
            static=self._static,
            summaries=self._summaries,
            tags_table_factory=tags_table_factory,
            calibs_table_factory=calibs_table_factory,
            dataset_type_id=record.dataset_type_id,
            collections=self._collections,
            use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
        )
        return storage
    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} can not be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()
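    # Illustrative call pattern (assuming `manager` is an instance of this
    # class and "my_dataset_type" is a registered name, both placeholders);
    # removal fails with OrphanedRecordError while datasets of that type
    # still exist:
    #
    #     try:
    #         manager.remove("my_dataset_type")
    #     except OrphanedRecordError:
    #         ...  # delete the datasets first, then retry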
    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        if self._caching_context.dataset_types is not None:
            _, storage = self._caching_context.dataset_types.get(name)
            if storage is not None:
                return storage
            else:
                # On the first cache miss populate the cache with the
                # complete list of dataset types (if it was not done yet).
                if not self._caching_context.dataset_types.full:
                    self._fetch_dataset_types()
                    # Try again.
                    _, storage = self._caching_context.dataset_types.get(name)
                if self._caching_context.dataset_types.full:
                    # If it is not in the cache then the dataset type is not
                    # defined.
                    return storage
        record = self._fetch_dataset_type_record(name)
        if record is not None:
            storage = self._make_storage(record)
            if self._caching_context.dataset_types is not None:
                self._caching_context.dataset_types.add(storage.datasetType, storage)
            return storage
        else:
            return None
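    # Typical lookup (sketch): the first call may trigger a full fetch that
    # populates the dataset-type cache; `None` means the name is not
    # registered.  "raw" is a placeholder name:
    #
    #     storage = manager.find("raw")
    #     if storage is None:
    #         ...  # dataset type not registered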
    def register(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError(
                f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
            )
        record = self._fetch_dataset_type_record(datasetType.name)
        if record is None:
            dimensionsKey = self._dimensions.save_dimension_group(datasetType.dimensions.as_group())
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            if calibTableName is not None:
                self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            # Make sure that cache is updated
            if self._caching_context.dataset_types is not None and row is not None:
                record = _DatasetTypeRecord(
                    dataset_type=datasetType,
                    dataset_type_id=row["id"],
                    tag_table_name=tagTableName,
                    calib_table_name=calibTableName,
                )
                storage = self._make_storage(record)
                self._caching_context.dataset_types.add(datasetType, storage)
        else:
            if datasetType != record.dataset_type:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {record.dataset_type}."
                )
            inserted = False

        return bool(inserted)
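    # Sketch of registering a new dataset type (the name, dimensions, and
    # storage class are placeholders; `universe` is assumed to be a
    # `DimensionUniverse` available to the caller):
    #
    #     dataset_type = DatasetType(
    #         "deepCoadd",
    #         dimensions=["tract", "patch", "band"],
    #         storageClass="ExposureF",
    #         universe=universe,
    #     )
    #     inserted = manager.register(dataset_type)  # True only on first registration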
    def resolve_wildcard(
        self,
        expression: Any,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> list[DatasetType]:
        wildcard = DatasetTypeWildcard.from_expression(expression)
        result: list[DatasetType] = []
        for name, dataset_type in wildcard.values.items():
            parent_name, component_name = DatasetType.splitDatasetTypeName(name)
            if component_name is not None:
                raise DatasetTypeError(
                    "Component dataset types are not supported in Registry methods; use DatasetRef or "
                    "DatasetType methods to obtain components from parents instead."
                )
            if (found_storage := self.find(parent_name)) is not None:
                resolved_dataset_type = found_storage.datasetType
                if dataset_type is not None:
                    if dataset_type.is_compatible_with(resolved_dataset_type):
                        # Prefer the given dataset type to enable storage
                        # class conversions.
                        resolved_dataset_type = dataset_type
                    else:
                        raise DatasetTypeError(
                            f"Dataset type definition in query expression {dataset_type} is "
                            f"not compatible with the registered type {resolved_dataset_type}."
                        )
                result.append(resolved_dataset_type)
            elif missing is not None:
                missing.append(name)
        if wildcard.patterns is ...:
            if explicit_only:
                raise TypeError(
                    "Universal wildcard '...' is not permitted for dataset types in this context."
                )
            for datasetType in self._fetch_dataset_types():
                result.append(datasetType)
        elif wildcard.patterns:
            if explicit_only:
                raise DatasetTypeExpressionError(
                    "Dataset type wildcard expressions are not supported in this context."
                )
            dataset_types = self._fetch_dataset_types()
            for datasetType in dataset_types:
                if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    result.append(datasetType)

        return result
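    # Sketch of wildcard resolution (names and patterns are illustrative):
    #
    #     import re
    #
    #     everything = manager.resolve_wildcard(...)               # all dataset types
    #     coadds = manager.resolve_wildcard(re.compile("deep.*"))  # regex match
    #     missing: list[str] = []
    #     known = manager.resolve_wildcard(["raw", "not_defined"], missing=missing)
    #     # `missing` now holds the names that are not registered.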
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
                *self._static.dataset_type.columns,
            )
            .select_from(self._static.dataset)
            .join(self._static.dataset_type)
            .where(self._static.dataset.columns.id == id)
        )
        with self._db.query(sql) as sql_result:
            row = sql_result.mappings().fetchone()
        if row is None:
            return None
        record = self._record_from_row(row)
        storage: DatasetRecordStorage | None = None
        if self._caching_context.dataset_types is not None:
            _, storage = self._caching_context.dataset_types.get(record.dataset_type.name)
        if storage is None:
            storage = self._make_storage(record)
            if self._caching_context.dataset_types is not None:
                self._caching_context.dataset_types.add(storage.datasetType, storage)
        assert isinstance(storage, ByDimensionsDatasetRecordStorage), "Not expected storage class"
        return DatasetRef(
            storage.datasetType,
            dataId=storage.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )
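    # Sketch of resolving a dataset ID back into a `DatasetRef` (the UUID
    # value is a placeholder):
    #
    #     import uuid
    #
    #     ref = manager.getDatasetRef(uuid.UUID("00000000-0000-0000-0000-000000000000"))
    #     if ref is not None:
    #         print(ref.datasetType.name, ref.run, ref.dataId)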
    def _fetch_dataset_type_record(self, name: str) -> _DatasetTypeRecord | None:
        """Retrieve the record for a single dataset type.

        Returns
        -------
        dataset_type : `_DatasetTypeRecord` or `None`
            Information from the matching database record, or `None` if no
            dataset type with this name is defined.
        """
        c = self._static.dataset_type.columns
        stmt = self._static.dataset_type.select().where(c.name == name)
        with self._db.query(stmt) as sql_result:
            row = sql_result.mappings().one_or_none()
        if row is None:
            return None
        else:
            return self._record_from_row(row)
    def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord:
        name = row["name"]
        dimensions = self._dimensions.load_dimension_group(row["dimensions_key"])
        calibTableName = row["calibration_association_table"]
        datasetType = DatasetType(
            name, dimensions, row["storage_class"], isCalibration=(calibTableName is not None)
        )
        return _DatasetTypeRecord(
            dataset_type=datasetType,
            dataset_type_id=row["id"],
            tag_table_name=row["tag_association_table"],
            calib_table_name=calibTableName,
        )

    def _dataset_type_from_row(self, row: Mapping) -> DatasetType:
        return self._record_from_row(row).dataset_type

    def _fetch_dataset_types(self) -> list[DatasetType]:
        """Fetch list of all defined dataset types."""
        if self._caching_context.dataset_types is not None:
            if self._caching_context.dataset_types.full:
                return [dataset_type for dataset_type, _ in self._caching_context.dataset_types.items()]
        with self._db.query(self._static.dataset_type.select()) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        records = [self._record_from_row(row) for row in sql_rows]
        # Cache everything and specify that cache is complete.
        if self._caching_context.dataset_types is not None:
            cache_data = [(record.dataset_type, self._make_storage(record)) for record in records]
            self._caching_context.dataset_types.set(cache_data, full=True)
        return [record.dataset_type for record in records]

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_from_row)
        return summaries[collection.key]

    def fetch_summaries(
        self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
    ) -> Mapping[Any, CollectionSummary]:
        # Docstring inherited from DatasetRecordStorageManager.
        dataset_type_names: Iterable[str] | None = None
        if dataset_types is not None:
            dataset_type_names = set(dataset_type.name for dataset_type in dataset_types)
        return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row)
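    # Sketch of summary retrieval (assuming `records` is an iterable of
    # `CollectionRecord` objects obtained from the collection manager);
    # leaving `dataset_types` as `None` summarizes all dataset types:
    #
    #     summaries = manager.fetch_summaries(records)
    #     for collection_key, summary in summaries.items():
    #         ...  # inspect the CollectionSummary for that collection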
    _versions: list[VersionTuple]
    """Schema versions supported by this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If `True` then the PK column of the dataset table is auto-increment."""

    _idColumnType: type
    """Type of the dataset column used to store the dataset ID."""
class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase which
    uses UUID for the dataset primary key.
    """

    _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS]
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True

    @classmethod
    def _newDefaultSchemaVersion(cls) -> VersionTuple:
        # Docstring inherited from VersionedExtension.

        # By default return 1.0.0 so that older clients can still access new
        # registries created with a default config.
        return _VERSION_UUID

    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        schema_version = self.newSchemaVersion()
        if schema_version is not None and schema_version.major > 1:
            return ddl.AstropyTimeNsecTai
        else:
            return sqlalchemy.TIMESTAMP
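    # Sketch of how the schema version changes the ``ingest_date`` column
    # type (assuming managers constructed with each version): a 1.x schema
    # stores a SQL TIMESTAMP, while a 2.x schema stores TAI nanoseconds
    # returned as astropy times.
    #
    #     manager.ingest_date_dtype() is sqlalchemy.TIMESTAMP      # 1.x schema
    #     manager.ingest_date_dtype() is ddl.AstropyTimeNsecTai    # 2.x schema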