Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%
231 statements
coverage.py v7.4.4, created at 2024-04-15 02:02 -0700
1 from __future__ import annotations
3 from .... import ddl
5 __all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)
7 import dataclasses
8 import logging
9 from collections.abc import Iterable, Mapping
10 from typing import TYPE_CHECKING, Any
12 import sqlalchemy
14 from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType
15 from ...._exceptions_legacy import DatasetTypeError
16 from ....dimensions import DimensionUniverse
17 from ..._collection_summary import CollectionSummary
18 from ..._exceptions import ConflictingDefinitionError, DatasetTypeExpressionError, OrphanedRecordError
19 from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
20 from ...wildcards import DatasetTypeWildcard
21 from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
22 from .summaries import CollectionSummaryManager
23 from .tables import (
24 addDatasetForeignKey,
25 makeCalibTableName,
26 makeCalibTableSpec,
27 makeStaticTableSpecs,
28 makeTagTableName,
29 makeTagTableSpec,
30 )
32 if TYPE_CHECKING:
33 from ..._caching_context import CachingContext
34 from ...interfaces import (
35 CollectionManager,
36 CollectionRecord,
37 Database,
38 DimensionRecordStorageManager,
39 StaticTablesContext,
40 )
41 from .tables import StaticDatasetTablesTuple
44 # This has to be updated on every schema change
45 _VERSION_UUID = VersionTuple(1, 0, 0)
46 # Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead
47 # of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of
48 # the client migration period.
49 _VERSION_UUID_NS = VersionTuple(2, 0, 0)
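# Illustrative summary, not part of the original module, based on the
# ingest_date_dtype() method defined near the end of this file: the schema
# version determines the type of the ``ingest_date`` column, roughly
#
#     _VERSION_UUID    (1.0.0) -> sqlalchemy.TIMESTAMP
#     _VERSION_UUID_NS (2.0.0) -> ddl.AstropyTimeNsecTai (TAI nanoseconds)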
51 _LOG = logging.getLogger(__name__)
54 class MissingDatabaseTableError(RuntimeError):
55 """Exception raised when a table is not found in a database."""
58 @dataclasses.dataclass
59 class _DatasetTypeRecord:
60 """Contents of a single dataset type record."""
62 dataset_type: DatasetType
63 dataset_type_id: int
64 tag_table_name: str
65 calib_table_name: str | None
68 class _SpecTableFactory:
69 """Factory for `sqlalchemy.schema.Table` instances that builds a table
70 from the provided `ddl.TableSpec` definition and verifies that the
71 table exists in the database.
72 """
74 def __init__(self, db: Database, name: str, spec: ddl.TableSpec):
75 self._db = db
76 self._name = name
77 self._spec = spec
79 def __call__(self) -> sqlalchemy.schema.Table:
80 table = self._db.getExistingTable(self._name, self._spec)
81 if table is None:  (81 ↛ 82: line 81 didn't jump to line 82, because the condition on line 81 was never true)
82 raise MissingDatabaseTableError(f"Table {self._name} is missing from database schema.")
83 return table
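# Illustrative sketch, not part of the original module: how _make_storage()
# further down uses this factory to defer table lookup until a storage object
# actually needs the table. The function name and its ``db``, ``name`` and
# ``spec`` arguments are hypothetical placeholders supplied by a caller.
def _example_lazy_table_lookup(db: Database, name: str, spec: ddl.TableSpec) -> sqlalchemy.schema.Table:
    # The factory only captures the table name and spec; the database is not
    # consulted until the factory is called.
    factory = _SpecTableFactory(db, name, spec)
    # A missing table surfaces as MissingDatabaseTableError rather than a
    # silent None from Database.getExistingTable().
    return factory()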
86 class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
87 """A manager class for datasets that uses one dataset-collection table for
88 each group of dataset types that share the same dimensions.
90 In addition to the table organization, this class makes a number of
91 other design choices that would have been cumbersome (to say the least) to
92 try to pack into its name:
94 - It uses a private surrogate integer autoincrement field to identify
95 dataset types, instead of using the name as the primary and foreign key
96 directly.
98 - It aggressively loads all DatasetTypes into memory instead of fetching
99 them from the database only when needed or attempting more clever forms
100 of caching.
102 Alternative implementations that make different choices for these while
103 keeping the same general table organization might be reasonable as well.
105 This class provides a complete implementation of the manager logic, but it
106 is parametrized by a few class attributes that have to be defined by
107 sub-classes.
109 Parameters
110 ----------
111 db : `Database`
112 Interface to the underlying database engine and namespace.
113 collections : `CollectionManager`
114 Manager object for the collections in this `Registry`.
115 dimensions : `DimensionRecordStorageManager`
116 Manager object for the dimensions in this `Registry`.
117 static : `StaticDatasetTablesTuple`
118 Named tuple of `sqlalchemy.schema.Table` instances for all static
119 tables used by this class.
120 summaries : `CollectionSummaryManager`
121 Structure containing tables that summarize the contents of collections.
122 caching_context : `CachingContext`
123 Object controlling caching of information returned by managers.
124 registry_schema_version : `VersionTuple` or `None`, optional
125 Version of registry schema.
126 """
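# A sketch of the "one table per dimension group" organization described
# above, shown as comments only so the class body is unchanged; the calls
# mirror what register() does further down, and the variable names are
# hypothetical:
#
#     dimensions_key = self._dimensions.save_dimension_group(dataset_type.dimensions.as_group())
#     tag_table_name = makeTagTableName(dataset_type, dimensions_key)
#
# Dataset types with identical dimensions map to the same dimensions key and
# therefore share a single tags table (and, for calibration dataset types, a
# single calibs table).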
128 def __init__(
129 self,
130 *,
131 db: Database,
132 collections: CollectionManager,
133 dimensions: DimensionRecordStorageManager,
134 static: StaticDatasetTablesTuple,
135 summaries: CollectionSummaryManager,
136 caching_context: CachingContext,
137 registry_schema_version: VersionTuple | None = None,
138 ):
139 super().__init__(registry_schema_version=registry_schema_version)
140 self._db = db
141 self._collections = collections
142 self._dimensions = dimensions
143 self._static = static
144 self._summaries = summaries
145 self._caching_context = caching_context
147 @classmethod
148 def initialize(
149 cls,
150 db: Database,
151 context: StaticTablesContext,
152 *,
153 collections: CollectionManager,
154 dimensions: DimensionRecordStorageManager,
155 caching_context: CachingContext,
156 registry_schema_version: VersionTuple | None = None,
157 ) -> DatasetRecordStorageManager:
158 # Docstring inherited from DatasetRecordStorageManager.
159 specs = cls.makeStaticTableSpecs(
160 type(collections), universe=dimensions.universe, schema_version=registry_schema_version
161 )
162 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore
163 summaries = CollectionSummaryManager.initialize(
164 db,
165 context,
166 collections=collections,
167 dimensions=dimensions,
168 dataset_type_table=static.dataset_type,
169 caching_context=caching_context,
170 )
171 return cls(
172 db=db,
173 collections=collections,
174 dimensions=dimensions,
175 static=static,
176 summaries=summaries,
177 caching_context=caching_context,
178 registry_schema_version=registry_schema_version,
179 )
181 @classmethod
182 def currentVersions(cls) -> list[VersionTuple]:
183 # Docstring inherited from VersionedExtension.
184 return cls._versions
186 @classmethod
187 def makeStaticTableSpecs(
188 cls,
189 collections: type[CollectionManager],
190 universe: DimensionUniverse,
191 schema_version: VersionTuple | None,
192 ) -> StaticDatasetTablesTuple:
193 """Construct all static tables used by the classes in this package.
195 Static tables are those that are present in all Registries and do not
196 depend on what DatasetTypes have been registered.
198 Parameters
199 ----------
200 collections : `CollectionManager`
201 Manager object for the collections in this `Registry`.
202 universe : `DimensionUniverse`
203 Universe graph containing all dimensions known to this `Registry`.
204 schema_version : `VersionTuple` or `None`
205 Version of the schema that should be created; if `None`, the
206 default schema is used.
208 Returns
209 -------
210 specs : `StaticDatasetTablesTuple`
211 A named tuple containing `ddl.TableSpec` instances.
212 """
213 schema_version = cls.clsNewSchemaVersion(schema_version)
214 assert schema_version is not None, "New schema version cannot be None"
215 return makeStaticTableSpecs(
216 collections,
217 universe=universe,
218 dtype=cls.getIdColumnType(),
219 autoincrement=cls._autoincrement,
220 schema_version=schema_version,
221 )
223 @classmethod
224 def getIdColumnType(cls) -> type:
225 # Docstring inherited from base class.
226 return cls._idColumnType
228 @classmethod
229 def addDatasetForeignKey(
230 cls,
231 tableSpec: ddl.TableSpec,
232 *,
233 name: str = "dataset",
234 constraint: bool = True,
235 onDelete: str | None = None,
236 **kwargs: Any,
237 ) -> ddl.FieldSpec:
238 # Docstring inherited from DatasetRecordStorageManager.
239 return addDatasetForeignKey(
240 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
241 )
243 def refresh(self) -> None:
244 # Docstring inherited from DatasetRecordStorageManager.
245 if self._caching_context.dataset_types is not None:  (245 ↛ exit: line 245 didn't return from function 'refresh', because the condition on line 245 was never false)
246 self._caching_context.dataset_types.clear()
248 def _make_storage(self, record: _DatasetTypeRecord) -> ByDimensionsDatasetRecordStorage:
249 """Create storage instance for a dataset type record."""
250 tags_spec = makeTagTableSpec(record.dataset_type, type(self._collections), self.getIdColumnType())
251 tags_table_factory = _SpecTableFactory(self._db, record.tag_table_name, tags_spec)
252 calibs_table_factory = None
253 if record.calib_table_name is not None:
254 calibs_spec = makeCalibTableSpec(
255 record.dataset_type,
256 type(self._collections),
257 self._db.getTimespanRepresentation(),
258 self.getIdColumnType(),
259 )
260 calibs_table_factory = _SpecTableFactory(self._db, record.calib_table_name, calibs_spec)
261 storage = self._recordStorageType(
262 db=self._db,
263 datasetType=record.dataset_type,
264 static=self._static,
265 summaries=self._summaries,
266 tags_table_factory=tags_table_factory,
267 calibs_table_factory=calibs_table_factory,
268 dataset_type_id=record.dataset_type_id,
269 collections=self._collections,
270 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
271 )
272 return storage
274 def remove(self, name: str) -> None:
275 # Docstring inherited from DatasetRecordStorageManager.
276 compositeName, componentName = DatasetType.splitDatasetTypeName(name)
277 if componentName is not None:  (277 ↛ 278: line 277 didn't jump to line 278, because the condition on line 277 was never true)
278 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")
280 # Delete the row
281 try:
282 self._db.delete(self._static.dataset_type, ["name"], {"name": name})
283 except sqlalchemy.exc.IntegrityError as e:
284 raise OrphanedRecordError(
285 f"Dataset type {name} can not be removed."
286 " It is associated with datasets that must be removed first."
287 ) from e
289 # Now refresh everything -- removal is rare enough that this does
290 # not need to be fast.
291 self.refresh()
293 def find(self, name: str) -> DatasetRecordStorage | None:
294 # Docstring inherited from DatasetRecordStorageManager.
295 if self._caching_context.dataset_types is not None:  (295 ↛ 308: line 295 didn't jump to line 308, because the condition on line 295 was never false)
296 _, storage = self._caching_context.dataset_types.get(name)
297 if storage is not None:
298 return storage
299 else:
300 # On the first cache miss populate the cache with complete list
301 # of dataset types (if it was not done yet).
302 if not self._caching_context.dataset_types.full:
303 self._fetch_dataset_types()
304 # Try again
305 _, storage = self._caching_context.dataset_types.get(name)
306 if storage is not None:
307 return storage
308 record = self._fetch_dataset_type_record(name)
309 if record is not None:  (309 ↛ 310: line 309 didn't jump to line 310, because the condition on line 309 was never true)
310 storage = self._make_storage(record)
311 if self._caching_context.dataset_types is not None:
312 self._caching_context.dataset_types.add(storage.datasetType, storage)
313 return storage
314 else:
315 return None
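# Summary of the lookup order in find() above, added here as comments only:
#
#     1. look the name up in the dataset-type cache;
#     2. on a miss, populate the cache with the complete list of dataset
#        types via _fetch_dataset_types() and try the cache again;
#     3. if the name still cannot be resolved (or caching is disabled), fall
#        back to a direct query through _fetch_dataset_type_record().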
317 def register(self, datasetType: DatasetType) -> bool:
318 # Docstring inherited from DatasetRecordStorageManager.
319 if datasetType.isComponent():  (319 ↛ 320: line 319 didn't jump to line 320, because the condition on line 319 was never true)
320 raise ValueError(
321 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
322 )
323 record = self._fetch_dataset_type_record(datasetType.name)
324 if record is None:
325 dimensionsKey = self._dimensions.save_dimension_group(datasetType.dimensions.as_group())
326 tagTableName = makeTagTableName(datasetType, dimensionsKey)
327 self._db.ensureTableExists(
328 tagTableName,
329 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
330 )
331 calibTableName = (
332 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
333 )
334 if calibTableName is not None:
335 self._db.ensureTableExists(
336 calibTableName,
337 makeCalibTableSpec(
338 datasetType,
339 type(self._collections),
340 self._db.getTimespanRepresentation(),
341 self.getIdColumnType(),
342 ),
343 )
344 row, inserted = self._db.sync(
345 self._static.dataset_type,
346 keys={"name": datasetType.name},
347 compared={
348 "dimensions_key": dimensionsKey,
349 # Force the storage class to be loaded to ensure it
350 # exists and there is no typo in the name.
351 "storage_class": datasetType.storageClass.name,
352 },
353 extra={
354 "tag_association_table": tagTableName,
355 "calibration_association_table": calibTableName,
356 },
357 returning=["id", "tag_association_table"],
358 )
359 # Make sure that cache is updated
360 if self._caching_context.dataset_types is not None and row is not None:  (360 ↛ 377: line 360 didn't jump to line 377, because the condition on line 360 was never false)
361 record = _DatasetTypeRecord(
362 dataset_type=datasetType,
363 dataset_type_id=row["id"],
364 tag_table_name=tagTableName,
365 calib_table_name=calibTableName,
366 )
367 storage = self._make_storage(record)
368 self._caching_context.dataset_types.add(datasetType, storage)
369 else:
370 if datasetType != record.dataset_type:
371 raise ConflictingDefinitionError(
372 f"Given dataset type {datasetType} is inconsistent "
373 f"with database definition {record.dataset_type}."
374 )
375 inserted = False
377 return bool(inserted)
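# register() above is idempotent. Illustrative outcomes, shown as comments
# with a hypothetical manager instance and hypothetical dataset types:
#
#     manager.register(dataset_type)   # first registration: tables ensured,
#                                      # row synced, normally returns True
#     manager.register(dataset_type)   # identical definition again: False
#     manager.register(conflicting)    # same name, different definition:
#                                      # raises ConflictingDefinitionError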
379 def resolve_wildcard(
380 self,
381 expression: Any,
382 missing: list[str] | None = None,
383 explicit_only: bool = False,
384 ) -> list[DatasetType]:
385 wildcard = DatasetTypeWildcard.from_expression(expression)
386 result: list[DatasetType] = []
387 for name, dataset_type in wildcard.values.items():
388 parent_name, component_name = DatasetType.splitDatasetTypeName(name)
389 if component_name is not None:
390 raise DatasetTypeError(
391 "Component dataset types are not supported in Registry methods; use DatasetRef or "
392 "DatasetType methods to obtain components from parents instead."
393 )
394 if (found_storage := self.find(parent_name)) is not None:
395 resolved_dataset_type = found_storage.datasetType
396 if dataset_type is not None:
397 if dataset_type.is_compatible_with(resolved_dataset_type):  (397 ↛ 402: line 397 didn't jump to line 402, because the condition on line 397 was never false)
398 # Prefer the given dataset type to enable storage class
399 # conversions.
400 resolved_dataset_type = dataset_type
401 else:
402 raise DatasetTypeError(
403 f"Dataset type definition in query expression {dataset_type} is "
404 f"not compatible with the registered type {resolved_dataset_type}."
405 )
406 result.append(resolved_dataset_type)
407 elif missing is not None:
408 missing.append(name)
409 if wildcard.patterns is ...:
410 if explicit_only:
411 raise TypeError(
412 "Universal wildcard '...' is not permitted for dataset types in this context."
413 )
414 for datasetType in self._fetch_dataset_types():
415 result.append(datasetType)
416 elif wildcard.patterns:
417 if explicit_only:
418 raise DatasetTypeExpressionError(
419 "Dataset type wildcard expressions are not supported in this context."
420 )
421 dataset_types = self._fetch_dataset_types()
422 for datasetType in dataset_types:
423 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
424 result.append(datasetType)
426 return result
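# Expression forms accepted by resolve_wildcard() above, illustrated as
# comments with hypothetical dataset type names:
#
#     resolve_wildcard("raw")                   # explicit name
#     resolve_wildcard(a_dataset_type)          # DatasetType instance; must be
#                                               # compatible with the registered one
#     resolve_wildcard(re.compile("calexp.*"))  # regex pattern; rejected when
#                                               # explicit_only=True
#     resolve_wildcard(...)                     # universal wildcard; rejected when
#                                               # explicit_only=True
#
# Component dataset type names (e.g. "raw.wcs") are rejected with
# DatasetTypeError.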
428 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
429 # Docstring inherited from DatasetRecordStorageManager.
430 sql = (
431 sqlalchemy.sql.select(
432 self._static.dataset.columns.dataset_type_id,
433 self._static.dataset.columns[self._collections.getRunForeignKeyName()],
434 *self._static.dataset_type.columns,
435 )
436 .select_from(self._static.dataset)
437 .join(self._static.dataset_type)
438 .where(self._static.dataset.columns.id == id)
439 )
440 with self._db.query(sql) as sql_result:
441 row = sql_result.mappings().fetchone()
442 if row is None:
443 return None
444 record = self._record_from_row(row)
445 storage: DatasetRecordStorage | None = None
446 if self._caching_context.dataset_types is not None:  (446 ↛ 448: line 446 didn't jump to line 448, because the condition on line 446 was never false)
447 _, storage = self._caching_context.dataset_types.get(record.dataset_type.name)
448 if storage is None:
449 storage = self._make_storage(record)
450 if self._caching_context.dataset_types is not None:  (450 ↛ 452: line 450 didn't jump to line 452, because the condition on line 450 was never false)
451 self._caching_context.dataset_types.add(storage.datasetType, storage)
452 assert isinstance(storage, ByDimensionsDatasetRecordStorage), "Not expected storage class"
453 return DatasetRef(
454 storage.datasetType,
455 dataId=storage.getDataId(id=id),
456 id=id,
457 run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
458 )
460 def _fetch_dataset_type_record(self, name: str) -> _DatasetTypeRecord | None:
461 """Retrieve the record for a single dataset type, if it exists.
463 Returns
464 -------
465 record : `_DatasetTypeRecord` or `None`
466 Information from the matching database record, or `None` if no
467 dataset type with this name is defined.
468 c = self._static.dataset_type.columns
469 stmt = self._static.dataset_type.select().where(c.name == name)
470 with self._db.query(stmt) as sql_result:
471 row = sql_result.mappings().one_or_none()
472 if row is None:
473 return None
474 else:
475 return self._record_from_row(row)
477 def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord:
478 name = row["name"]
479 dimensions = self._dimensions.load_dimension_group(row["dimensions_key"])
480 calibTableName = row["calibration_association_table"]
481 datasetType = DatasetType(
482 name, dimensions, row["storage_class"], isCalibration=(calibTableName is not None)
483 )
484 return _DatasetTypeRecord(
485 dataset_type=datasetType,
486 dataset_type_id=row["id"],
487 tag_table_name=row["tag_association_table"],
488 calib_table_name=calibTableName,
489 )
491 def _dataset_type_from_row(self, row: Mapping) -> DatasetType:
492 return self._record_from_row(row).dataset_type
494 def _fetch_dataset_types(self) -> list[DatasetType]:
495 """Fetch list of all defined dataset types."""
496 if self._caching_context.dataset_types is not None:  (496 ↛ 499: line 496 didn't jump to line 499, because the condition on line 496 was never false)
497 if self._caching_context.dataset_types.full:
498 return [dataset_type for dataset_type, _ in self._caching_context.dataset_types.items()]
499 with self._db.query(self._static.dataset_type.select()) as sql_result:
500 sql_rows = sql_result.mappings().fetchall()
501 records = [self._record_from_row(row) for row in sql_rows]
502 # Cache everything and specify that cache is complete.
503 if self._caching_context.dataset_types is not None:  (503 ↛ 506: line 503 didn't jump to line 506, because the condition on line 503 was never false)
504 cache_data = [(record.dataset_type, self._make_storage(record)) for record in records]
505 self._caching_context.dataset_types.set(cache_data, full=True)
506 return [record.dataset_type for record in records]
508 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
509 # Docstring inherited from DatasetRecordStorageManager.
510 summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_from_row)
511 return summaries[collection.key]
513 def fetch_summaries(
514 self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
515 ) -> Mapping[Any, CollectionSummary]:
516 # Docstring inherited from DatasetRecordStorageManager.
517 dataset_type_names: Iterable[str] | None = None
518 if dataset_types is not None:  (518 ↛ 520: line 518 didn't jump to line 520, because the condition on line 518 was never false)
519 dataset_type_names = set(dataset_type.name for dataset_type in dataset_types)
520 return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row)
522 _versions: list[VersionTuple]
523 """Schema versions supported by this class."""
525 _recordStorageType: type[ByDimensionsDatasetRecordStorage]
526 """Type of the storage class returned by this manager."""
528 _autoincrement: bool
529 """If True then PK column of the dataset table is auto-increment."""
531 _idColumnType: type
532 """Type of dataset column used to store dataset ID."""
535 class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
536 """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that uses
537 a UUID for the dataset primary key.
538 """
540 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS]
541 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
542 _autoincrement: bool = False
543 _idColumnType: type = ddl.GUID
545 def clone(
546 self,
547 *,
548 db: Database,
549 collections: CollectionManager,
550 dimensions: DimensionRecordStorageManager,
551 caching_context: CachingContext,
552 ) -> ByDimensionsDatasetRecordStorageManagerUUID:
553 return ByDimensionsDatasetRecordStorageManagerUUID(
554 db=db,
555 collections=collections,
556 dimensions=dimensions,
557 static=self._static,
558 summaries=self._summaries.clone(db=db, collections=collections, caching_context=caching_context),
559 caching_context=caching_context,
560 registry_schema_version=self._registry_schema_version,
561 )
563 @classmethod
564 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
565 # Docstring inherited from DatasetRecordStorageManager.
566 return True
568 @classmethod
569 def _newDefaultSchemaVersion(cls) -> VersionTuple:
570 # Docstring inherited from VersionedExtension.
572 # By default return 1.0.0 so that older clients can still access new
573 # registries created with a default config.
574 return _VERSION_UUID
576 def ingest_date_dtype(self) -> type:
577 """Return the type of the ``ingest_date`` column."""
578 schema_version = self.newSchemaVersion()
579 if schema_version is not None and schema_version.major > 1:
580 return ddl.AstropyTimeNsecTai
581 else:
582 return sqlalchemy.TIMESTAMP