Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 95%
204 statements

from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)

import logging
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Any

import sqlalchemy

from ....core import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ...wildcards import DatasetTypeWildcard
from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change.
_VERSION_UUID = VersionTuple(1, 0, 0)
# Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead
# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of
# the client migration period.
_VERSION_UUID_NS = VersionTuple(2, 0, 0)
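
# In practice this means that a registry created with schema version 1.0.0
# stores ``ingest_date`` as a database TIMESTAMP, while one created with
# 2.0.0 stores TAI nanoseconds (see ``ingest_date_dtype`` below).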

_LOG = logging.getLogger(__name__)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of collections.
    registry_schema_version : `VersionTuple` or `None`, optional
        Schema version of this extension as defined in the registry.
    """

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(registry_schema_version=registry_schema_version)
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(
            type(collections), universe=dimensions.universe, schema_version=registry_schema_version
        )
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            static=static,
            summaries=summaries,
            registry_schema_version=registry_schema_version,
        )
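
    # Illustrative construction sketch (hypothetical driver code; in practice
    # the Registry instantiates its managers during schema creation):
    #
    #     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
    #         db, context, collections=collections, dimensions=dimensions
    #     )
    #     manager.refresh()  # load all registered dataset types into memory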

    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._versions

    @classmethod
    def makeStaticTableSpecs(
        cls,
        collections: type[CollectionManager],
        universe: DimensionUniverse,
        schema_version: VersionTuple | None,
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.
        schema_version : `VersionTuple` or `None`
            Version of the schema that should be created; if `None`, the
            default schema version is used.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        schema_version = cls.clsNewSchemaVersion(schema_version)
        assert schema_version is not None, "New schema version cannot be None"
        return makeStaticTableSpecs(
            collections,
            universe=universe,
            dtype=cls.getIdColumnType(),
            autoincrement=cls._autoincrement,
            schema_version=schema_version,
        )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
        dataset_types: dict[int, DatasetType] = {}
        c = self._static.dataset_type.columns
        with self._db.query(self._static.dataset_type.select()) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
                use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
            dataset_types[row["id"]] = datasetType
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(dataset_types)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} can not be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._byName.get(name)

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError(
                f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here: we want to create the tables first
            # and only register them if that operation succeeds. We cannot
            # wrap it all in a transaction because the database class assumes
            # that DDL is not transaction safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
                use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)
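
    # Illustrative use of ``register`` above (hypothetical caller code):
    # registration is idempotent, so callers can write
    #
    #     storage, inserted = manager.register(dataset_type)
    #     if inserted:
    #         ...  # tag/calib tables were just created for this dataset type
    #
    # while re-registering with a conflicting definition raises
    # ConflictingDefinitionError.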

    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        wildcard = DatasetTypeWildcard.from_expression(expression)
        result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
        # This message can be transformed into an error on DM-36303 after v26,
        # and the components and components_deprecated arguments can be merged
        # into one on DM-36457 after v27.
        deprecation_message = (
            "Querying for component datasets via Registry query methods is deprecated in favor of using "
            "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
            "after v26, and the components argument will be removed after v27."
        )
        for name, dataset_type in wildcard.values.items():
            parent_name, component_name = DatasetType.splitDatasetTypeName(name)
            if component_name is not None and components_deprecated:
                warnings.warn(deprecation_message, FutureWarning)
            if (found_storage := self.find(parent_name)) is not None:
                found_parent = found_storage.datasetType
                if component_name is not None:
                    found = found_parent.makeComponentDatasetType(component_name)
                else:
                    found = found_parent
                if dataset_type is not None:
                    if dataset_type.is_compatible_with(found):
                        # Prefer the given dataset type to enable storage class
                        # conversions.
                        if component_name is not None:
                            found_parent = dataset_type.makeCompositeDatasetType()
                        else:
                            found_parent = dataset_type
                    else:
                        raise DatasetTypeError(
                            f"Dataset type definition in query expression {dataset_type} is "
                            f"not compatible with the registered type {found}."
                        )
                result[found_parent].add(component_name)
            elif missing is not None:
                missing.append(name)
        already_warned = False
        if wildcard.patterns is ...:
            if explicit_only:
                raise TypeError(
                    "Universal wildcard '...' is not permitted for dataset types in this context."
                )
            for storage in self._byName.values():
                result[storage.datasetType].add(None)
                if components:
                    try:
                        result[storage.datasetType].update(
                            storage.datasetType.storageClass.allComponents().keys()
                        )
                        if (
                            storage.datasetType.storageClass.allComponents()
                            and not already_warned
                            and components_deprecated
                        ):
                            warnings.warn(deprecation_message, FutureWarning)
                            already_warned = True
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results.",
                        )
        elif wildcard.patterns:
            if explicit_only:
                # After v26 this should raise DatasetTypeExpressionError, to
                # be implemented on DM-36303.
                warnings.warn(
                    "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
                    FutureWarning,
                )
            for storage in self._byName.values():
                if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
                    result[storage.datasetType].add(None)
            if components is not False:
                for storage in self._byName.values():
                    if components is None and storage.datasetType in result:
                        continue
                    try:
                        components_for_parent = storage.datasetType.storageClass.allComponents().keys()
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
                        continue
                    for component_name in components_for_parent:
                        if any(
                            p.fullmatch(
                                DatasetType.nameWithComponent(storage.datasetType.name, component_name)
                            )
                            for p in wildcard.patterns
                        ):
                            result[storage.datasetType].add(component_name)
                            if not already_warned and components_deprecated:
                                warnings.warn(deprecation_message, FutureWarning)
                                already_warned = True
        return {k: list(v) for k, v in result.items()}
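
    # Illustrative behaviour of ``resolve_wildcard`` above (hypothetical
    # names): resolving a pattern such as re.compile("raw.*") with
    # components=False yields a mapping like {<raw dataset type>: [None]};
    # component names are added to the lists only when components are
    # requested, which currently triggers the FutureWarning defined above.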

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        with self._db.query(sql) as sql_result:
            row = sql_result.mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    _versions: list[VersionTuple]
    """Schema versions supported by this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True then the PK column of the dataset table is auto-increment."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that uses
    UUID for the dataset primary key.
    """

    _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS]
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True

    @classmethod
    def _newDefaultSchemaVersion(cls) -> VersionTuple:
        # Docstring inherited from VersionedExtension.

        # By default return 1.0.0 so that older clients can still access new
        # registries created with a default config.
        return _VERSION_UUID

    def ingest_date_dtype(self) -> type:
        """Return type of the ``ingest_date`` column."""
        schema_version = self.newSchemaVersion()
        if schema_version is not None and schema_version.major > 1:
            return ddl.AstropyTimeNsecTai
        else:
            return sqlalchemy.TIMESTAMP