Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 94%
197 statements
coverage.py v6.5.0, created at 2023-03-11 02:06 -0800
from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)

import logging
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Any

import sqlalchemy
from lsst.utils.ellipsis import Ellipsis

from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ...wildcards import DatasetTypeWildcard
from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)

_LOG = logging.getLogger(__name__)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that have to be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of collections.
    """

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
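        # In-memory caches of per-dataset-type storage objects, keyed by
        # dataset type name and by the surrogate integer id used in the
        # database; both are (re)populated by refresh() and register().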
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
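        # Declare the static dataset tables in the shared schema-creation
        # context, then set up the tables that summarize the contents of
        # collections.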
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> VersionTuple | None:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(
        cls, collections: type[CollectionManager], universe: DimensionUniverse
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `type` [ `CollectionManager` ]
            Manager class for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(
            collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
        )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
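        # Rebuild the caches from scratch: read every row of the static
        # dataset_type table, reconstruct each DatasetType, and look up the
        # dynamic tag/calib tables named by the row (they must already exist).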
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        with self._db.query(self._static.dataset_type.select()) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
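        # Give the summary manager a way to map a dataset_type_id from its own
        # tables back to the DatasetType we just loaded.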
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} cannot be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._byName.get(name)

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():
            raise ValueError(
                f"Component dataset types cannot be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here, we want to create tables first and
            # only register them if this operation is successful. We cannot
            # wrap it into a transaction because database class assumes that
            # DDL is not transaction safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
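            # sync() inserts the dataset_type row if it does not exist and
            # otherwise checks that the "compared" fields match what is
            # already in the database; a mismatch surfaces as a conflict.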
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)

    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        wildcard = DatasetTypeWildcard.from_expression(expression)
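        # Accumulate results as parent dataset type -> set of component names,
        # where None stands for the parent dataset type itself.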
        result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
        # This message can be transformed into an error on DM-36303 after v26,
        # and the components and components_deprecated arguments can be merged
        # into one on DM-36457 after v27.
        deprecation_message = (
            "Querying for component datasets via Registry query methods is deprecated in favor of using "
            "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
            "after v26, and the components argument will be removed after v27."
        )
        for name, dataset_type in wildcard.values.items():
            parent_name, component_name = DatasetType.splitDatasetTypeName(name)
            if component_name is not None and components_deprecated:
                warnings.warn(deprecation_message, FutureWarning)
            if (found_storage := self.find(parent_name)) is not None:
                found_parent = found_storage.datasetType
                if component_name is not None:
                    found = found_parent.makeComponentDatasetType(component_name)
                else:
                    found = found_parent
                if dataset_type is not None:
                    if dataset_type.is_compatible_with(found):
                        # Prefer the given dataset type to enable storage class
                        # conversions.
                        if component_name is not None:
                            found_parent = dataset_type.makeCompositeDatasetType()
                        else:
                            found_parent = dataset_type
                    else:
                        raise DatasetTypeError(
                            f"Dataset type definition in query expression {dataset_type} is "
                            f"not compatible with the registered type {found}."
                        )
                result[found_parent].add(component_name)
            elif missing is not None:
                missing.append(name)
        already_warned = False
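        # Now handle pattern expressions: the universal wildcard "..." matches
        # every registered dataset type, while regular expressions are matched
        # against parent names first and component names second.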
        if wildcard.patterns is Ellipsis:
            if explicit_only:
                raise TypeError(
                    "Universal wildcard '...' is not permitted for dataset types in this context."
                )
            for storage in self._byName.values():
                result[storage.datasetType].add(None)
                if components:
                    try:
                        result[storage.datasetType].update(
                            storage.datasetType.storageClass.allComponents().keys()
                        )
                        if (
                            storage.datasetType.storageClass.allComponents()
                            and not already_warned
                            and components_deprecated
                        ):
                            warnings.warn(deprecation_message, FutureWarning)
                            already_warned = True
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results.",
                        )
        elif wildcard.patterns:
            if explicit_only:
                # After v26 this should raise DatasetTypeExpressionError, to
                # be implemented on DM-36303.
                warnings.warn(
                    "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
                    FutureWarning,
                )
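            # Match the patterns against parent dataset type names first, then
            # (unless components are disabled) against component names.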
            for storage in self._byName.values():
                if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
                    result[storage.datasetType].add(None)
            if components is not False:
                for storage in self._byName.values():
                    if components is None and storage.datasetType in result:
                        continue
                    try:
                        components_for_parent = storage.datasetType.storageClass.allComponents().keys()
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
                        continue
                    for component_name in components_for_parent:
                        if any(
                            p.fullmatch(
                                DatasetType.nameWithComponent(storage.datasetType.name, component_name)
                            )
                            for p in wildcard.patterns
                        ):
                            result[storage.datasetType].add(component_name)
                            if not already_warned and components_deprecated:
                                warnings.warn(deprecation_message, FutureWarning)
                                already_warned = True
        return {k: list(v) for k, v in result.items()}

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
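        # Fetch the dataset type id and run key for this dataset id from the
        # static dataset table, then delegate to the per-type storage object
        # to reconstruct the data ID.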
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        with self._db.query(sql) as sql_result:
            row = sql_result.mappings().fetchone()
        if row is None:
            return None
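        # The dataset type may have been registered by another client since
        # our last refresh, so reload the cache on a miss; the foreign key
        # constraint guarantees the row must exist after that.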
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> str | None:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True then PK column of the dataset table is auto-increment."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that
    uses UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
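

# A minimal usage sketch (illustrative only; in practice the Registry
# constructs and wires up these managers, and `db`, `context`, `collections`,
# `dimensions`, `dataset_type`, and `dataset_id` below are assumed to come
# from that machinery):
#
#     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#         db, context, collections=collections, dimensions=dimensions
#     )
#     storage, inserted = manager.register(dataset_type)  # idempotent
#     ref = manager.getDatasetRef(dataset_id)  # None if the id is unknown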