Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 93%
202 statements
coverage.py v6.5.0, created at 2022-10-21 02:02 -0700
from __future__ import annotations

__all__ = (
    "ByDimensionsDatasetRecordStorageManager",
    "ByDimensionsDatasetRecordStorageManagerUUID",
)

import logging
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Any

import sqlalchemy
from lsst.utils.ellipsis import Ellipsis

from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ...wildcards import DatasetTypeWildcard
from ._storage import (
    ByDimensionsDatasetRecordStorage,
    ByDimensionsDatasetRecordStorageInt,
    ByDimensionsDatasetRecordStorageUUID,
)
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)

_LOG = logging.getLogger(__name__)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of other
    design choices that would have been cumbersome (to say the least) to try
    to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that must be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of
        collections.
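
    Notes
    -----
    A minimal usage sketch (hypothetical: ``db``, ``context``,
    ``collections``, ``dimensions``, and ``datasetType`` are placeholders
    assumed to come from an existing `Registry`, which normally constructs
    and drives this manager itself)::

        manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
            db, context, collections=collections, dimensions=dimensions
        )
        storage, inserted = manager.register(datasetType)
        assert manager.find(datasetType.name) is storage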
94 """

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> VersionTuple | None:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(
        cls, collections: type[CollectionManager], universe: DimensionUniverse
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
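
        Examples
        --------
        A hedged sketch (``collections`` is assumed to be an existing
        `CollectionManager` instance and ``universe`` a `DimensionUniverse`)::

            specs = ByDimensionsDatasetRecordStorageManagerUUID.makeStaticTableSpecs(
                type(collections), universe=universe
            )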
158 """
159 return makeStaticTableSpecs(
160 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
161 )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
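        # Rebuild the in-memory dataset-type caches from scratch by scanning
        # the dataset_type table; the new maps are swapped in only at the very
        # end, so a failure partway through leaves the previous state intact.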
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[DatasetId, ByDimensionsDatasetRecordStorage] = {}
        c = self._static.dataset_type.columns
        for row in self._db.query(self._static.dataset_type.select()).mappings():
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(lambda dataset_type_id: self._byId[dataset_type_id].datasetType)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} cannot be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._byName.get(name)

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
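        # Get-or-create: reuse the cached storage object if this dataset type
        # is already known, creating the dynamic tag/calib tables and syncing
        # a row into the static dataset_type table otherwise.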
        if datasetType.isComponent():
            raise ValueError(
                f"Component dataset types cannot be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here: we want to create the tables first
            # and only register them if that operation succeeds.  We cannot
            # wrap this in a transaction because the database class assumes
            # that DDL is not transaction-safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)

    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> dict[DatasetType, list[str | None]]:
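        # Expand the expression into concrete parent dataset types, recording
        # the requested component names for each (None stands for the parent
        # dataset type itself).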
        wildcard = DatasetTypeWildcard.from_expression(expression)
        result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
        # This message can be transformed into an error on DM-36303 after v26,
        # and the components argument here (and in all callers) can be removed
        # entirely on DM-36457 after v27.
        deprecation_message = (
            "Querying for component datasets via Registry query methods is deprecated in favor of using "
            "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
            "after v26, and the components argument will be removed after v27."
        )
        for name, dataset_type in wildcard.values.items():
            parent_name, component_name = DatasetType.splitDatasetTypeName(name)
            if component_name is not None:
                warnings.warn(deprecation_message, FutureWarning)
            if (found_storage := self.find(parent_name)) is not None:
                found_parent = found_storage.datasetType
                if component_name is not None:
                    found = found_parent.makeComponentDatasetType(component_name)
                else:
                    found = found_parent
                if dataset_type is not None:
                    if dataset_type.is_compatible_with(found):
                        # Prefer the given dataset type to enable storage class
                        # conversions.
                        if component_name is not None:
                            found_parent = dataset_type.makeCompositeDatasetType()
                        else:
                            found_parent = dataset_type
                    else:
                        raise DatasetTypeError(
                            f"Dataset type definition in query expression {dataset_type} is "
                            f"not compatible with the registered type {found}."
                        )
                result[found_parent].add(component_name)
            elif missing is not None:
                missing.append(name)
        already_warned = False
        if wildcard.patterns is Ellipsis:
            if explicit_only:
                raise TypeError(
                    "Universal wildcard '...' is not permitted for dataset types in this context."
                )
            for storage in self._byName.values():
                result[storage.datasetType].add(None)
                if components:
                    try:
                        result[storage.datasetType].update(
                            storage.datasetType.storageClass.allComponents().keys()
                        )
                        if storage.datasetType.storageClass.allComponents() and not already_warned:
                            warnings.warn(deprecation_message, FutureWarning)
                            already_warned = True
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results.",
                        )
        elif wildcard.patterns:
            if explicit_only:
                # After v26 this should raise DatasetTypeExpressionError, to
                # be implemented on DM-36303.
                warnings.warn(
                    "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
                    FutureWarning,
                )
            for storage in self._byName.values():
                if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
                    result[storage.datasetType].add(None)
            if components is not False:
                for storage in self._byName.values():
                    if components is None and storage.datasetType in result:
                        continue
                    try:
                        components_for_parent = storage.datasetType.storageClass.allComponents().keys()
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
                        continue
                    for component_name in components_for_parent:
                        if any(
                            p.fullmatch(
                                DatasetType.nameWithComponent(storage.datasetType.name, component_name)
                            )
                            for p in wildcard.patterns
                        ):
                            result[storage.datasetType].add(component_name)
                            if not already_warned:
                                warnings.warn(deprecation_message, FutureWarning)
                                already_warned = True
        return {k: list(v) for k, v in result.items()}

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
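        # Look up the dataset row by primary key, then map its dataset_type_id
        # back to an in-memory storage object, refreshing the cache once if
        # that ID is unknown (e.g. the type was registered by another client
        # after our last refresh).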
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        row = self._db.query(sql).mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    def schemaDigest(self) -> str | None:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._static, self._db.dialect)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If `True` then the PK column of the dataset table is auto-increment."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


class ByDimensionsDatasetRecordStorageManager(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase which
    uses an auto-incremented integer for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_INT
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageInt
    _autoincrement: bool = True
    _idColumnType: type = sqlalchemy.BigInteger

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        # MyPy seems confused about enum value types here.
        return mode is mode.UNIQUE  # type: ignore


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase which
    uses UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
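

# A minimal sketch of how the two managers differ (hypothetical usage; a real
# `Registry` selects its dataset manager class via configuration):
#
#     from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum
#
#     ByDimensionsDatasetRecordStorageManager.supportsIdGenerationMode(
#         DatasetIdGenEnum.UNIQUE
#     )  # True: integer IDs support only the UNIQUE generation mode.
#     ByDimensionsDatasetRecordStorageManagerUUID.supportsIdGenerationMode(
#         DatasetIdGenEnum.UNIQUE
#     )  # True: UUID IDs support every generation mode.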