Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 95%
194 statements
coverage.py v6.5.0, created at 2023-03-30 02:32 -0700
1  from __future__ import annotations
3  __all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)
5  import logging
6  import warnings
7  from collections import defaultdict
8  from typing import TYPE_CHECKING, Any
10  import sqlalchemy
11  from lsst.utils.ellipsis import Ellipsis
13  from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
14  from ..._collection_summary import CollectionSummary
15  from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
16  from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
17  from ...wildcards import DatasetTypeWildcard
18  from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
19  from .summaries import CollectionSummaryManager
20  from .tables import (
21 addDatasetForeignKey,
22 makeCalibTableName,
23 makeCalibTableSpec,
24 makeStaticTableSpecs,
25 makeTagTableName,
26 makeTagTableSpec,
27  )
29  if TYPE_CHECKING:
30 from ...interfaces import (
31 CollectionManager,
32 CollectionRecord,
33 Database,
34 DimensionRecordStorageManager,
35 StaticTablesContext,
36 )
37 from .tables import StaticDatasetTablesTuple
40  # This has to be updated on every schema change
41  _VERSION_UUID = VersionTuple(1, 0, 0)
43  _LOG = logging.getLogger(__name__)
46  class MissingDatabaseTableError(RuntimeError):
47 """Exception raised when a table is not found in a database."""
50  class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
51 """A manager class for datasets that uses one dataset-collection table for
52 each group of dataset types that share the same dimensions.
54 In addition to the table organization, this class makes a number of
55 other design choices that would have been cumbersome (to say the least) to
56 try to pack into its name:
58 - It uses a private surrogate integer autoincrement field to identify
59 dataset types, instead of using the name as the primary and foreign key
60 directly.
62 - It aggressively loads all DatasetTypes into memory instead of fetching
63 them from the database only when needed or attempting more clever forms
64 of caching.
66 Alternative implementations that make different choices for these while
67 keeping the same general table organization might be reasonable as well.
69    This class provides a complete implementation of the manager logic, but it
70    is parametrized by a few class attributes that have to be defined by
71    subclasses.
73 Parameters
74 ----------
75 db : `Database`
76 Interface to the underlying database engine and namespace.
77 collections : `CollectionManager`
78 Manager object for the collections in this `Registry`.
79 dimensions : `DimensionRecordStorageManager`
80 Manager object for the dimensions in this `Registry`.
81 static : `StaticDatasetTablesTuple`
82 Named tuple of `sqlalchemy.schema.Table` instances for all static
83 tables used by this class.
84 summaries : `CollectionSummaryManager`
85        Structure containing tables that summarize the contents of collections.
    registry_schema_version : `VersionTuple` or `None`
        Version of the registry schema for this extension, or `None` if not
        provided.
86    """
88 def __init__(
89 self,
90 *,
91 db: Database,
92 collections: CollectionManager,
93 dimensions: DimensionRecordStorageManager,
94 static: StaticDatasetTablesTuple,
95 summaries: CollectionSummaryManager,
96 registry_schema_version: VersionTuple | None = None,
97 ):
98 super().__init__(registry_schema_version=registry_schema_version)
99 self._db = db
100 self._collections = collections
101 self._dimensions = dimensions
102 self._static = static
103 self._summaries = summaries
104 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
105 self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
107 @classmethod
108 def initialize(
109 cls,
110 db: Database,
111 context: StaticTablesContext,
112 *,
113 collections: CollectionManager,
114 dimensions: DimensionRecordStorageManager,
115 registry_schema_version: VersionTuple | None = None,
116 ) -> DatasetRecordStorageManager:
117 # Docstring inherited from DatasetRecordStorageManager.
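        # Declare the static dataset tables in the shared schema-construction
        # context and set up the collection summary tables, then build the
        # manager instance around them.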
118 specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
119 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore
120 summaries = CollectionSummaryManager.initialize(
121 db,
122 context,
123 collections=collections,
124 dimensions=dimensions,
125 )
126 return cls(
127 db=db,
128 collections=collections,
129 dimensions=dimensions,
130 static=static,
131 summaries=summaries,
132 registry_schema_version=registry_schema_version,
133 )
135 @classmethod
136 def currentVersions(cls) -> list[VersionTuple]:
137 # Docstring inherited from VersionedExtension.
138 return [cls._version]
140 @classmethod
141 def makeStaticTableSpecs(
142 cls, collections: type[CollectionManager], universe: DimensionUniverse
143 ) -> StaticDatasetTablesTuple:
144 """Construct all static tables used by the classes in this package.
146 Static tables are those that are present in all Registries and do not
147 depend on what DatasetTypes have been registered.
149 Parameters
150 ----------
151        collections : `type` [ `CollectionManager` ]
152            Manager class for the collections in this `Registry`.
153 universe : `DimensionUniverse`
154 Universe graph containing all dimensions known to this `Registry`.
156 Returns
157 -------
158 specs : `StaticDatasetTablesTuple`
159 A named tuple containing `ddl.TableSpec` instances.
160 """
161 return makeStaticTableSpecs(
162 collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
163 )
165 @classmethod
166 def getIdColumnType(cls) -> type:
167 # Docstring inherited from base class.
168 return cls._idColumnType
170 @classmethod
171 def addDatasetForeignKey(
172 cls,
173 tableSpec: ddl.TableSpec,
174 *,
175 name: str = "dataset",
176 constraint: bool = True,
177 onDelete: str | None = None,
178 **kwargs: Any,
179 ) -> ddl.FieldSpec:
180 # Docstring inherited from DatasetRecordStorageManager.
181 return addDatasetForeignKey(
182 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
183 )
185 def refresh(self) -> None:
186 # Docstring inherited from DatasetRecordStorageManager.
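        # Rebuild the in-memory caches of per-dataset-type storage objects by
        # reading every row of the static dataset_type table.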
187 byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
188 byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
189 dataset_types: dict[int, DatasetType] = {}
190 c = self._static.dataset_type.columns
191 with self._db.query(self._static.dataset_type.select()) as sql_result:
192 sql_rows = sql_result.mappings().fetchall()
193 for row in sql_rows:
194 name = row[c.name]
195 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
196 calibTableName = row[c.calibration_association_table]
197 datasetType = DatasetType(
198 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
199 )
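            # Each dataset type has its own dynamic tag table (and a calibration
            # table if it is a calibration dataset type); look them up and fail
            # loudly if the schema is missing either of them.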
200 tags = self._db.getExistingTable(
201 row[c.tag_association_table],
202 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
203 )
204            if tags is None:  # coverage: branch 204 ↛ 205 never taken (condition was never true)
205 raise MissingDatabaseTableError(
206 f"Table {row[c.tag_association_table]} is missing from database schema."
207 )
208 if calibTableName is not None:
209 calibs = self._db.getExistingTable(
210 row[c.calibration_association_table],
211 makeCalibTableSpec(
212 datasetType,
213 type(self._collections),
214 self._db.getTimespanRepresentation(),
215 self.getIdColumnType(),
216 ),
217 )
218                if calibs is None:  # coverage: branch 218 ↛ 219 never taken (condition was never true)
219 raise MissingDatabaseTableError(
220 f"Table {row[c.calibration_association_table]} is missing from database schema."
221 )
222 else:
223 calibs = None
224 storage = self._recordStorageType(
225 db=self._db,
226 datasetType=datasetType,
227 static=self._static,
228 summaries=self._summaries,
229 tags=tags,
230 calibs=calibs,
231 dataset_type_id=row["id"],
232 collections=self._collections,
233 )
234 byName[datasetType.name] = storage
235 byId[storage._dataset_type_id] = storage
236 dataset_types[row["id"]] = datasetType
237 self._byName = byName
238 self._byId = byId
239 self._summaries.refresh(dataset_types)
241 def remove(self, name: str) -> None:
242 # Docstring inherited from DatasetRecordStorageManager.
243 compositeName, componentName = DatasetType.splitDatasetTypeName(name)
244 if componentName is not None:
245 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")
247 # Delete the row
248 try:
249 self._db.delete(self._static.dataset_type, ["name"], {"name": name})
250 except sqlalchemy.exc.IntegrityError as e:
251 raise OrphanedRecordError(
252 f"Dataset type {name} can not be removed."
253 " It is associated with datasets that must be removed first."
254 ) from e
256 # Now refresh everything -- removal is rare enough that this does
257 # not need to be fast.
258 self.refresh()
260 def find(self, name: str) -> DatasetRecordStorage | None:
261 # Docstring inherited from DatasetRecordStorageManager.
262 return self._byName.get(name)
264 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
265 # Docstring inherited from DatasetRecordStorageManager.
266        if datasetType.isComponent():  # coverage: branch 266 ↛ 267 never taken (condition was never true)
267 raise ValueError(
268 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
269 )
270 storage = self._byName.get(datasetType.name)
271 if storage is None:
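            # Not registered yet: create (or ensure) the dynamic tag/calibration
            # tables for this dataset type, then record it in the static
            # dataset_type table.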
272 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
273 tagTableName = makeTagTableName(datasetType, dimensionsKey)
274 calibTableName = (
275 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
276 )
277            # The order is important here: we want to create the tables first
278            # and register them only if that operation succeeds. We cannot wrap
279            # this in a transaction because the Database class assumes that DDL
280            # is not transaction-safe in general.
281 tags = self._db.ensureTableExists(
282 tagTableName,
283 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
284 )
285 if calibTableName is not None:
286 calibs = self._db.ensureTableExists(
287 calibTableName,
288 makeCalibTableSpec(
289 datasetType,
290 type(self._collections),
291 self._db.getTimespanRepresentation(),
292 self.getIdColumnType(),
293 ),
294 )
295 else:
296 calibs = None
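            # sync() either inserts a new dataset_type row or checks that an
            # existing row has the same dimensions and storage class; `inserted`
            # reports which of the two happened.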
297 row, inserted = self._db.sync(
298 self._static.dataset_type,
299 keys={"name": datasetType.name},
300 compared={
301 "dimensions_key": dimensionsKey,
302 # Force the storage class to be loaded to ensure it
303 # exists and there is no typo in the name.
304 "storage_class": datasetType.storageClass.name,
305 },
306 extra={
307 "tag_association_table": tagTableName,
308 "calibration_association_table": calibTableName,
309 },
310 returning=["id", "tag_association_table"],
311 )
312 assert row is not None
313 storage = self._recordStorageType(
314 db=self._db,
315 datasetType=datasetType,
316 static=self._static,
317 summaries=self._summaries,
318 tags=tags,
319 calibs=calibs,
320 dataset_type_id=row["id"],
321 collections=self._collections,
322 )
323 self._byName[datasetType.name] = storage
324 self._byId[storage._dataset_type_id] = storage
325 else:
326 if datasetType != storage.datasetType:
327 raise ConflictingDefinitionError(
328 f"Given dataset type {datasetType} is inconsistent "
329 f"with database definition {storage.datasetType}."
330 )
331 inserted = False
332 return storage, bool(inserted)
334 def resolve_wildcard(
335 self,
336 expression: Any,
337 components: bool | None = None,
338 missing: list[str] | None = None,
339 explicit_only: bool = False,
340 components_deprecated: bool = True,
341 ) -> dict[DatasetType, list[str | None]]:
342 wildcard = DatasetTypeWildcard.from_expression(expression)
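        # Map each resolved parent dataset type to the set of component names
        # requested for it; `None` in the set stands for the parent itself.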
343 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
344 # This message can be transformed into an error on DM-36303 after v26,
345 # and the components and components_deprecated arguments can be merged
346 # into one on DM-36457 after v27.
347 deprecation_message = (
348 "Querying for component datasets via Registry query methods is deprecated in favor of using "
349 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
350 "after v26, and the components argument will be removed after v27."
351 )
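        # First resolve any dataset type names or instances given explicitly in
        # the expression.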
352 for name, dataset_type in wildcard.values.items():
353 parent_name, component_name = DatasetType.splitDatasetTypeName(name)
354 if component_name is not None and components_deprecated:
355 warnings.warn(deprecation_message, FutureWarning)
356 if (found_storage := self.find(parent_name)) is not None:
357 found_parent = found_storage.datasetType
358 if component_name is not None:
359 found = found_parent.makeComponentDatasetType(component_name)
360 else:
361 found = found_parent
362 if dataset_type is not None:
363                    if dataset_type.is_compatible_with(found):  # coverage: branch 363 ↛ 371 never taken (condition was never false)
364 # Prefer the given dataset type to enable storage class
365 # conversions.
366 if component_name is not None:
367 found_parent = dataset_type.makeCompositeDatasetType()
368 else:
369 found_parent = dataset_type
370 else:
371 raise DatasetTypeError(
372 f"Dataset type definition in query expression {dataset_type} is "
373 f"not compatible with the registered type {found}."
374 )
375 result[found_parent].add(component_name)
376 elif missing is not None:
377 missing.append(name)
378 already_warned = False
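        # A universal "..." wildcard matches every registered dataset type.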
379 if wildcard.patterns is Ellipsis:
380 if explicit_only:
381 raise TypeError(
382 "Universal wildcard '...' is not permitted for dataset types in this context."
383 )
384 for storage in self._byName.values():
385 result[storage.datasetType].add(None)
386 if components:
387 try:
388 result[storage.datasetType].update(
389 storage.datasetType.storageClass.allComponents().keys()
390 )
391 if (
392 storage.datasetType.storageClass.allComponents()
393 and not already_warned
394 and components_deprecated
395 ):
396 warnings.warn(deprecation_message, FutureWarning)
397 already_warned = True
398 except KeyError as err:
399 _LOG.warning(
400 f"Could not load storage class {err} for {storage.datasetType.name}; "
401 "if it has components they will not be included in query results.",
402 )
403 elif wildcard.patterns:
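            # Regular-expression patterns are matched against all registered
            # dataset type names and, when components are requested, against
            # their component names as well.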
404 if explicit_only:
405 # After v26 this should raise DatasetTypeExpressionError, to
406 # be implemented on DM-36303.
407 warnings.warn(
408 "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
409 FutureWarning,
410 )
411 for storage in self._byName.values():
412 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
413 result[storage.datasetType].add(None)
414 if components is not False:
415 for storage in self._byName.values():
416 if components is None and storage.datasetType in result:
417 continue
418 try:
419 components_for_parent = storage.datasetType.storageClass.allComponents().keys()
420 except KeyError as err:
421 _LOG.warning(
422 f"Could not load storage class {err} for {storage.datasetType.name}; "
423 "if it has components they will not be included in query results."
424 )
425 continue
426 for component_name in components_for_parent:
427 if any(
428 p.fullmatch(
429 DatasetType.nameWithComponent(storage.datasetType.name, component_name)
430 )
431 for p in wildcard.patterns
432 ):
433 result[storage.datasetType].add(component_name)
434 if not already_warned and components_deprecated:
435 warnings.warn(deprecation_message, FutureWarning)
436 already_warned = True
437 return {k: list(v) for k, v in result.items()}
439 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
440 # Docstring inherited from DatasetRecordStorageManager.
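        # Look up the dataset type id and run collection key for this dataset
        # UUID, then use the cached per-type storage to reconstruct the data ID.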
441 sql = (
442 sqlalchemy.sql.select(
443 self._static.dataset.columns.dataset_type_id,
444 self._static.dataset.columns[self._collections.getRunForeignKeyName()],
445 )
446 .select_from(self._static.dataset)
447 .where(self._static.dataset.columns.id == id)
448 )
449 with self._db.query(sql) as sql_result:
450 row = sql_result.mappings().fetchone()
451 if row is None:
452 return None
453 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
454        if recordsForType is None:  # coverage: branch 454 ↛ 455 never taken (condition was never true)
455 self.refresh()
456 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
457 assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
458 return DatasetRef(
459 recordsForType.datasetType,
460 dataId=recordsForType.getDataId(id=id),
461 id=id,
462 run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
463 )
465 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
466 # Docstring inherited from DatasetRecordStorageManager.
467 return self._summaries.get(collection)
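    # The class attributes below must be defined by concrete subclasses (see
    # ByDimensionsDatasetRecordStorageManagerUUID).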
469 _version: VersionTuple
470 """Schema version for this class."""
472 _recordStorageType: type[ByDimensionsDatasetRecordStorage]
473 """Type of the storage class returned by this manager."""
475 _autoincrement: bool
476    """If True, the primary-key column of the dataset table is auto-increment."""
478 _idColumnType: type
479 """Type of dataset column used to store dataset ID."""
482  class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
483    """Implementation of `ByDimensionsDatasetRecordStorageManagerBase` that uses
484    a UUID for the dataset primary key.
485 """
487 _version: VersionTuple = _VERSION_UUID
488 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
489 _autoincrement: bool = False
490 _idColumnType: type = ddl.GUID
492 @classmethod
493 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
494 # Docstring inherited from DatasetRecordStorageManager.
495 return True