Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 94%
197 statements
coverage.py v6.5.0, created at 2023-03-28 04:40 -0700

from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)

import logging
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Any

import sqlalchemy
from lsst.utils.ellipsis import Ellipsis

from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ...wildcards import DatasetTypeWildcard
from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:  # coverage: condition never true at runtime
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple


# This has to be updated on every schema change.
_VERSION_INT = VersionTuple(1, 0, 0)
_VERSION_UUID = VersionTuple(1, 0, 0)

_LOG = logging.getLogger(__name__)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that have to be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of collections.
    """
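
    # A minimal sketch of the table organization described above, assuming a
    # registered parent dataset type ``flat`` whose dimensions have already
    # been saved.  Variable names are illustrative; only ``makeTagTableName``
    # and ``makeCalibTableName`` are real helpers from this package:
    #
    #   dimensionsKey = dimensions.saveDimensionGraph(flat.dimensions)
    #   makeTagTableName(flat, dimensionsKey)    # tag table shared by all types with these dimensions
    #   makeCalibTableName(flat, dimensionsKey)  # calibration table, only if flat.isCalibration()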

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
    ):
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(type(collections), universe=dimensions.universe)
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(db=db, collections=collections, dimensions=dimensions, static=static, summaries=summaries)

    @classmethod
    def currentVersion(cls) -> VersionTuple | None:
        # Docstring inherited from VersionedExtension.
        return cls._version

    @classmethod
    def makeStaticTableSpecs(
        cls, collections: type[CollectionManager], universe: DimensionUniverse
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        return makeStaticTableSpecs(
            collections, universe=universe, dtype=cls.getIdColumnType(), autoincrement=cls._autoincrement
        )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
        dataset_types: dict[int, DatasetType] = {}
        c = self._static.dataset_type.columns
        with self._db.query(self._static.dataset_type.select()) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:  # coverage: condition never true in tests
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:  # coverage: condition never true in tests
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
            dataset_types[row["id"]] = datasetType
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(dataset_types)

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} can not be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._byName.get(name)

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():  # coverage: condition never true in tests
            raise ValueError(
                f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here: we want to create the tables first
            # and only register them if that operation succeeds.  We cannot
            # wrap this in a transaction because the database class assumes
            # that DDL is not transaction-safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)
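
    # A minimal usage sketch for ``register`` and ``find``, assuming
    # ``manager`` is an instance created by ``initialize`` and ``flat`` is a
    # parent ``DatasetType`` (names are illustrative):
    #
    #   storage, inserted = manager.register(flat)  # creates tag/calib tables if needed, inserted=True
    #   storage, inserted = manager.register(flat)  # identical definition: no-op, inserted=False
    #   manager.find("flat")                        # same storage object, or None if never registered
    #
    # Registering the same name with a conflicting definition raises
    # ``ConflictingDefinitionError`` instead.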

    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        wildcard = DatasetTypeWildcard.from_expression(expression)
        result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
        # This message can be transformed into an error on DM-36303 after v26,
        # and the components and components_deprecated arguments can be merged
        # into one on DM-36457 after v27.
        deprecation_message = (
            "Querying for component datasets via Registry query methods is deprecated in favor of using "
            "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
            "after v26, and the components argument will be removed after v27."
        )
        for name, dataset_type in wildcard.values.items():
            parent_name, component_name = DatasetType.splitDatasetTypeName(name)
            if component_name is not None and components_deprecated:
                warnings.warn(deprecation_message, FutureWarning)
            if (found_storage := self.find(parent_name)) is not None:
                found_parent = found_storage.datasetType
                if component_name is not None:
                    found = found_parent.makeComponentDatasetType(component_name)
                else:
                    found = found_parent
                if dataset_type is not None:
                    if dataset_type.is_compatible_with(found):  # coverage: condition never false in tests
                        # Prefer the given dataset type to enable storage class
                        # conversions.
                        if component_name is not None:
                            found_parent = dataset_type.makeCompositeDatasetType()
                        else:
                            found_parent = dataset_type
                    else:
                        raise DatasetTypeError(
                            f"Dataset type definition in query expression {dataset_type} is "
                            f"not compatible with the registered type {found}."
                        )
                result[found_parent].add(component_name)
            elif missing is not None:
                missing.append(name)
        already_warned = False
        if wildcard.patterns is Ellipsis:
            if explicit_only:
                raise TypeError(
                    "Universal wildcard '...' is not permitted for dataset types in this context."
                )
            for storage in self._byName.values():
                result[storage.datasetType].add(None)
                if components:
                    try:
                        result[storage.datasetType].update(
                            storage.datasetType.storageClass.allComponents().keys()
                        )
                        if (
                            storage.datasetType.storageClass.allComponents()
                            and not already_warned
                            and components_deprecated
                        ):
                            warnings.warn(deprecation_message, FutureWarning)
                            already_warned = True
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results.",
                        )
        elif wildcard.patterns:
            if explicit_only:
                # After v26 this should raise DatasetTypeExpressionError, to
                # be implemented on DM-36303.
                warnings.warn(
                    "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
                    FutureWarning,
                )
            for storage in self._byName.values():
                if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
                    result[storage.datasetType].add(None)
            if components is not False:
                for storage in self._byName.values():
                    if components is None and storage.datasetType in result:
                        continue
                    try:
                        components_for_parent = storage.datasetType.storageClass.allComponents().keys()
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
                        continue
                    for component_name in components_for_parent:
                        if any(
                            p.fullmatch(
                                DatasetType.nameWithComponent(storage.datasetType.name, component_name)
                            )
                            for p in wildcard.patterns
                        ):
                            result[storage.datasetType].add(component_name)
                            if not already_warned and components_deprecated:
                                warnings.warn(deprecation_message, FutureWarning)
                                already_warned = True
        return {k: list(v) for k, v in result.items()}
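
    # A sketch of the mapping returned by ``resolve_wildcard``: parent dataset
    # types map to the matched component names, with ``None`` standing for the
    # parent dataset type itself.  The dataset type names below are purely
    # illustrative:
    #
    #   manager.resolve_wildcard(..., components=True)
    #   # -> {DatasetType("calexp", ...): [None, "psf", "wcs"],
    #   #     DatasetType("flat", ...): [None]}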

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        with self._db.query(sql) as sql_result:
            row = sql_result.mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:  # coverage: condition never true in tests
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    _version: VersionTuple
    """Schema version for this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True then the PK column of the dataset table is auto-increment."""

    _idColumnType: type
    """Type of dataset column used to store dataset ID."""


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase which
    uses UUID for the dataset primary key.
    """

    _version: VersionTuple = _VERSION_UUID
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True
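
# A minimal wiring sketch, assuming ``db`` is a configured ``Database``,
# ``context`` is the ``StaticTablesContext`` it provides while declaring the
# static schema, and ``collections``/``dimensions`` are the corresponding
# managers (all illustrative names, not defined in this module):
#
#   manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#       db,
#       context,
#       collections=collections,
#       dimensions=dimensions,
#   )
#   manager.refresh()  # load every registered dataset type into memory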