Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 95%
205 statements
coverage.py v7.2.3, created at 2023-04-19 03:42 -0700
1from __future__ import annotations
3__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)
5import logging
6import warnings
7from collections import defaultdict
8from typing import TYPE_CHECKING, Any
10import sqlalchemy
11from lsst.utils.ellipsis import Ellipsis
13from ....core import DatasetId, DatasetRef, DatasetType, DimensionUniverse, ddl
14from ..._collection_summary import CollectionSummary
15from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
16from ...interfaces import DatasetIdGenEnum, DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
17from ...wildcards import DatasetTypeWildcard
18from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
19from .summaries import CollectionSummaryManager
20from .tables import (
21 addDatasetForeignKey,
22 makeCalibTableName,
23 makeCalibTableSpec,
24 makeStaticTableSpecs,
25 makeTagTableName,
26 makeTagTableSpec,
27)
29if TYPE_CHECKING:
30 from ...interfaces import (
31 CollectionManager,
32 CollectionRecord,
33 Database,
34 DimensionRecordStorageManager,
35 StaticTablesContext,
36 )
37 from .tables import StaticDatasetTablesTuple
40# This has to be updated on every schema change
41_VERSION_UUID = VersionTuple(1, 0, 0)
42# Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead
43# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of
44# the client migration period.
45_VERSION_UUID_NS = VersionTuple(2, 0, 0)
47_LOG = logging.getLogger(__name__)
50class MissingDatabaseTableError(RuntimeError):
51 """Exception raised when a table is not found in a database."""
54class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
55 """A manager class for datasets that uses one dataset-collection table for
56 each group of dataset types that share the same dimensions.
58 In addition to the table organization, this class makes a number of
59 other design choices that would have been cumbersome (to say the least) to
60 try to pack into its name:
62 - It uses a private surrogate integer autoincrement field to identify
63 dataset types, instead of using the name as the primary and foreign key
64 directly.
66 - It aggressively loads all DatasetTypes into memory instead of fetching
67 them from the database only when needed or attempting more clever forms
68 of caching.
70 Alternative implementations that make different choices for these while
71 keeping the same general table organization might be reasonable as well.
73    This class provides a complete implementation of the manager logic, but
74    it is parametrized by a few class attributes that must be defined by
75    subclasses.
77 Parameters
78 ----------
79 db : `Database`
80 Interface to the underlying database engine and namespace.
81 collections : `CollectionManager`
82 Manager object for the collections in this `Registry`.
83 dimensions : `DimensionRecordStorageManager`
84 Manager object for the dimensions in this `Registry`.
85 static : `StaticDatasetTablesTuple`
86 Named tuple of `sqlalchemy.schema.Table` instances for all static
87 tables used by this class.
88 summaries : `CollectionSummaryManager`
89 Structure containing tables that summarize the contents of collections.
90 """
92 def __init__(
93 self,
94 *,
95 db: Database,
96 collections: CollectionManager,
97 dimensions: DimensionRecordStorageManager,
98 static: StaticDatasetTablesTuple,
99 summaries: CollectionSummaryManager,
100 registry_schema_version: VersionTuple | None = None,
101 ):
102 super().__init__(registry_schema_version=registry_schema_version)
103 self._db = db
104 self._collections = collections
105 self._dimensions = dimensions
106 self._static = static
107 self._summaries = summaries
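        # Per-dataset-type storage caches, keyed by dataset type name and by the
        # surrogate integer id from the dataset_type table; refresh() rebuilds both.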
108 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
109 self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
111 @classmethod
112 def initialize(
113 cls,
114 db: Database,
115 context: StaticTablesContext,
116 *,
117 collections: CollectionManager,
118 dimensions: DimensionRecordStorageManager,
119 registry_schema_version: VersionTuple | None = None,
120 ) -> DatasetRecordStorageManager:
121 # Docstring inherited from DatasetRecordStorageManager.
122 specs = cls.makeStaticTableSpecs(
123 type(collections), universe=dimensions.universe, schema_version=registry_schema_version
124 )
125 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore
126 summaries = CollectionSummaryManager.initialize(
127 db,
128 context,
129 collections=collections,
130 dimensions=dimensions,
131 )
132 return cls(
133 db=db,
134 collections=collections,
135 dimensions=dimensions,
136 static=static,
137 summaries=summaries,
138 registry_schema_version=registry_schema_version,
139 )
141 @classmethod
142 def currentVersions(cls) -> list[VersionTuple]:
143 # Docstring inherited from VersionedExtension.
144 return cls._versions
146 @classmethod
147 def makeStaticTableSpecs(
148 cls,
149 collections: type[CollectionManager],
150 universe: DimensionUniverse,
151 schema_version: VersionTuple | None,
152 ) -> StaticDatasetTablesTuple:
153 """Construct all static tables used by the classes in this package.
155 Static tables are those that are present in all Registries and do not
156 depend on what DatasetTypes have been registered.
158 Parameters
159 ----------
160        collections : `type` [`CollectionManager`]
161            Type of the manager object for the collections in this `Registry`.
162 universe : `DimensionUniverse`
163 Universe graph containing all dimensions known to this `Registry`.
164 schema_version : `VersionTuple` or `None`
165            Version of the schema that should be created; if `None`, the
166            default schema version is used.
168 Returns
169 -------
170 specs : `StaticDatasetTablesTuple`
171 A named tuple containing `ddl.TableSpec` instances.
172 """
173 schema_version = cls.clsNewSchemaVersion(schema_version)
174 assert schema_version is not None, "New schema version cannot be None"
175 return makeStaticTableSpecs(
176 collections,
177 universe=universe,
178 dtype=cls.getIdColumnType(),
179 autoincrement=cls._autoincrement,
180 schema_version=schema_version,
181 )
183 @classmethod
184 def getIdColumnType(cls) -> type:
185 # Docstring inherited from base class.
186 return cls._idColumnType
188 @classmethod
189 def addDatasetForeignKey(
190 cls,
191 tableSpec: ddl.TableSpec,
192 *,
193 name: str = "dataset",
194 constraint: bool = True,
195 onDelete: str | None = None,
196 **kwargs: Any,
197 ) -> ddl.FieldSpec:
198 # Docstring inherited from DatasetRecordStorageManager.
199 return addDatasetForeignKey(
200 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
201 )
203 def refresh(self) -> None:
204 # Docstring inherited from DatasetRecordStorageManager.
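        # Rebuild the in-memory caches from scratch: read every row of the static
        # dataset_type table and re-create a storage object (plus handles to its
        # dynamically named tag/calib tables) for each registered dataset type.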
205 byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
206 byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
207 dataset_types: dict[int, DatasetType] = {}
208 c = self._static.dataset_type.columns
209 with self._db.query(self._static.dataset_type.select()) as sql_result:
210 sql_rows = sql_result.mappings().fetchall()
211 for row in sql_rows:
212 name = row[c.name]
213 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
214 calibTableName = row[c.calibration_association_table]
215 datasetType = DatasetType(
216 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
217 )
218 tags = self._db.getExistingTable(
219 row[c.tag_association_table],
220 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
221 )
222            if tags is None:  # coverage: 222 ↛ 223, condition was never true (branch not taken)
223 raise MissingDatabaseTableError(
224 f"Table {row[c.tag_association_table]} is missing from database schema."
225 )
226 if calibTableName is not None:
227 calibs = self._db.getExistingTable(
228 row[c.calibration_association_table],
229 makeCalibTableSpec(
230 datasetType,
231 type(self._collections),
232 self._db.getTimespanRepresentation(),
233 self.getIdColumnType(),
234 ),
235 )
236                if calibs is None:  # coverage: 236 ↛ 237, condition was never true (branch not taken)
237 raise MissingDatabaseTableError(
238 f"Table {row[c.calibration_association_table]} is missing from database schema."
239 )
240 else:
241 calibs = None
242 storage = self._recordStorageType(
243 db=self._db,
244 datasetType=datasetType,
245 static=self._static,
246 summaries=self._summaries,
247 tags=tags,
248 calibs=calibs,
249 dataset_type_id=row["id"],
250 collections=self._collections,
251 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
252 )
253 byName[datasetType.name] = storage
254 byId[storage._dataset_type_id] = storage
255 dataset_types[row["id"]] = datasetType
256 self._byName = byName
257 self._byId = byId
258 self._summaries.refresh(dataset_types)
260 def remove(self, name: str) -> None:
261 # Docstring inherited from DatasetRecordStorageManager.
262 compositeName, componentName = DatasetType.splitDatasetTypeName(name)
263 if componentName is not None:
264 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")
266 # Delete the row
267 try:
268 self._db.delete(self._static.dataset_type, ["name"], {"name": name})
269 except sqlalchemy.exc.IntegrityError as e:
270 raise OrphanedRecordError(
271 f"Dataset type {name} can not be removed."
272 " It is associated with datasets that must be removed first."
273 ) from e
275 # Now refresh everything -- removal is rare enough that this does
276 # not need to be fast.
277 self.refresh()
279 def find(self, name: str) -> DatasetRecordStorage | None:
280 # Docstring inherited from DatasetRecordStorageManager.
281 return self._byName.get(name)
283 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
284 # Docstring inherited from DatasetRecordStorageManager.
285        if datasetType.isComponent():  # coverage: 285 ↛ 286, condition was never true (branch not taken)
286 raise ValueError(
287 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
288 )
289 storage = self._byName.get(datasetType.name)
290 if storage is None:
291 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
292 tagTableName = makeTagTableName(datasetType, dimensionsKey)
293 calibTableName = (
294 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
295 )
296            # The order is important here: we want to create the tables first
297            # and only register them if that operation succeeds. We cannot wrap
298            # this in a transaction because the database class assumes that DDL
299            # is not transaction-safe in general.
300 tags = self._db.ensureTableExists(
301 tagTableName,
302 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
303 )
304 if calibTableName is not None:
305 calibs = self._db.ensureTableExists(
306 calibTableName,
307 makeCalibTableSpec(
308 datasetType,
309 type(self._collections),
310 self._db.getTimespanRepresentation(),
311 self.getIdColumnType(),
312 ),
313 )
314 else:
315 calibs = None
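            # Database.sync() is, roughly speaking, insert-or-verify: it inserts the
            # row if it is absent, otherwise checks that the ``compared`` values match
            # what is stored, and reports whether an insert actually happened.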
316 row, inserted = self._db.sync(
317 self._static.dataset_type,
318 keys={"name": datasetType.name},
319 compared={
320 "dimensions_key": dimensionsKey,
321 # Force the storage class to be loaded to ensure it
322 # exists and there is no typo in the name.
323 "storage_class": datasetType.storageClass.name,
324 },
325 extra={
326 "tag_association_table": tagTableName,
327 "calibration_association_table": calibTableName,
328 },
329 returning=["id", "tag_association_table"],
330 )
331 assert row is not None
332 storage = self._recordStorageType(
333 db=self._db,
334 datasetType=datasetType,
335 static=self._static,
336 summaries=self._summaries,
337 tags=tags,
338 calibs=calibs,
339 dataset_type_id=row["id"],
340 collections=self._collections,
341 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
342 )
343 self._byName[datasetType.name] = storage
344 self._byId[storage._dataset_type_id] = storage
345 else:
346 if datasetType != storage.datasetType:
347 raise ConflictingDefinitionError(
348 f"Given dataset type {datasetType} is inconsistent "
349 f"with database definition {storage.datasetType}."
350 )
351 inserted = False
352 return storage, bool(inserted)
354 def resolve_wildcard(
355 self,
356 expression: Any,
357 components: bool | None = None,
358 missing: list[str] | None = None,
359 explicit_only: bool = False,
360 components_deprecated: bool = True,
361 ) -> dict[DatasetType, list[str | None]]:
362 wildcard = DatasetTypeWildcard.from_expression(expression)
363 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
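        # Accumulate, per resolved parent dataset type, the set of requested
        # component names; an entry of `None` stands for the parent type itself.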
364 # This message can be transformed into an error on DM-36303 after v26,
365 # and the components and components_deprecated arguments can be merged
366 # into one on DM-36457 after v27.
367 deprecation_message = (
368 "Querying for component datasets via Registry query methods is deprecated in favor of using "
369 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
370 "after v26, and the components argument will be removed after v27."
371 )
372 for name, dataset_type in wildcard.values.items():
373 parent_name, component_name = DatasetType.splitDatasetTypeName(name)
374 if component_name is not None and components_deprecated:
375 warnings.warn(deprecation_message, FutureWarning)
376 if (found_storage := self.find(parent_name)) is not None:
377 found_parent = found_storage.datasetType
378 if component_name is not None:
379 found = found_parent.makeComponentDatasetType(component_name)
380 else:
381 found = found_parent
382 if dataset_type is not None:
383                    if dataset_type.is_compatible_with(found):  # coverage: 383 ↛ 391, condition was never false (else branch not taken)
384 # Prefer the given dataset type to enable storage class
385 # conversions.
386 if component_name is not None:
387 found_parent = dataset_type.makeCompositeDatasetType()
388 else:
389 found_parent = dataset_type
390 else:
391 raise DatasetTypeError(
392 f"Dataset type definition in query expression {dataset_type} is "
393 f"not compatible with the registered type {found}."
394 )
395 result[found_parent].add(component_name)
396 elif missing is not None:
397 missing.append(name)
398 already_warned = False
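        # ``wildcard.patterns`` is ``Ellipsis`` when the expression was the universal
        # '...' wildcard; otherwise it holds regular-expression patterns to match
        # against the registered dataset type names.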
399 if wildcard.patterns is Ellipsis:
400 if explicit_only:
401 raise TypeError(
402 "Universal wildcard '...' is not permitted for dataset types in this context."
403 )
404 for storage in self._byName.values():
405 result[storage.datasetType].add(None)
406 if components:
407 try:
408 result[storage.datasetType].update(
409 storage.datasetType.storageClass.allComponents().keys()
410 )
411 if (
412 storage.datasetType.storageClass.allComponents()
413 and not already_warned
414 and components_deprecated
415 ):
416 warnings.warn(deprecation_message, FutureWarning)
417 already_warned = True
418 except KeyError as err:
419 _LOG.warning(
420 f"Could not load storage class {err} for {storage.datasetType.name}; "
421 "if it has components they will not be included in query results.",
422 )
423 elif wildcard.patterns:
424 if explicit_only:
425 # After v26 this should raise DatasetTypeExpressionError, to
426 # be implemented on DM-36303.
427 warnings.warn(
428 "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
429 FutureWarning,
430 )
431 for storage in self._byName.values():
432 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
433 result[storage.datasetType].add(None)
434 if components is not False:
435 for storage in self._byName.values():
436 if components is None and storage.datasetType in result:
437 continue
438 try:
439 components_for_parent = storage.datasetType.storageClass.allComponents().keys()
440 except KeyError as err:
441 _LOG.warning(
442 f"Could not load storage class {err} for {storage.datasetType.name}; "
443 "if it has components they will not be included in query results."
444 )
445 continue
446 for component_name in components_for_parent:
447 if any(
448 p.fullmatch(
449 DatasetType.nameWithComponent(storage.datasetType.name, component_name)
450 )
451 for p in wildcard.patterns
452 ):
453 result[storage.datasetType].add(component_name)
454 if not already_warned and components_deprecated:
455 warnings.warn(deprecation_message, FutureWarning)
456 already_warned = True
457 return {k: list(v) for k, v in result.items()}
459 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
460 # Docstring inherited from DatasetRecordStorageManager.
461 sql = (
462 sqlalchemy.sql.select(
463 self._static.dataset.columns.dataset_type_id,
464 self._static.dataset.columns[self._collections.getRunForeignKeyName()],
465 )
466 .select_from(self._static.dataset)
467 .where(self._static.dataset.columns.id == id)
468 )
469 with self._db.query(sql) as sql_result:
470 row = sql_result.mappings().fetchone()
471 if row is None:
472 return None
473 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
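        # A cache miss here means another client registered this dataset type after
        # our last refresh; reload the cache and look again (the foreign key makes
        # the second lookup guaranteed to succeed).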
474        if recordsForType is None:  # coverage: 474 ↛ 475, condition was never true (branch not taken)
475 self.refresh()
476 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
477 assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
478 return DatasetRef(
479 recordsForType.datasetType,
480 dataId=recordsForType.getDataId(id=id),
481 id=id,
482 run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
483 )
485 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
486 # Docstring inherited from DatasetRecordStorageManager.
487 return self._summaries.get(collection)
489 _versions: list[VersionTuple]
490    """Schema versions supported by this class."""
492 _recordStorageType: type[ByDimensionsDatasetRecordStorage]
493 """Type of the storage class returned by this manager."""
495 _autoincrement: bool
496 """If True then PK column of the dataset table is auto-increment."""
498 _idColumnType: type
499 """Type of dataset column used to store dataset ID."""
502class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
503    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that uses
504    a UUID for the dataset primary key.
505    """
507 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS]
508 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
509 _autoincrement: bool = False
510 _idColumnType: type = ddl.GUID
512 @classmethod
513 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
514 # Docstring inherited from DatasetRecordStorageManager.
515 return True
517 @classmethod
518 def _newDefaultSchemaVersion(cls) -> VersionTuple:
519 # Docstring inherited from VersionedExtension.
521 # By default return 1.0.0 so that older clients can still access new
522 # registries created with a default config.
523 return _VERSION_UUID
525 def ingest_date_dtype(self) -> type:
526        """Return the type of the ``ingest_date`` column."""
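        # Schema 1.x stores ``ingest_date`` as a plain TIMESTAMP column; 2.x (see
        # ``_VERSION_UUID_NS`` above) stores it as TAI nanoseconds, hence the
        # ``ddl.AstropyTimeNsecTai`` type.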
527 schema_version = self.newSchemaVersion()
528 if schema_version is not None and schema_version.major > 1:
529 return ddl.AstropyTimeNsecTai
530 else:
531 return sqlalchemy.TIMESTAMP