Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 96% of 205 statements
(coverage.py v7.2.7, created at 2023-07-21 09:54 +0000)
from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)

import logging
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Any

import sqlalchemy
from lsst.utils.introspection import find_outside_stacklevel

from ....core import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType, DimensionUniverse, ddl
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
from ...wildcards import DatasetTypeWildcard
from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
from .summaries import CollectionSummaryManager
from .tables import (
    addDatasetForeignKey,
    makeCalibTableName,
    makeCalibTableSpec,
    makeStaticTableSpecs,
    makeTagTableName,
    makeTagTableSpec,
)

if TYPE_CHECKING:
    from ...interfaces import (
        CollectionManager,
        CollectionRecord,
        Database,
        DimensionRecordStorageManager,
        StaticTablesContext,
    )
    from .tables import StaticDatasetTablesTuple

# This has to be updated on every schema change.
_VERSION_UUID = VersionTuple(1, 0, 0)
# Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead
# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of
# the client migration period.
_VERSION_UUID_NS = VersionTuple(2, 0, 0)

_LOG = logging.getLogger(__name__)


class MissingDatabaseTableError(RuntimeError):
    """Exception raised when a table is not found in a database."""


class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
    """A manager class for datasets that uses one dataset-collection table for
    each group of dataset types that share the same dimensions.

    In addition to the table organization, this class makes a number of
    other design choices that would have been cumbersome (to say the least) to
    try to pack into its name:

    - It uses a private surrogate integer autoincrement field to identify
      dataset types, instead of using the name as the primary and foreign key
      directly.

    - It aggressively loads all DatasetTypes into memory instead of fetching
      them from the database only when needed or attempting more clever forms
      of caching.

    Alternative implementations that make different choices for these while
    keeping the same general table organization might be reasonable as well.

    This class provides a complete implementation of the manager logic, but it
    is parametrized by a few class attributes that have to be defined by
    subclasses.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    static : `StaticDatasetTablesTuple`
        Named tuple of `sqlalchemy.schema.Table` instances for all static
        tables used by this class.
    summaries : `CollectionSummaryManager`
        Structure containing tables that summarize the contents of collections.
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of the registry schema being used, or `None` for the default.
    """

    def __init__(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(registry_schema_version=registry_schema_version)
        self._db = db
        self._collections = collections
        self._dimensions = dimensions
        self._static = static
        self._summaries = summaries
        self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        # Docstring inherited from DatasetRecordStorageManager.
        specs = cls.makeStaticTableSpecs(
            type(collections), universe=dimensions.universe, schema_version=registry_schema_version
        )
        static: StaticDatasetTablesTuple = context.addTableTuple(specs)  # type: ignore
        summaries = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            static=static,
            summaries=summaries,
            registry_schema_version=registry_schema_version,
        )

    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return cls._versions

    @classmethod
    def makeStaticTableSpecs(
        cls,
        collections: type[CollectionManager],
        universe: DimensionUniverse,
        schema_version: VersionTuple | None,
    ) -> StaticDatasetTablesTuple:
        """Construct all static tables used by the classes in this package.

        Static tables are those that are present in all Registries and do not
        depend on what DatasetTypes have been registered.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        universe : `DimensionUniverse`
            Universe graph containing all dimensions known to this `Registry`.
        schema_version : `VersionTuple` or `None`
            Version of the schema that should be created; if `None` the
            default schema version is used.

        Returns
        -------
        specs : `StaticDatasetTablesTuple`
            A named tuple containing `ddl.TableSpec` instances.
        """
        schema_version = cls.clsNewSchemaVersion(schema_version)
        assert schema_version is not None, "New schema version cannot be None"
        return makeStaticTableSpecs(
            collections,
            universe=universe,
            dtype=cls.getIdColumnType(),
            autoincrement=cls._autoincrement,
            schema_version=schema_version,
        )

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited from base class.
        return cls._idColumnType

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited from DatasetRecordStorageManager.
        return addDatasetForeignKey(
            tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
        )

    def refresh(self) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
        byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
        dataset_types: dict[int, DatasetType] = {}
        c = self._static.dataset_type.columns
        with self._db.query(self._static.dataset_type.select()) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            name = row[c.name]
            dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
            calibTableName = row[c.calibration_association_table]
            datasetType = DatasetType(
                name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
            )
            tags = self._db.getExistingTable(
                row[c.tag_association_table],
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if tags is None:  # coverage: condition never true in tests
                raise MissingDatabaseTableError(
                    f"Table {row[c.tag_association_table]} is missing from database schema."
                )
            if calibTableName is not None:
                calibs = self._db.getExistingTable(
                    row[c.calibration_association_table],
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
                if calibs is None:  # coverage: condition never true in tests
                    raise MissingDatabaseTableError(
                        f"Table {row[c.calibration_association_table]} is missing from database schema."
                    )
            else:
                calibs = None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
                use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
            )
            byName[datasetType.name] = storage
            byId[storage._dataset_type_id] = storage
            dataset_types[row["id"]] = datasetType
        self._byName = byName
        self._byId = byId
        self._summaries.refresh(dataset_types)
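
    # A hedged sketch of what one ``dataset_type`` row turns into during
    # ``refresh`` (the values below are invented for illustration only):
    #
    #     row = {"id": 7, "name": "calexp", "dimensions_key": 3,
    #            "storage_class": "ExposureF",
    #            "tag_association_table": "<tags table name>",
    #            "calibration_association_table": None}
    #     # -> DatasetType("calexp", <dimensions for key 3>, "ExposureF",
    #     #                isCalibration=False)
    #     # -> a per-type storage object cached under both the type name
    #     #    (``_byName``) and the surrogate id (``_byId``)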

    def remove(self, name: str) -> None:
        # Docstring inherited from DatasetRecordStorageManager.
        compositeName, componentName = DatasetType.splitDatasetTypeName(name)
        if componentName is not None:
            raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")

        # Delete the row.
        try:
            self._db.delete(self._static.dataset_type, ["name"], {"name": name})
        except sqlalchemy.exc.IntegrityError as e:
            raise OrphanedRecordError(
                f"Dataset type {name} can not be removed."
                " It is associated with datasets that must be removed first."
            ) from e

        # Now refresh everything -- removal is rare enough that this does
        # not need to be fast.
        self.refresh()

    def find(self, name: str) -> DatasetRecordStorage | None:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._byName.get(name)

    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        # Docstring inherited from DatasetRecordStorageManager.
        if datasetType.isComponent():  # coverage: condition never true in tests
            raise ValueError(
                f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
            )
        storage = self._byName.get(datasetType.name)
        if storage is None:
            dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
            tagTableName = makeTagTableName(datasetType, dimensionsKey)
            calibTableName = (
                makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
            )
            # The order is important here: we want to create the tables first
            # and only register them if that operation is successful. We
            # cannot wrap this in a transaction because the database class
            # assumes that DDL is not transaction-safe in general.
            tags = self._db.ensureTableExists(
                tagTableName,
                makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
            )
            if calibTableName is not None:
                calibs = self._db.ensureTableExists(
                    calibTableName,
                    makeCalibTableSpec(
                        datasetType,
                        type(self._collections),
                        self._db.getTimespanRepresentation(),
                        self.getIdColumnType(),
                    ),
                )
            else:
                calibs = None
            row, inserted = self._db.sync(
                self._static.dataset_type,
                keys={"name": datasetType.name},
                compared={
                    "dimensions_key": dimensionsKey,
                    # Force the storage class to be loaded to ensure it
                    # exists and there is no typo in the name.
                    "storage_class": datasetType.storageClass.name,
                },
                extra={
                    "tag_association_table": tagTableName,
                    "calibration_association_table": calibTableName,
                },
                returning=["id", "tag_association_table"],
            )
            assert row is not None
            storage = self._recordStorageType(
                db=self._db,
                datasetType=datasetType,
                static=self._static,
                summaries=self._summaries,
                tags=tags,
                calibs=calibs,
                dataset_type_id=row["id"],
                collections=self._collections,
                use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
            )
            self._byName[datasetType.name] = storage
            self._byId[storage._dataset_type_id] = storage
        else:
            if datasetType != storage.datasetType:
                raise ConflictingDefinitionError(
                    f"Given dataset type {datasetType} is inconsistent "
                    f"with database definition {storage.datasetType}."
                )
            inserted = False
        return storage, bool(inserted)
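
    # A minimal usage sketch for ``register`` (``manager`` and
    # ``dataset_type`` are hypothetical stand-ins for a constructed manager
    # and a parent ``DatasetType``):
    #
    #     storage, inserted = manager.register(dataset_type)
    #     # Registering the identical definition again returns the cached
    #     # storage with inserted=False; a different definition under the
    #     # same name raises ConflictingDefinitionError instead.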

    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        wildcard = DatasetTypeWildcard.from_expression(expression)
        result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
        # This message can be transformed into an error on DM-36303 after v26,
        # and the components and components_deprecated arguments can be merged
        # into one on DM-36457 after v27.
        deprecation_message = (
            "Querying for component datasets via Registry query methods is deprecated in favor of using "
            "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
            "after v26, and the components argument will be removed after v27."
        )
        for name, dataset_type in wildcard.values.items():
            parent_name, component_name = DatasetType.splitDatasetTypeName(name)
            if component_name is not None and components_deprecated:
                warnings.warn(
                    deprecation_message, FutureWarning, stacklevel=find_outside_stacklevel("lsst.daf.butler")
                )
            if (found_storage := self.find(parent_name)) is not None:
                found_parent = found_storage.datasetType
                if component_name is not None:
                    found = found_parent.makeComponentDatasetType(component_name)
                else:
                    found = found_parent
                if dataset_type is not None:
                    if dataset_type.is_compatible_with(found):  # coverage: condition never false in tests
                        # Prefer the given dataset type to enable storage class
                        # conversions.
                        if component_name is not None:
                            found_parent = dataset_type.makeCompositeDatasetType()
                        else:
                            found_parent = dataset_type
                    else:
                        raise DatasetTypeError(
                            f"Dataset type definition in query expression {dataset_type} is "
                            f"not compatible with the registered type {found}."
                        )
                result[found_parent].add(component_name)
            elif missing is not None:
                missing.append(name)
        already_warned = False
        if wildcard.patterns is ...:
            if explicit_only:
                raise TypeError(
                    "Universal wildcard '...' is not permitted for dataset types in this context."
                )
            for storage in self._byName.values():
                result[storage.datasetType].add(None)
                if components:
                    try:
                        result[storage.datasetType].update(
                            storage.datasetType.storageClass.allComponents().keys()
                        )
                        if (
                            storage.datasetType.storageClass.allComponents()
                            and not already_warned
                            and components_deprecated
                        ):
                            warnings.warn(
                                deprecation_message,
                                FutureWarning,
                                stacklevel=find_outside_stacklevel("lsst.daf.butler"),
                            )
                            already_warned = True
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results.",
                        )
        elif wildcard.patterns:
            if explicit_only:
                # After v26 this should raise DatasetTypeExpressionError, to
                # be implemented on DM-36303.
                warnings.warn(
                    "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
                    FutureWarning,
                    stacklevel=find_outside_stacklevel("lsst.daf.butler"),
                )
            for storage in self._byName.values():
                if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
                    result[storage.datasetType].add(None)
            if components is not False:
                for storage in self._byName.values():
                    if components is None and storage.datasetType in result:
                        continue
                    try:
                        components_for_parent = storage.datasetType.storageClass.allComponents().keys()
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {storage.datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
                        continue
                    for component_name in components_for_parent:
                        if any(
                            p.fullmatch(
                                DatasetType.nameWithComponent(storage.datasetType.name, component_name)
                            )
                            for p in wildcard.patterns
                        ):
                            result[storage.datasetType].add(component_name)
                            if not already_warned and components_deprecated:
                                warnings.warn(
                                    deprecation_message,
                                    FutureWarning,
                                    stacklevel=find_outside_stacklevel("lsst.daf.butler"),
                                )
                                already_warned = True
        return {k: list(v) for k, v in result.items()}
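
    # A hedged example of the shape of ``resolve_wildcard`` results (names
    # are illustrative): keys are parent dataset types, values list the
    # requested components with ``None`` standing in for the parent itself,
    # so a query for ["calexp", "calexp.wcs"] would resolve to something like
    #
    #     {DatasetType("calexp", ...): [None, "wcs"]}
    #
    # (the component lookup also emits the deprecation FutureWarning above).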

    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        # Docstring inherited from DatasetRecordStorageManager.
        sql = (
            sqlalchemy.sql.select(
                self._static.dataset.columns.dataset_type_id,
                self._static.dataset.columns[self._collections.getRunForeignKeyName()],
            )
            .select_from(self._static.dataset)
            .where(self._static.dataset.columns.id == id)
        )
        with self._db.query(sql) as sql_result:
            row = sql_result.mappings().fetchone()
        if row is None:
            return None
        recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
        if recordsForType is None:  # coverage: condition never true in tests
            self.refresh()
            recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
            assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
        return DatasetRef(
            recordsForType.datasetType,
            dataId=recordsForType.getDataId(id=id),
            id=id,
            run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
        )

    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        # Docstring inherited from DatasetRecordStorageManager.
        return self._summaries.get(collection)

    _versions: list[VersionTuple]
    """Schema versions supported by this class."""

    _recordStorageType: type[ByDimensionsDatasetRecordStorage]
    """Type of the storage class returned by this manager."""

    _autoincrement: bool
    """If True then the PK column of the dataset table is auto-increment."""

    _idColumnType: type
    """Type of the dataset column used to store the dataset ID."""


class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
    """Implementation of ByDimensionsDatasetRecordStorageManagerBase that
    uses a UUID for the dataset primary key.
    """

    _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS]
    _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
    _autoincrement: bool = False
    _idColumnType: type = ddl.GUID

    @classmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from DatasetRecordStorageManager.
        return True

    @classmethod
    def _newDefaultSchemaVersion(cls) -> VersionTuple:
        # Docstring inherited from VersionedExtension.

        # By default return 1.0.0 so that older clients can still access new
        # registries created with a default config.
        return _VERSION_UUID

    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        schema_version = self.newSchemaVersion()
        if schema_version is not None and schema_version.major > 1:
            return ddl.AstropyTimeNsecTai
        else:
            return sqlalchemy.TIMESTAMP
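

# A hedged end-to-end sketch (everything below is illustrative and not part
# of this module's API); the manager is normally driven by higher-level
# Registry code, roughly along these lines:
#
#     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
#         db, context, collections=collections, dimensions=dimensions
#     )
#     manager.refresh()                       # load all dataset types into memory
#     storage, _ = manager.register(dataset_type)
#     ref = manager.getDatasetRef(some_uuid)  # None if the UUID is unknown
#
# With the default 1.0.0 schema ``ingest_date_dtype`` returns
# ``sqlalchemy.TIMESTAMP``; with a 2.x schema it returns
# ``ddl.AstropyTimeNsecTai`` (nanosecond-precision TAI times).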