Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 95%
207 statements
coverage.py v7.3.2, created at 2023-10-27 09:43 +0000
1from __future__ import annotations
3from .... import ddl
5__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)
7import logging
8import warnings
9from collections import defaultdict
10from typing import TYPE_CHECKING, Any
12import sqlalchemy
13from lsst.utils.introspection import find_outside_stacklevel
15from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType
16from ....dimensions import DimensionUniverse
17from ..._collection_summary import CollectionSummary
18from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
19from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
20from ...wildcards import DatasetTypeWildcard
21from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
22from .summaries import CollectionSummaryManager
23from .tables import (
24 addDatasetForeignKey,
25 makeCalibTableName,
26 makeCalibTableSpec,
27 makeStaticTableSpecs,
28 makeTagTableName,
29 makeTagTableSpec,
30)
32if TYPE_CHECKING:
33 from ...interfaces import (
34 CollectionManager,
35 CollectionRecord,
36 Database,
37 DimensionRecordStorageManager,
38 StaticTablesContext,
39 )
40 from .tables import StaticDatasetTablesTuple
43# This has to be updated on every schema change
44_VERSION_UUID = VersionTuple(1, 0, 0)
45# Starting with 2.0.0 the `ingest_date` column type uses nanoseconds instead
46# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of
47# the client migration period.
48_VERSION_UUID_NS = VersionTuple(2, 0, 0)
50_LOG = logging.getLogger(__name__)
53class MissingDatabaseTableError(RuntimeError):
54 """Exception raised when a table is not found in a database."""
57class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
58 """A manager class for datasets that uses one dataset-collection table for
59 each group of dataset types that share the same dimensions.
61 In addition to the table organization, this class makes a number of
62 other design choices that would have been cumbersome (to say the least) to
63 try to pack into its name:
65 - It uses a private surrogate integer autoincrement field to identify
66 dataset types, instead of using the name as the primary and foreign key
67 directly.
69 - It aggressively loads all DatasetTypes into memory instead of fetching
70 them from the database only when needed or attempting more clever forms
71 of caching.
73 Alternative implementations that make different choices for these while
74 keeping the same general table organization might be reasonable as well.
76 This class provides a complete implementation of the manager logic, but it
77 is parametrized by a few class attributes that have to be defined by
78 sub-classes.
80 Parameters
81 ----------
82 db : `Database`
83 Interface to the underlying database engine and namespace.
84 collections : `CollectionManager`
85 Manager object for the collections in this `Registry`.
86 dimensions : `DimensionRecordStorageManager`
87 Manager object for the dimensions in this `Registry`.
88 static : `StaticDatasetTablesTuple`
89 Named tuple of `sqlalchemy.schema.Table` instances for all static
90 tables used by this class.
91 summaries : `CollectionSummaryManager`
92 Structure containing tables that summarize the contents of collections.
93 """
95 def __init__(
96 self,
97 *,
98 db: Database,
99 collections: CollectionManager,
100 dimensions: DimensionRecordStorageManager,
101 static: StaticDatasetTablesTuple,
102 summaries: CollectionSummaryManager,
103 registry_schema_version: VersionTuple | None = None,
104 ):
105 super().__init__(registry_schema_version=registry_schema_version)
106 self._db = db
107 self._collections = collections
108 self._dimensions = dimensions
109 self._static = static
110 self._summaries = summaries
111 self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
112 self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
114 @classmethod
115 def initialize(
116 cls,
117 db: Database,
118 context: StaticTablesContext,
119 *,
120 collections: CollectionManager,
121 dimensions: DimensionRecordStorageManager,
122 registry_schema_version: VersionTuple | None = None,
123 ) -> DatasetRecordStorageManager:
124 # Docstring inherited from DatasetRecordStorageManager.
125 specs = cls.makeStaticTableSpecs(
126 type(collections), universe=dimensions.universe, schema_version=registry_schema_version
127 )
128 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore
129 summaries = CollectionSummaryManager.initialize(
130 db,
131 context,
132 collections=collections,
133 dimensions=dimensions,
134 )
135 return cls(
136 db=db,
137 collections=collections,
138 dimensions=dimensions,
139 static=static,
140 summaries=summaries,
141 registry_schema_version=registry_schema_version,
142 )
144 @classmethod
145 def currentVersions(cls) -> list[VersionTuple]:
146 # Docstring inherited from VersionedExtension.
147 return cls._versions
149 @classmethod
150 def makeStaticTableSpecs(
151 cls,
152 collections: type[CollectionManager],
153 universe: DimensionUniverse,
154 schema_version: VersionTuple | None,
155 ) -> StaticDatasetTablesTuple:
156 """Construct all static tables used by the classes in this package.
158 Static tables are those that are present in all Registries and do not
159 depend on what DatasetTypes have been registered.
161 Parameters
162 ----------
163 collections : `type` [`CollectionManager`]
164 Manager class for the collections in this `Registry`.
165 universe : `DimensionUniverse`
166 Universe graph containing all dimensions known to this `Registry`.
167 schema_version : `VersionTuple` or `None`
168 Version of the schema that should be created; if `None`, the
169 default schema version is used.
171 Returns
172 -------
173 specs : `StaticDatasetTablesTuple`
174 A named tuple containing `ddl.TableSpec` instances.
175 """
176 schema_version = cls.clsNewSchemaVersion(schema_version)
177 assert schema_version is not None, "New schema version cannot be None"
178 return makeStaticTableSpecs(
179 collections,
180 universe=universe,
181 dtype=cls.getIdColumnType(),
182 autoincrement=cls._autoincrement,
183 schema_version=schema_version,
184 )
186 @classmethod
187 def getIdColumnType(cls) -> type:
188 # Docstring inherited from base class.
189 return cls._idColumnType
191 @classmethod
192 def addDatasetForeignKey(
193 cls,
194 tableSpec: ddl.TableSpec,
195 *,
196 name: str = "dataset",
197 constraint: bool = True,
198 onDelete: str | None = None,
199 **kwargs: Any,
200 ) -> ddl.FieldSpec:
201 # Docstring inherited from DatasetRecordStorageManager.
202 return addDatasetForeignKey(
203 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
204 )
206 def refresh(self) -> None:
207 # Docstring inherited from DatasetRecordStorageManager.
208 byName: dict[str, ByDimensionsDatasetRecordStorage] = {}
209 byId: dict[int, ByDimensionsDatasetRecordStorage] = {}
210 dataset_types: dict[int, DatasetType] = {}
211 c = self._static.dataset_type.columns
212 with self._db.query(self._static.dataset_type.select()) as sql_result:
213 sql_rows = sql_result.mappings().fetchall()
214 for row in sql_rows:
215 name = row[c.name]
216 dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key])
217 calibTableName = row[c.calibration_association_table]
218 datasetType = DatasetType(
219 name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None)
220 )
221 tags = self._db.getExistingTable(
222 row[c.tag_association_table],
223 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
224 )
225 if tags is None:  # 225 ↛ 226 (line 225 didn't jump to line 226 because the condition on line 225 was never true)
226 raise MissingDatabaseTableError(
227 f"Table {row[c.tag_association_table]} is missing from database schema."
228 )
229 if calibTableName is not None:
230 calibs = self._db.getExistingTable(
231 row[c.calibration_association_table],
232 makeCalibTableSpec(
233 datasetType,
234 type(self._collections),
235 self._db.getTimespanRepresentation(),
236 self.getIdColumnType(),
237 ),
238 )
239 if calibs is None:  # 239 ↛ 240 (line 239 didn't jump to line 240 because the condition on line 239 was never true)
240 raise MissingDatabaseTableError(
241 f"Table {row[c.calibration_association_table]} is missing from database schema."
242 )
243 else:
244 calibs = None
245 storage = self._recordStorageType(
246 db=self._db,
247 datasetType=datasetType,
248 static=self._static,
249 summaries=self._summaries,
250 tags=tags,
251 calibs=calibs,
252 dataset_type_id=row["id"],
253 collections=self._collections,
254 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
255 )
256 byName[datasetType.name] = storage
257 byId[storage._dataset_type_id] = storage
258 dataset_types[row["id"]] = datasetType
259 self._byName = byName
260 self._byId = byId
261 self._summaries.refresh(dataset_types)
263 def remove(self, name: str) -> None:
264 # Docstring inherited from DatasetRecordStorageManager.
265 compositeName, componentName = DatasetType.splitDatasetTypeName(name)
266 if componentName is not None:
267 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")
269 # Delete the row
270 try:
271 self._db.delete(self._static.dataset_type, ["name"], {"name": name})
272 except sqlalchemy.exc.IntegrityError as e:
273 raise OrphanedRecordError(
274 f"Dataset type {name} can not be removed."
275 " It is associated with datasets that must be removed first."
276 ) from e
278 # Now refresh everything -- removal is rare enough that this does
279 # not need to be fast.
280 self.refresh()
282 def find(self, name: str) -> DatasetRecordStorage | None:
283 # Docstring inherited from DatasetRecordStorageManager.
284 return self._byName.get(name)
286 def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
287 # Docstring inherited from DatasetRecordStorageManager.
288 if datasetType.isComponent():  # 288 ↛ 289 (line 288 didn't jump to line 289 because the condition on line 288 was never true)
289 raise ValueError(
290 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
291 )
292 storage = self._byName.get(datasetType.name)
293 if storage is None:
294 dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions)
295 tagTableName = makeTagTableName(datasetType, dimensionsKey)
296 calibTableName = (
297 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
298 )
299 # The order is important here: we want to create the tables first and
300 # only register them if that operation is successful. We cannot wrap
301 # this in a transaction because the database class assumes that DDL is
302 # not transaction-safe in general.
303 tags = self._db.ensureTableExists(
304 tagTableName,
305 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
306 )
307 if calibTableName is not None:
308 calibs = self._db.ensureTableExists(
309 calibTableName,
310 makeCalibTableSpec(
311 datasetType,
312 type(self._collections),
313 self._db.getTimespanRepresentation(),
314 self.getIdColumnType(),
315 ),
316 )
317 else:
318 calibs = None
319 row, inserted = self._db.sync(
320 self._static.dataset_type,
321 keys={"name": datasetType.name},
322 compared={
323 "dimensions_key": dimensionsKey,
324 # Force the storage class to be loaded to ensure it
325 # exists and there is no typo in the name.
326 "storage_class": datasetType.storageClass.name,
327 },
328 extra={
329 "tag_association_table": tagTableName,
330 "calibration_association_table": calibTableName,
331 },
332 returning=["id", "tag_association_table"],
333 )
334 assert row is not None
335 storage = self._recordStorageType(
336 db=self._db,
337 datasetType=datasetType,
338 static=self._static,
339 summaries=self._summaries,
340 tags=tags,
341 calibs=calibs,
342 dataset_type_id=row["id"],
343 collections=self._collections,
344 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
345 )
346 self._byName[datasetType.name] = storage
347 self._byId[storage._dataset_type_id] = storage
348 else:
349 if datasetType != storage.datasetType:
350 raise ConflictingDefinitionError(
351 f"Given dataset type {datasetType} is inconsistent "
352 f"with database definition {storage.datasetType}."
353 )
354 inserted = False
355 return storage, bool(inserted)
357 def resolve_wildcard(
358 self,
359 expression: Any,
360 components: bool | None = False,
361 missing: list[str] | None = None,
362 explicit_only: bool = False,
363 components_deprecated: bool = True,
364 ) -> dict[DatasetType, list[str | None]]:
365 wildcard = DatasetTypeWildcard.from_expression(expression)
366 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
367 # This message can be transformed into an error on DM-36303 after v26,
368 # and the components and components_deprecated arguments can be merged
369 # into one on DM-36457 after v27.
370 deprecation_message = (
371 "Querying for component datasets via Registry query methods is deprecated in favor of using "
372 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
373 "after v26, and the components argument will be removed after v27."
374 )
375 for name, dataset_type in wildcard.values.items():
376 parent_name, component_name = DatasetType.splitDatasetTypeName(name)
377 if component_name is not None and components_deprecated:
378 warnings.warn(
379 deprecation_message, FutureWarning, stacklevel=find_outside_stacklevel("lsst.daf.butler")
380 )
381 if (found_storage := self.find(parent_name)) is not None:
382 found_parent = found_storage.datasetType
383 if component_name is not None:
384 found = found_parent.makeComponentDatasetType(component_name)
385 else:
386 found = found_parent
387 if dataset_type is not None:
388 if dataset_type.is_compatible_with(found):  # 388 ↛ 396 (line 388 didn't jump to line 396 because the condition on line 388 was never false)
389 # Prefer the given dataset type to enable storage class
390 # conversions.
391 if component_name is not None:
392 found_parent = dataset_type.makeCompositeDatasetType()
393 else:
394 found_parent = dataset_type
395 else:
396 raise DatasetTypeError(
397 f"Dataset type definition in query expression {dataset_type} is "
398 f"not compatible with the registered type {found}."
399 )
400 result[found_parent].add(component_name)
401 elif missing is not None:
402 missing.append(name)
403 already_warned = False
404 if wildcard.patterns is ...:
405 if explicit_only:
406 raise TypeError(
407 "Universal wildcard '...' is not permitted for dataset types in this context."
408 )
409 for storage in self._byName.values():
410 result[storage.datasetType].add(None)
411 if components:
412 try:
413 result[storage.datasetType].update(
414 storage.datasetType.storageClass.allComponents().keys()
415 )
416 if (
417 storage.datasetType.storageClass.allComponents()
418 and not already_warned
419 and components_deprecated
420 ):
421 warnings.warn(
422 deprecation_message,
423 FutureWarning,
424 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
425 )
426 already_warned = True
427 except KeyError as err:
428 _LOG.warning(
429 f"Could not load storage class {err} for {storage.datasetType.name}; "
430 "if it has components they will not be included in query results.",
431 )
432 elif wildcard.patterns:
433 if explicit_only:
434 # After v26 this should raise DatasetTypeExpressionError, to
435 # be implemented on DM-36303.
436 warnings.warn(
437 "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
438 FutureWarning,
439 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
440 )
441 for storage in self._byName.values():
442 if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns):
443 result[storage.datasetType].add(None)
444 if components is not False:
445 for storage in self._byName.values():
446 if components is None and storage.datasetType in result:  # 446 ↛ 447 (line 446 didn't jump to line 447 because the condition on line 446 was never true)
447 continue
448 try:
449 components_for_parent = storage.datasetType.storageClass.allComponents().keys()
450 except KeyError as err:
451 _LOG.warning(
452 f"Could not load storage class {err} for {storage.datasetType.name}; "
453 "if it has components they will not be included in query results."
454 )
455 continue
456 for component_name in components_for_parent:
457 if any(
458 p.fullmatch(
459 DatasetType.nameWithComponent(storage.datasetType.name, component_name)
460 )
461 for p in wildcard.patterns
462 ):
463 result[storage.datasetType].add(component_name)
464 if not already_warned and components_deprecated:
465 warnings.warn(
466 deprecation_message,
467 FutureWarning,
468 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
469 )
470 already_warned = True
471 return {k: list(v) for k, v in result.items()}
473 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
474 # Docstring inherited from DatasetRecordStorageManager.
475 sql = (
476 sqlalchemy.sql.select(
477 self._static.dataset.columns.dataset_type_id,
478 self._static.dataset.columns[self._collections.getRunForeignKeyName()],
479 )
480 .select_from(self._static.dataset)
481 .where(self._static.dataset.columns.id == id)
482 )
483 with self._db.query(sql) as sql_result:
484 row = sql_result.mappings().fetchone()
485 if row is None:
486 return None
487 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
488 if recordsForType is None:  # 488 ↛ 489 (line 488 didn't jump to line 489 because the condition on line 488 was never true)
489 self.refresh()
490 recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id])
491 assert recordsForType is not None, "Should be guaranteed by foreign key constraints."
492 return DatasetRef(
493 recordsForType.datasetType,
494 dataId=recordsForType.getDataId(id=id),
495 id=id,
496 run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
497 )
499 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
500 # Docstring inherited from DatasetRecordStorageManager.
501 return self._summaries.get(collection)
503 _versions: list[VersionTuple]
504 """Schema version for this class."""
506 _recordStorageType: type[ByDimensionsDatasetRecordStorage]
507 """Type of the storage class returned by this manager."""
509 _autoincrement: bool
510 """If True then PK column of the dataset table is auto-increment."""
512 _idColumnType: type
513 """Type of dataset column used to store dataset ID."""
516class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
517 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses
518 UUID for dataset primary key.
519 """
521 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS]
522 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
523 _autoincrement: bool = False
524 _idColumnType: type = ddl.GUID
526 @classmethod
527 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
528 # Docstring inherited from DatasetRecordStorageManager.
529 return True
531 @classmethod
532 def _newDefaultSchemaVersion(cls) -> VersionTuple:
533 # Docstring inherited from VersionedExtension.
535 # By default return 1.0.0 so that older clients can still access new
536 # registries created with a default config.
537 return _VERSION_UUID
539 def ingest_date_dtype(self) -> type:
540 """Return type of the ``ingest_date`` column."""
541 schema_version = self.newSchemaVersion()
542 if schema_version is not None and schema_version.major > 1:
543 return ddl.AstropyTimeNsecTai
544 else:
545 return sqlalchemy.TIMESTAMP
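
The sketch below is an editorial addition, not part of `_manager.py` or its coverage data. It illustrates how calling code might exercise the manager API shown above (`initialize`, `refresh`, `register`, `find`); the function name and its parameters are hypothetical, and it assumes already-constructed `Database`, `StaticTablesContext`, `CollectionManager`, and `DimensionRecordStorageManager` objects.

def _example_register_and_lookup(
    db: Database,
    context: StaticTablesContext,
    collections: CollectionManager,
    dimensions: DimensionRecordStorageManager,
    dataset_type: DatasetType,
) -> None:
    """Hypothetical driver code exercising the manager API above."""
    manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
        db, context, collections=collections, dimensions=dimensions
    )
    # Load every registered DatasetType into the manager's in-memory caches.
    manager.refresh()
    # register() returns (storage, inserted); `inserted` is False when an
    # identical definition already exists, and ConflictingDefinitionError is
    # raised if the stored definition differs from the one given.
    storage, inserted = manager.register(dataset_type)
    # find() returns the per-dataset-type storage object, or None for names
    # that were never registered.
    assert manager.find(dataset_type.name) is storage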