Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py: 91%
261 statements
coverage.py v7.3.2, created at 2023-12-01 10:59 +0000
1from __future__ import annotations
3from .... import ddl
5__all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",)
7import dataclasses
8import logging
9import warnings
10from collections import defaultdict
11from collections.abc import Iterable, Mapping
12from typing import TYPE_CHECKING, Any
14import sqlalchemy
15from lsst.utils.introspection import find_outside_stacklevel
17from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType
18from ....dimensions import DimensionUniverse
19from ..._collection_summary import CollectionSummary
20from ..._exceptions import ConflictingDefinitionError, DatasetTypeError, OrphanedRecordError
21from ...interfaces import DatasetRecordStorage, DatasetRecordStorageManager, VersionTuple
22from ...wildcards import DatasetTypeWildcard
23from ._storage import ByDimensionsDatasetRecordStorage, ByDimensionsDatasetRecordStorageUUID
24from .summaries import CollectionSummaryManager
25from .tables import (
26 addDatasetForeignKey,
27 makeCalibTableName,
28 makeCalibTableSpec,
29 makeStaticTableSpecs,
30 makeTagTableName,
31 makeTagTableSpec,
32)
34if TYPE_CHECKING:
35 from ..._caching_context import CachingContext
36 from ...interfaces import (
37 CollectionManager,
38 CollectionRecord,
39 Database,
40 DimensionRecordStorageManager,
41 StaticTablesContext,
42 )
43 from .tables import StaticDatasetTablesTuple
46# This has to be updated on every schema change
47_VERSION_UUID = VersionTuple(1, 0, 0)
48# Starting with 2.0.0, the `ingest_date` column type uses nanoseconds instead
49# of TIMESTAMP. The code supports both 1.0.0 and 2.0.0 for the duration of
50# the client migration period.
51_VERSION_UUID_NS = VersionTuple(2, 0, 0)
53_LOG = logging.getLogger(__name__)
56class MissingDatabaseTableError(RuntimeError):
57 """Exception raised when a table is not found in a database."""
60@dataclasses.dataclass
61class _DatasetTypeRecord:
62 """Contents of a single dataset type record."""
64 dataset_type: DatasetType
65 dataset_type_id: int
66 tag_table_name: str
67 calib_table_name: str | None
70class _SpecTableFactory:
71 """Factory for `sqlalchemy.schema.Table` instances that builds table
72 instances using provided `ddl.TableSpec` definition and verifies that
73 table exists in the database.
74 """
76 def __init__(self, db: Database, name: str, spec: ddl.TableSpec):
77 self._db = db
78 self._name = name
79 self._spec = spec
81 def __call__(self) -> sqlalchemy.schema.Table:
82 table = self._db.getExistingTable(self._name, self._spec)
 83 if table is None:  # partial branch: 83 ↛ 84, condition was never true
84 raise MissingDatabaseTableError(f"Table {self._name} is missing from database schema.")
85 return table
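

# A minimal usage sketch for the factory above; ``db``, ``name`` and ``spec``
# are assumed to describe a tag or calibration table that already exists in
# the database schema, and the function name is illustrative only.
def _example_build_table(db: Database, name: str, spec: ddl.TableSpec) -> sqlalchemy.schema.Table:
    factory = _SpecTableFactory(db, name, spec)
    # The table object is resolved only when the factory is called, so storage
    # instances can be created without touching the database; a table missing
    # from the schema surfaces as MissingDatabaseTableError at first use.
    return factory()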
88class ByDimensionsDatasetRecordStorageManagerBase(DatasetRecordStorageManager):
89 """A manager class for datasets that uses one dataset-collection table for
90 each group of dataset types that share the same dimensions.
92 In addition to the table organization, this class makes a number of
93 other design choices that would have been cumbersome (to say the least) to
94 try to pack into its name:
96 - It uses a private surrogate integer autoincrement field to identify
97 dataset types, instead of using the name as the primary and foreign key
98 directly.
100 - It aggressively loads all DatasetTypes into memory instead of fetching
101 them from the database only when needed or attempting more clever forms
102 of caching.
104 Alternative implementations that make different choices for these while
105 keeping the same general table organization might be reasonable as well.
107 This class provides a complete implementation of the manager logic, but
108 it is parametrized by a few class attributes that have to be defined by
109 subclasses.
111 Parameters
112 ----------
113 db : `Database`
114 Interface to the underlying database engine and namespace.
115 collections : `CollectionManager`
116 Manager object for the collections in this `Registry`.
117 dimensions : `DimensionRecordStorageManager`
118 Manager object for the dimensions in this `Registry`.
119 static : `StaticDatasetTablesTuple`
120 Named tuple of `sqlalchemy.schema.Table` instances for all static
121 tables used by this class.
122 summaries : `CollectionSummaryManager`
123 Structure containing tables that summarize the contents of collections.
124 caching_context : `CachingContext`
125 Object controlling caching of information returned by managers.
126 """
128 def __init__(
129 self,
130 *,
131 db: Database,
132 collections: CollectionManager,
133 dimensions: DimensionRecordStorageManager,
134 static: StaticDatasetTablesTuple,
135 summaries: CollectionSummaryManager,
136 caching_context: CachingContext,
137 registry_schema_version: VersionTuple | None = None,
138 ):
139 super().__init__(registry_schema_version=registry_schema_version)
140 self._db = db
141 self._collections = collections
142 self._dimensions = dimensions
143 self._static = static
144 self._summaries = summaries
145 self._caching_context = caching_context
147 @classmethod
148 def initialize(
149 cls,
150 db: Database,
151 context: StaticTablesContext,
152 *,
153 collections: CollectionManager,
154 dimensions: DimensionRecordStorageManager,
155 caching_context: CachingContext,
156 registry_schema_version: VersionTuple | None = None,
157 ) -> DatasetRecordStorageManager:
158 # Docstring inherited from DatasetRecordStorageManager.
159 specs = cls.makeStaticTableSpecs(
160 type(collections), universe=dimensions.universe, schema_version=registry_schema_version
161 )
162 static: StaticDatasetTablesTuple = context.addTableTuple(specs) # type: ignore
163 summaries = CollectionSummaryManager.initialize(
164 db,
165 context,
166 collections=collections,
167 dimensions=dimensions,
168 dataset_type_table=static.dataset_type,
169 caching_context=caching_context,
170 )
171 return cls(
172 db=db,
173 collections=collections,
174 dimensions=dimensions,
175 static=static,
176 summaries=summaries,
177 caching_context=caching_context,
178 registry_schema_version=registry_schema_version,
179 )
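
    # A minimal sketch of how a concrete manager is typically constructed
    # during schema definition; ``db``, ``context``, ``collections``,
    # ``dimensions`` and ``caching_context`` are assumed to be supplied by
    # the registry and are illustrative here:
    #
    #     manager = ByDimensionsDatasetRecordStorageManagerUUID.initialize(
    #         db,
    #         context,
    #         collections=collections,
    #         dimensions=dimensions,
    #         caching_context=caching_context,
    #     )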
181 @classmethod
182 def currentVersions(cls) -> list[VersionTuple]:
183 # Docstring inherited from VersionedExtension.
184 return cls._versions
186 @classmethod
187 def makeStaticTableSpecs(
188 cls,
189 collections: type[CollectionManager],
190 universe: DimensionUniverse,
191 schema_version: VersionTuple | None,
192 ) -> StaticDatasetTablesTuple:
193 """Construct all static tables used by the classes in this package.
195 Static tables are those that are present in all Registries and do not
196 depend on what DatasetTypes have been registered.
198 Parameters
199 ----------
200 collections : `type` [`CollectionManager`]
201 Type of the manager object for the collections in this `Registry`.
202 universe : `DimensionUniverse`
203 Universe graph containing all dimensions known to this `Registry`.
204 schema_version : `VersionTuple` or `None`
205 Version of the schema that should be created; if `None`, the
206 default schema version should be used.
208 Returns
209 -------
210 specs : `StaticDatasetTablesTuple`
211 A named tuple containing `ddl.TableSpec` instances.
212 """
213 schema_version = cls.clsNewSchemaVersion(schema_version)
214 assert schema_version is not None, "New schema version cannot be None"
215 return makeStaticTableSpecs(
216 collections,
217 universe=universe,
218 dtype=cls.getIdColumnType(),
219 autoincrement=cls._autoincrement,
220 schema_version=schema_version,
221 )
223 @classmethod
224 def getIdColumnType(cls) -> type:
225 # Docstring inherited from base class.
226 return cls._idColumnType
228 @classmethod
229 def addDatasetForeignKey(
230 cls,
231 tableSpec: ddl.TableSpec,
232 *,
233 name: str = "dataset",
234 constraint: bool = True,
235 onDelete: str | None = None,
236 **kwargs: Any,
237 ) -> ddl.FieldSpec:
238 # Docstring inherited from DatasetRecordStorageManager.
239 return addDatasetForeignKey(
240 tableSpec, cls.getIdColumnType(), name=name, onDelete=onDelete, constraint=constraint, **kwargs
241 )
243 def refresh(self) -> None:
244 # Docstring inherited from DatasetRecordStorageManager.
245 if self._caching_context.dataset_types is not None:  # partial branch: 245 ↛ exit, condition was never false
246 self._caching_context.dataset_types.clear()
248 def _make_storage(self, record: _DatasetTypeRecord) -> ByDimensionsDatasetRecordStorage:
249 """Create storage instance for a dataset type record."""
250 tags_spec = makeTagTableSpec(record.dataset_type, type(self._collections), self.getIdColumnType())
251 tags_table_factory = _SpecTableFactory(self._db, record.tag_table_name, tags_spec)
252 calibs_table_factory = None
253 if record.calib_table_name is not None:
254 calibs_spec = makeCalibTableSpec(
255 record.dataset_type,
256 type(self._collections),
257 self._db.getTimespanRepresentation(),
258 self.getIdColumnType(),
259 )
260 calibs_table_factory = _SpecTableFactory(self._db, record.calib_table_name, calibs_spec)
261 storage = self._recordStorageType(
262 db=self._db,
263 datasetType=record.dataset_type,
264 static=self._static,
265 summaries=self._summaries,
266 tags_table_factory=tags_table_factory,
267 calibs_table_factory=calibs_table_factory,
268 dataset_type_id=record.dataset_type_id,
269 collections=self._collections,
270 use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai,
271 )
272 return storage
274 def remove(self, name: str) -> None:
275 # Docstring inherited from DatasetRecordStorageManager.
276 compositeName, componentName = DatasetType.splitDatasetTypeName(name)
277 if componentName is not None:
278 raise ValueError(f"Cannot delete a dataset type of a component of a composite (given {name})")
280 # Delete the row
281 try:
282 self._db.delete(self._static.dataset_type, ["name"], {"name": name})
283 except sqlalchemy.exc.IntegrityError as e:
284 raise OrphanedRecordError(
285 f"Dataset type {name} can not be removed."
286 " It is associated with datasets that must be removed first."
287 ) from e
289 # Now refresh everything -- removal is rare enough that this does
290 # not need to be fast.
291 self.refresh()
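
    # A minimal sketch of the removal semantics above (dataset type names are
    # illustrative):
    #
    #     manager.remove("flat")      # deletes the dataset type definition
    #     manager.remove("flat.wcs")  # ValueError: components cannot be removed
    #     # If datasets of the type still exist, the underlying IntegrityError
    #     # is re-raised as OrphanedRecordError.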
293 def find(self, name: str) -> DatasetRecordStorage | None:
294 # Docstring inherited from DatasetRecordStorageManager.
295 if self._caching_context.dataset_types is not None:  # partial branch: 295 ↛ 309, condition was never false
296 _, storage = self._caching_context.dataset_types.get(name)
297 if storage is not None:
298 return storage
299 else:
300 # On the first cache miss, populate the cache with the complete
301 # list of dataset types (if that has not been done yet).
302 if not self._caching_context.dataset_types.full:
303 self._fetch_dataset_types()
304 # Try again
305 _, storage = self._caching_context.dataset_types.get(name)
306 if self._caching_context.dataset_types.full:  # partial branch: 306 ↛ 309, condition was never false
307 # If not in cache then dataset type is not defined.
308 return storage
309 record = self._fetch_dataset_type_record(name)
310 if record is not None:
311 storage = self._make_storage(record)
312 if self._caching_context.dataset_types is not None:
313 self._caching_context.dataset_types.add(storage.datasetType, storage)
314 return storage
315 else:
316 return None
318 def register(self, datasetType: DatasetType) -> bool:
319 # Docstring inherited from DatasetRecordStorageManager.
320 if datasetType.isComponent():  # partial branch: 320 ↛ 321, condition was never true
321 raise ValueError(
322 f"Component dataset types can not be stored in registry. Rejecting {datasetType.name}"
323 )
324 record = self._fetch_dataset_type_record(datasetType.name)
325 if record is None:
326 dimensionsKey = self._dimensions.save_dimension_group(datasetType.dimensions.as_group())
327 tagTableName = makeTagTableName(datasetType, dimensionsKey)
328 self._db.ensureTableExists(
329 tagTableName,
330 makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()),
331 )
332 calibTableName = (
333 makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None
334 )
335 if calibTableName is not None:
336 self._db.ensureTableExists(
337 calibTableName,
338 makeCalibTableSpec(
339 datasetType,
340 type(self._collections),
341 self._db.getTimespanRepresentation(),
342 self.getIdColumnType(),
343 ),
344 )
345 row, inserted = self._db.sync(
346 self._static.dataset_type,
347 keys={"name": datasetType.name},
348 compared={
349 "dimensions_key": dimensionsKey,
350 # Force the storage class to be loaded to ensure it
351 # exists and there is no typo in the name.
352 "storage_class": datasetType.storageClass.name,
353 },
354 extra={
355 "tag_association_table": tagTableName,
356 "calibration_association_table": calibTableName,
357 },
358 returning=["id", "tag_association_table"],
359 )
360 # Make sure that cache is updated
361 if self._caching_context.dataset_types is not None and row is not None:  # partial branch: 361 ↛ 378, condition was never false
362 record = _DatasetTypeRecord(
363 dataset_type=datasetType,
364 dataset_type_id=row["id"],
365 tag_table_name=tagTableName,
366 calib_table_name=calibTableName,
367 )
368 storage = self._make_storage(record)
369 self._caching_context.dataset_types.add(datasetType, storage)
370 else:
371 if datasetType != record.dataset_type:
372 raise ConflictingDefinitionError(
373 f"Given dataset type {datasetType} is inconsistent "
374 f"with database definition {record.dataset_type}."
375 )
376 inserted = False
378 return bool(inserted)
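
    # A minimal sketch of the registration contract above (``flat_type`` is
    # an illustrative DatasetType instance):
    #
    #     inserted = manager.register(flat_type)  # True on first registration
    #     inserted = manager.register(flat_type)  # False: already registered
    #     # Registering the same name with different dimensions or storage
    #     # class raises ConflictingDefinitionError; component dataset types
    #     # are rejected with ValueError.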
380 def resolve_wildcard(
381 self,
382 expression: Any,
383 components: bool | None = False,
384 missing: list[str] | None = None,
385 explicit_only: bool = False,
386 components_deprecated: bool = True,
387 ) -> dict[DatasetType, list[str | None]]:
388 wildcard = DatasetTypeWildcard.from_expression(expression)
389 result: defaultdict[DatasetType, set[str | None]] = defaultdict(set)
390 # This message can be transformed into an error on DM-36303 after v26,
391 # and the components and components_deprecated arguments can be merged
392 # into one on DM-36457 after v27.
393 deprecation_message = (
394 "Querying for component datasets via Registry query methods is deprecated in favor of using "
395 "DatasetRef and DatasetType methods on parent datasets. Only components=False will be supported "
396 "after v26, and the components argument will be removed after v27."
397 )
398 for name, dataset_type in wildcard.values.items():
399 parent_name, component_name = DatasetType.splitDatasetTypeName(name)
400 if component_name is not None and components_deprecated:
401 warnings.warn(
402 deprecation_message, FutureWarning, stacklevel=find_outside_stacklevel("lsst.daf.butler")
403 )
404 if (found_storage := self.find(parent_name)) is not None:
405 found_parent = found_storage.datasetType
406 if component_name is not None:
407 found = found_parent.makeComponentDatasetType(component_name)
408 else:
409 found = found_parent
410 if dataset_type is not None:
411 if dataset_type.is_compatible_with(found):  # partial branch: 411 ↛ 419, condition was never false
412 # Prefer the given dataset type to enable storage class
413 # conversions.
414 if component_name is not None:
415 found_parent = dataset_type.makeCompositeDatasetType()
416 else:
417 found_parent = dataset_type
418 else:
419 raise DatasetTypeError(
420 f"Dataset type definition in query expression {dataset_type} is "
421 f"not compatible with the registered type {found}."
422 )
423 result[found_parent].add(component_name)
424 elif missing is not None:
425 missing.append(name)
426 already_warned = False
427 if wildcard.patterns is ...:
428 if explicit_only:
429 raise TypeError(
430 "Universal wildcard '...' is not permitted for dataset types in this context."
431 )
432 for datasetType in self._fetch_dataset_types():
433 result[datasetType].add(None)
434 if components:
435 try:
436 result[datasetType].update(datasetType.storageClass.allComponents().keys())
437 if (
438 datasetType.storageClass.allComponents()
439 and not already_warned
440 and components_deprecated
441 ):
442 warnings.warn(
443 deprecation_message,
444 FutureWarning,
445 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
446 )
447 already_warned = True
448 except KeyError as err:
449 _LOG.warning(
450 f"Could not load storage class {err} for {datasetType.name}; "
451 "if it has components they will not be included in query results.",
452 )
453 elif wildcard.patterns:
454 if explicit_only:
455 # After v26 this should raise DatasetTypeExpressionError, to
456 # be implemented on DM-36303.
457 warnings.warn(
458 "Passing wildcard patterns here is deprecated and will be prohibited after v26.",
459 FutureWarning,
460 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
461 )
462 dataset_types = self._fetch_dataset_types()
463 for datasetType in dataset_types:
464 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
465 result[datasetType].add(None)
466 if components is not False:
467 for datasetType in dataset_types:
468 if components is None and datasetType in result:  # partial branch: 468 ↛ 469, condition was never true
469 continue
470 try:
471 components_for_parent = datasetType.storageClass.allComponents().keys()
472 except KeyError as err:
473 _LOG.warning(
474 f"Could not load storage class {err} for {datasetType.name}; "
475 "if it has components they will not be included in query results."
476 )
477 continue
478 for component_name in components_for_parent:
479 if any(
480 p.fullmatch(DatasetType.nameWithComponent(datasetType.name, component_name))
481 for p in wildcard.patterns
482 ):
483 result[datasetType].add(component_name)
484 if not already_warned and components_deprecated:
485 warnings.warn(
486 deprecation_message,
487 FutureWarning,
488 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
489 )
490 already_warned = True
491 return {k: list(v) for k, v in result.items()}
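
    # A minimal sketch of wildcard resolution (expressions are illustrative,
    # and ``re`` is assumed to be imported by the caller):
    #
    #     manager.resolve_wildcard(...)                   # every registered dataset type
    #     manager.resolve_wildcard(re.compile("flat.*"))  # regex match against names
    #     missing: list[str] = []
    #     manager.resolve_wildcard("no_such_type", missing=missing)
    #     # Unknown explicit names are appended to ``missing`` rather than
    #     # raising; component results are only produced when ``components``
    #     # is not False, and that path is deprecated.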
493 def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
494 # Docstring inherited from DatasetRecordStorageManager.
495 sql = (
496 sqlalchemy.sql.select(
497 self._static.dataset.columns.dataset_type_id,
498 self._static.dataset.columns[self._collections.getRunForeignKeyName()],
499 *self._static.dataset_type.columns,
500 )
501 .select_from(self._static.dataset)
502 .join(self._static.dataset_type)
503 .where(self._static.dataset.columns.id == id)
504 )
505 with self._db.query(sql) as sql_result:
506 row = sql_result.mappings().fetchone()
507 if row is None:
508 return None
509 record = self._record_from_row(row)
510 storage: DatasetRecordStorage | None = None
511 if self._caching_context.dataset_types is not None:  # partial branch: 511 ↛ 513, condition was never false
512 _, storage = self._caching_context.dataset_types.get(record.dataset_type.name)
513 if storage is None:  # partial branch: 513 ↛ 514, condition was never true
514 storage = self._make_storage(record)
515 if self._caching_context.dataset_types is not None:
516 self._caching_context.dataset_types.add(storage.datasetType, storage)
517 assert isinstance(storage, ByDimensionsDatasetRecordStorage), "Not expected storage class"
518 return DatasetRef(
519 storage.datasetType,
520 dataId=storage.getDataId(id=id),
521 id=id,
522 run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
523 )
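
    # A minimal sketch of primary-key lookup (``dataset_id`` is an
    # illustrative DatasetId, i.e. a UUID for this manager):
    #
    #     ref = manager.getDatasetRef(dataset_id)
    #     # Returns a DatasetRef carrying the dataset type, data ID, and run
    #     # name, or None when no dataset with that ID exists.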
525 def _fetch_dataset_type_record(self, name: str) -> _DatasetTypeRecord | None:
526 """Retrieve all dataset types defined in database.
528 Yields
529 ------
530 dataset_types : `_DatasetTypeRecord`
531 Information from a single database record.
532 """
533 c = self._static.dataset_type.columns
534 stmt = self._static.dataset_type.select().where(c.name == name)
535 with self._db.query(stmt) as sql_result:
536 row = sql_result.mappings().one_or_none()
537 if row is None:
538 return None
539 else:
540 return self._record_from_row(row)
542 def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord:
543 name = row["name"]
544 dimensions = self._dimensions.load_dimension_group(row["dimensions_key"])
545 calibTableName = row["calibration_association_table"]
546 datasetType = DatasetType(
547 name, dimensions, row["storage_class"], isCalibration=(calibTableName is not None)
548 )
549 return _DatasetTypeRecord(
550 dataset_type=datasetType,
551 dataset_type_id=row["id"],
552 tag_table_name=row["tag_association_table"],
553 calib_table_name=calibTableName,
554 )
556 def _dataset_type_from_row(self, row: Mapping) -> DatasetType:
557 return self._record_from_row(row).dataset_type
559 def _fetch_dataset_types(self) -> list[DatasetType]:
560 """Fetch list of all defined dataset types."""
561 if self._caching_context.dataset_types is not None:  # partial branch: 561 ↛ 564, condition was never false
562 if self._caching_context.dataset_types.full:
563 return [dataset_type for dataset_type, _ in self._caching_context.dataset_types.items()]
564 with self._db.query(self._static.dataset_type.select()) as sql_result:
565 sql_rows = sql_result.mappings().fetchall()
566 records = [self._record_from_row(row) for row in sql_rows]
567 # Cache everything and specify that cache is complete.
568 if self._caching_context.dataset_types is not None:  # partial branch: 568 ↛ 571, condition was never false
569 cache_data = [(record.dataset_type, self._make_storage(record)) for record in records]
570 self._caching_context.dataset_types.set(cache_data, full=True)
571 return [record.dataset_type for record in records]
573 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
574 # Docstring inherited from DatasetRecordStorageManager.
575 summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_from_row)
576 return summaries[collection.key]
578 def fetch_summaries(
579 self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
580 ) -> Mapping[Any, CollectionSummary]:
581 # Docstring inherited from DatasetRecordStorageManager.
582 dataset_type_names: Iterable[str] | None = None
583 if dataset_types is not None:  # partial branch: 583 ↛ 585, condition was never false
584 dataset_type_names = set(dataset_type.name for dataset_type in dataset_types)
585 return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row)
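
    # A minimal sketch of summary retrieval (``run_record`` and ``flat_type``
    # are illustrative):
    #
    #     summaries = manager.fetch_summaries([run_record], dataset_types=[flat_type])
    #     summary = summaries[run_record.key]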
587 _versions: list[VersionTuple]
588 """Schema version for this class."""
590 _recordStorageType: type[ByDimensionsDatasetRecordStorage]
591 """Type of the storage class returned by this manager."""
593 _autoincrement: bool
594 """If True then PK column of the dataset table is auto-increment."""
596 _idColumnType: type
597 """Type of dataset column used to store dataset ID."""
600class ByDimensionsDatasetRecordStorageManagerUUID(ByDimensionsDatasetRecordStorageManagerBase):
601 """Implementation of ByDimensionsDatasetRecordStorageManagerBase which uses
602 UUID for dataset primary key.
603 """
605 _versions: list[VersionTuple] = [_VERSION_UUID, _VERSION_UUID_NS]
606 _recordStorageType: type[ByDimensionsDatasetRecordStorage] = ByDimensionsDatasetRecordStorageUUID
607 _autoincrement: bool = False
608 _idColumnType: type = ddl.GUID
610 @classmethod
611 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
612 # Docstring inherited from DatasetRecordStorageManager.
613 return True
615 @classmethod
616 def _newDefaultSchemaVersion(cls) -> VersionTuple:
617 # Docstring inherited from VersionedExtension.
619 # By default return 1.0.0 so that older clients can still access new
620 # registries created with a default config.
621 return _VERSION_UUID
623 def ingest_date_dtype(self) -> type:
624 """Return type of the ``ingest_date`` column."""
625 schema_version = self.newSchemaVersion()
626 if schema_version is not None and schema_version.major > 1:
627 return ddl.AstropyTimeNsecTai
628 else:
629 return sqlalchemy.TIMESTAMP
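

# A minimal sketch mirroring the schema-version check in ``ingest_date_dtype``
# above; the helper name is illustrative only and is not part of the manager
# API.
def _example_ingest_date_dtype(schema_version: VersionTuple | None) -> type:
    # Schema 2.x stores ingest_date as TAI nanoseconds; 1.x (and the default
    # when no version is known) keeps the original TIMESTAMP column type.
    if schema_version is not None and schema_version.major > 1:
        return ddl.AstropyTimeNsecTai
    return sqlalchemy.TIMESTAMP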