# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""Classes for representing SQL data-definition language (DDL) in Python.

This includes "CREATE TABLE" etc.

This provides an extra layer on top of SQLAlchemy's classes for these
concepts, because we need a level of indirection between logical tables and
the actual SQL, and SQLAlchemy's DDL classes always map 1-1 to SQL.

We've opted for the rather more obscure "ddl" as the name of this module
instead of "schema" because the latter is too overloaded; in most SQL
databases, "schema" is also a term for a namespace.
"""
from __future__ import annotations

from lsst import sphgeom

__all__ = (
    "TableSpec",
    "FieldSpec",
    "ForeignKeySpec",
    "IndexSpec",
    "Base64Bytes",
    "Base64Region",
    "AstropyTimeNsecTai",
    "GUID",
)

import logging
import uuid
from base64 import b64decode, b64encode
from dataclasses import dataclass
from math import ceil
from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Set, Tuple, Type, Union

import astropy.time
import sqlalchemy
from lsst.sphgeom import Region
from lsst.utils.iteration import ensure_iterable
from sqlalchemy.dialects.postgresql import UUID

from . import time_utils
from .config import Config
from .exceptions import ValidationError
from .named import NamedValueSet
from .utils import stripIfNotNone

if TYPE_CHECKING:
    from .timespan import TimespanDatabaseRepresentation


_LOG = logging.getLogger(__name__)


class SchemaValidationError(ValidationError):
    """Exceptions that indicate problems in Registry schema configuration."""

    @classmethod
    def translate(cls, caught: Type[Exception], message: str) -> Callable:
        """Return decorator to re-raise exceptions as `SchemaValidationError`.

        Decorated functions must be class or instance methods, with a
        ``config`` parameter as their first argument.  This will be passed
        to ``message.format()`` as a keyword argument, along with ``err``,
        the original exception.

        Parameters
        ----------
        caught : `type` (`Exception` subclass)
            The type of exception to catch.
        message : `str`
            A `str.format` string that may contain named placeholders for
            ``config``, ``err``, or any keyword-only argument accepted by
            the decorated function.
        """

        def decorate(func: Callable) -> Callable:
            def decorated(self: Any, config: Config, *args: Any, **kwargs: Any) -> Any:
                try:
                    return func(self, config, *args, **kwargs)
                except caught as err:
                    raise cls(message.format(config=str(config), err=err))

            return decorated

        return decorate
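

# An illustrative sketch (not part of the original module) of how
# ``SchemaValidationError.translate`` is meant to be used; the ``fromConfig``
# methods further below follow exactly this pattern:
#
#     @classmethod
#     @SchemaValidationError.translate(KeyError, "Missing key {err} in config '{config}'.")
#     def fromConfig(cls, config: Config) -> "SomeSpec":
#         ...  # any KeyError raised here is re-raised as SchemaValidationError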


class Base64Bytes(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `bytes`.

    Maps Python `bytes` to a base64-encoded `sqlalchemy.Text` field.
    """

    impl = sqlalchemy.Text

    cache_ok = True

    def __init__(self, nbytes: int | None = None, *args: Any, **kwargs: Any):
        if nbytes is not None:
            length = 4 * ceil(nbytes / 3) if self.impl == sqlalchemy.String else None
        else:
            length = None
        super().__init__(*args, length=length, **kwargs)
        self.nbytes = nbytes

    def process_bind_param(self, value: Optional[bytes], dialect: sqlalchemy.engine.Dialect) -> Optional[str]:
        # 'value' is native `bytes`.  We want to encode that to base64
        # `bytes` and then ASCII `str`, because `str` is what SQLAlchemy
        # expects for String fields.
        if value is None:
            return None
        if not isinstance(value, bytes):
            raise TypeError(
                f"Base64Bytes fields require 'bytes' values; got '{value}' with type {type(value)}."
            )
        return b64encode(value).decode("ascii")

    def process_result_value(
        self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[bytes]:
        # 'value' is a `str` that must be ASCII because it's base64-encoded.
        # We want to transform that to base64-encoded `bytes` and then
        # native `bytes`.
        return b64decode(value.encode("ascii")) if value is not None else None

    @property
    def python_type(self) -> Type[bytes]:
        return bytes


# Create an alias for use below, to disambiguate this class from the
# built-in SQLAlchemy types.
LocalBase64Bytes = Base64Bytes
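
# Illustrative round trip (a sketch, not part of the original module).  The
# dialect argument is unused by this type, so None stands in for it here:
#
#     >>> t = Base64Bytes(nbytes=16)
#     >>> encoded = t.process_bind_param(b"hello", None)
#     >>> encoded
#     'aGVsbG8='
#     >>> t.process_result_value(encoded, None)
#     b'hello'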


class Base64Region(Base64Bytes):
    """A SQLAlchemy custom type for Python `sphgeom.Region`.

    Maps Python `sphgeom.Region` to a base64-encoded `sqlalchemy.String`.
    """

    cache_ok = True  # This has to be set explicitly in each subclass.

    def process_bind_param(
        self, value: Optional[Region], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[str]:
        if value is None:
            return None
        return super().process_bind_param(value.encode(), dialect)

    def process_result_value(
        self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[Region]:
        if value is None:
            return None
        return Region.decode(super().process_result_value(value, dialect))

    @property
    def python_type(self) -> Type[sphgeom.Region]:
        return sphgeom.Region
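
# Illustrative round trip (a sketch, not part of the original module),
# assuming lsst.sphgeom's Circle/UnitVector3d/Angle API:
#
#     >>> from lsst.sphgeom import Angle, Circle, UnitVector3d
#     >>> region = Circle(UnitVector3d(1.0, 0.0, 0.0), Angle.fromDegrees(1.0))
#     >>> t = Base64Region()
#     >>> t.process_result_value(t.process_bind_param(region, None), None) == region
#     True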


class AstropyTimeNsecTai(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `astropy.time.Time`.

    Maps Python `astropy.time.Time` to a number of nanoseconds since the
    Unix epoch in TAI scale.
    """

    impl = sqlalchemy.BigInteger

    cache_ok = True

    def process_bind_param(
        self, value: Optional[astropy.time.Time], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[int]:
        if value is None:
            return None
        if not isinstance(value, astropy.time.Time):
            raise TypeError(f"Unsupported type: {type(value)}, expected astropy.time.Time")
        return time_utils.TimeConverter().astropy_to_nsec(value)

    def process_result_value(
        self, value: Optional[int], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[astropy.time.Time]:
        # 'value' is nanoseconds since epoch, or None.
        if value is None:
            return None
        return time_utils.TimeConverter().nsec_to_astropy(value)
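
# Illustrative round trip (a sketch, not part of the original module); any
# precision below one nanosecond is lost in the conversion:
#
#     >>> t = AstropyTimeNsecTai()
#     >>> when = astropy.time.Time("2020-01-01T00:00:00", format="isot", scale="tai")
#     >>> nsec = t.process_bind_param(when, None)
#     >>> t.process_result_value(nsec, None).isot
#     '2020-01-01T00:00:00.000'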


class GUID(sqlalchemy.TypeDecorator):
    """Platform-independent GUID type.

    Uses PostgreSQL's UUID type; otherwise uses CHAR(32), storing values as
    stringified hex.
    """

    impl = sqlalchemy.CHAR

    cache_ok = True

    def load_dialect_impl(self, dialect: sqlalchemy.Dialect) -> sqlalchemy.TypeEngine:
        if dialect.name == "postgresql":
            return dialect.type_descriptor(UUID())
        else:
            return dialect.type_descriptor(sqlalchemy.CHAR(32))

    def process_bind_param(self, value: Any, dialect: sqlalchemy.Dialect) -> Optional[str]:
        if value is None:
            return value

        # Coerce the input to the UUID type.  In general a UUID on input is
        # the only thing we want, but there is code right now that uses ints.
        if isinstance(value, int):
            value = uuid.UUID(int=value)
        elif isinstance(value, bytes):
            value = uuid.UUID(bytes=value)
        elif isinstance(value, str):
            # hexstring
            value = uuid.UUID(hex=value)
        elif not isinstance(value, uuid.UUID):
            raise TypeError(f"Unexpected type of a bind value: {type(value)}")

        if dialect.name == "postgresql":
            return str(value)
        else:
            return "%.32x" % value.int

    def process_result_value(self, value: Optional[str], dialect: sqlalchemy.Dialect) -> Optional[uuid.UUID]:
        if value is None:
            return value
        else:
            return uuid.UUID(hex=value)
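
# Illustrative sketch (not part of the original module) of the coercion
# rules, using SQLAlchemy's built-in SQLite dialect:
#
#     >>> from sqlalchemy.dialects import sqlite
#     >>> GUID().process_bind_param(1, sqlite.dialect())
#     '00000000000000000000000000000001'
#     >>> GUID().process_result_value("00000000000000000000000000000001", sqlite.dialect())
#     UUID('00000000-0000-0000-0000-000000000001')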


VALID_CONFIG_COLUMN_TYPES = {
    "string": sqlalchemy.String,
    "int": sqlalchemy.BigInteger,
    "float": sqlalchemy.Float,
    "region": Base64Region,
    "bool": sqlalchemy.Boolean,
    "blob": sqlalchemy.LargeBinary,
    "datetime": AstropyTimeNsecTai,
    "hash": Base64Bytes,
    "uuid": GUID,
}


@dataclass
class FieldSpec:
    """A data class for defining a column in a logical `Registry` table."""

    name: str
    """Name of the column."""

    dtype: type
    """Type of the column; usually a `type` subclass provided by SQLAlchemy
    that defines both a Python type and a corresponding precise SQL type.
    """

    length: Optional[int] = None
    """Length of the type in the database, for variable-length types."""

    nbytes: Optional[int] = None
    """Natural length used for hash and encoded-region columns, to be
    converted into the post-encoding length.
    """

    primaryKey: bool = False
    """Whether this field is (part of) its table's primary key."""

    autoincrement: bool = False
    """Whether the database should insert automatically incremented values
    when no value is provided in an INSERT.
    """

    nullable: bool = True
    """Whether this field is allowed to be NULL.  If ``primaryKey`` is
    `True`, during construction this value will be forced to `False`.
    """

    default: Any = None
    """A server-side default value for this field.

    This is passed directly as the ``server_default`` argument to
    `sqlalchemy.schema.Column`.  It does *not* go through SQLAlchemy's usual
    type conversion or quoting for Python literals, and should hence be used
    with care.  See the SQLAlchemy documentation for more information.
    """

    doc: Optional[str] = None
    """Documentation for this field."""

    def __post_init__(self) -> None:
        if self.primaryKey:
            # Change the default to match primaryKey.
            self.nullable = False

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, FieldSpec):
            return self.name == other.name
        else:
            return NotImplemented

    def __hash__(self) -> int:
        return hash(self.name)

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in column config '{config}'.")
    def fromConfig(cls, config: Config, **kwargs: Any) -> FieldSpec:
        """Create a `FieldSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the column.  Nested configuration keys
            correspond to `FieldSpec` attributes.
        **kwargs
            Additional keyword arguments that provide defaults for values
            not present in ``config``.

        Returns
        -------
        spec : `FieldSpec`
            Specification structure for the column.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        dtype = VALID_CONFIG_COLUMN_TYPES.get(config["type"])
        if dtype is None:
            raise SchemaValidationError(f"Invalid field type string: '{config['type']}'.")
        if not config["name"].islower():
            raise SchemaValidationError(f"Column name '{config['name']}' is not all lowercase.")
        self = cls(name=config["name"], dtype=dtype, **kwargs)
        self.length = config.get("length", self.length)
        self.nbytes = config.get("nbytes", self.nbytes)
        if self.length is not None and self.nbytes is not None:
            raise SchemaValidationError(f"Both length and nbytes provided for field '{self.name}'.")
        self.primaryKey = config.get("primaryKey", self.primaryKey)
        self.autoincrement = config.get("autoincrement", self.autoincrement)
        self.nullable = config.get("nullable", False if self.primaryKey else self.nullable)
        self.doc = stripIfNotNone(config.get("doc", None))
        return self
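
    # Illustrative sketch (not part of the original module): building a
    # FieldSpec from a small configuration mapping, assuming Config accepts
    # a plain dict:
    #
    #     >>> spec = FieldSpec.fromConfig(Config({"name": "id", "type": "int", "primaryKey": True}))
    #     >>> spec.dtype is sqlalchemy.BigInteger, spec.nullable
    #     (True, False)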

    @classmethod
    def for_region(cls, name: str = "region", nullable: bool = True, nbytes: int = 2048) -> FieldSpec:
        """Create a `FieldSpec` for a spatial region column.

        Parameters
        ----------
        name : `str`, optional
            Name for the field.
        nullable : `bool`, optional
            Whether NULL values are permitted.
        nbytes : `int`, optional
            Maximum number of bytes for serialized regions.  The actual
            column size will be larger to allow for base64 encoding.

        Returns
        -------
        spec : `FieldSpec`
            Specification structure for a region column.
        """
        return cls(name, nullable=nullable, dtype=Base64Region, nbytes=nbytes)

    def isStringType(self) -> bool:
        """Indicate whether this is a `sqlalchemy.String` field spec.

        Returns
        -------
        isString : `bool`
            `True` if the field refers to a `sqlalchemy.String` and not any
            other type.  This can return `False` even if the object was
            created with a string type, if it has been decided that it
            should be implemented as a `sqlalchemy.Text` type instead.
        """
        # Retain short strings as String; longer ones are implemented as
        # Text instead.
        if self.dtype == sqlalchemy.String and self.length and self.length <= 32:
            return True
        return False

    def getSizedColumnType(self) -> sqlalchemy.types.TypeEngine:
        """Return a sized version of the column type.

        Utilizes either (or neither) of ``self.length`` and ``self.nbytes``.

        Returns
        -------
        dtype : `sqlalchemy.types.TypeEngine`
            A SQLAlchemy column type object.
        """
        if self.length is not None:
            # Last-chance check that we are only looking at a possible
            # String; anything longer is implemented as Text.
            if self.dtype == sqlalchemy.String and not self.isStringType():
                return sqlalchemy.Text
            return self.dtype(length=self.length)
        if self.nbytes is not None:
            return self.dtype(nbytes=self.nbytes)
        return self.dtype
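
    # Illustrative sketch (not part of the original module) of the
    # String-vs-Text decision: short strings stay String, longer ones are
    # implemented as Text:
    #
    #     >>> FieldSpec("a", dtype=sqlalchemy.String, length=16).getSizedColumnType()
    #     String(length=16)
    #     >>> FieldSpec("b", dtype=sqlalchemy.String, length=128).getSizedColumnType()
    #     <class 'sqlalchemy.sql.sqltypes.Text'>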

    def getPythonType(self) -> type:
        """Return the Python type associated with this field's (SQL) dtype.

        Returns
        -------
        type : `type`
            Python type associated with this field's (SQL) `dtype`.
        """
        # To construct these objects, the nbytes keyword is needed.
        if issubclass(self.dtype, LocalBase64Bytes):
            # Satisfy mypy for something that must be true.
            assert self.nbytes is not None
            return self.dtype(nbytes=self.nbytes).python_type
        else:
            return self.dtype().python_type  # type: ignore


@dataclass
class ForeignKeySpec:
    """Definition of a foreign key constraint in a logical `Registry` table."""

    table: str
    """Name of the target table."""

    source: Tuple[str, ...]
    """Tuple of source table column names."""

    target: Tuple[str, ...]
    """Tuple of target table column names."""

    onDelete: Optional[str] = None
    """SQL clause indicating how to handle deletes to the target table.

    If not `None` (which indicates that a constraint violation exception
    should be raised), should be either "SET NULL" or "CASCADE".
    """

    addIndex: bool = True
    """If `True`, create an index on the columns of this foreign key in the
    source table.
    """

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in foreignKey config '{config}'.")
    def fromConfig(cls, config: Config) -> ForeignKeySpec:
        """Create a `ForeignKeySpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the constraint.  Nested configuration
            keys correspond to `ForeignKeySpec` attributes.

        Returns
        -------
        spec : `ForeignKeySpec`
            Specification structure for the constraint.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        return cls(
            table=config["table"],
            source=tuple(ensure_iterable(config["source"])),
            target=tuple(ensure_iterable(config["target"])),
            onDelete=config.get("onDelete", None),
        )
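
# Illustrative sketch (not part of the original module): scalar "source" and
# "target" config values are normalized to 1-tuples via ensure_iterable:
#
#     >>> fk = ForeignKeySpec.fromConfig(
#     ...     Config({"table": "instrument", "source": "instrument", "target": "name"})
#     ... )
#     >>> fk.source, fk.target
#     (('instrument',), ('name',))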


@dataclass(frozen=True)
class IndexSpec:
    """Specification of an index on table columns.

    Parameters
    ----------
    *columns : `str`
        Names of the columns to index.
    **kwargs : `Any`
        Additional keyword arguments to pass directly to the
        `sqlalchemy.schema.Index` constructor.  This could be used to
        provide backend-specific options, e.g. to create a ``GIST`` index
        in PostgreSQL one can pass ``postgresql_using="gist"``.
    """

    def __init__(self, *columns: str, **kwargs: Any):
        object.__setattr__(self, "columns", tuple(columns))
        object.__setattr__(self, "kwargs", kwargs)

    def __hash__(self) -> int:
        return hash(self.columns)

    columns: Tuple[str, ...]
    """Column names to include in the index (`Tuple` [ `str` ])."""

    kwargs: dict[str, Any]
    """Additional keyword arguments passed directly to the
    `sqlalchemy.schema.Index` constructor (`dict` [ `str`, `Any` ]).
    """


@dataclass
class TableSpec:
    """A data class used to define a table or table-like query interface.

    Parameters
    ----------
    fields : `Iterable` [ `FieldSpec` ]
        Specifications for the columns in this table.
    unique : `Iterable` [ `tuple` [ `str` ] ], optional
        Non-primary-key unique constraints for the table.
    indexes : `Iterable` [ `IndexSpec` ], optional
        Indexes for the table.
    foreignKeys : `Iterable` [ `ForeignKeySpec` ], optional
        Foreign key constraints for the table.
    exclusion : `Iterable` [ `tuple` [ `str` or `type` ] ]
        Special constraints that prohibit overlaps between timespans over
        rows where other columns are equal.  These take the same form as
        unique constraints, but each tuple may contain a single
        `TimespanDatabaseRepresentation` subclass representing a timespan
        column.
    recycleIds : `bool`, optional
        If `True`, allow databases that might normally recycle autoincrement
        IDs to do so (usually better for performance) on any autoincrement
        field in this table.
    doc : `str`, optional
        Documentation for the table.
    """

    def __init__(
        self,
        fields: Iterable[FieldSpec],
        *,
        unique: Iterable[Tuple[str, ...]] = (),
        indexes: Iterable[IndexSpec] = (),
        foreignKeys: Iterable[ForeignKeySpec] = (),
        exclusion: Iterable[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]] = (),
        recycleIds: bool = True,
        doc: Optional[str] = None,
    ):
        self.fields = NamedValueSet(fields)
        self.unique = set(unique)
        self.indexes = set(indexes)
        self.foreignKeys = list(foreignKeys)
        self.exclusion = set(exclusion)
        self.recycleIds = recycleIds
        self.doc = doc

    fields: NamedValueSet[FieldSpec]
    """Specifications for the columns in this table."""

    unique: Set[Tuple[str, ...]]
    """Non-primary-key unique constraints for the table."""

    indexes: Set[IndexSpec]
    """Indexes for the table."""

    foreignKeys: List[ForeignKeySpec]
    """Foreign key constraints for the table."""

    exclusion: Set[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]]
    """Exclusion constraints for the table.

    Exclusion constraints behave mostly like unique constraints, but may
    contain a database-native Timespan column that is restricted to not
    overlap across rows (for identical combinations of any non-Timespan
    columns in the constraint).
    """

    recycleIds: bool = True
    """If `True`, allow databases that might normally recycle autoincrement
    IDs to do so (usually better for performance) on any autoincrement field
    in this table.
    """

    doc: Optional[str] = None
    """Documentation for the table."""

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in table config '{config}'.")
    def fromConfig(cls, config: Config) -> TableSpec:
        """Create a `TableSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the table.  Nested configuration keys
            correspond to `TableSpec` attributes.

        Returns
        -------
        spec : `TableSpec`
            Specification structure for the table.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        return cls(
            fields=NamedValueSet(FieldSpec.fromConfig(c) for c in config["columns"]),
            unique={tuple(u) for u in config.get("unique", ())},
            foreignKeys=[ForeignKeySpec.fromConfig(c) for c in config.get("foreignKeys", ())],
            doc=stripIfNotNone(config.get("doc")),
        )
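

# Illustrative sketch (not part of the original module): assembling a small
# TableSpec in Python rather than from configuration.  The table and column
# names here are hypothetical:
#
#     >>> spec = TableSpec(
#     ...     fields=[
#     ...         FieldSpec("id", dtype=sqlalchemy.BigInteger, primaryKey=True, autoincrement=True),
#     ...         FieldSpec("name", dtype=sqlalchemy.String, length=64, nullable=False),
#     ...     ],
#     ...     unique={("name",)},
#     ...     indexes={IndexSpec("name")},
#     ... )
#     >>> [f.name for f in spec.fields]
#     ['id', 'name']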