Coverage for python/lsst/daf/butler/core/ddl.py: 55%
230 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-08-05 01:26 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-08-05 01:26 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""Classes for representing SQL data-definition language (DDL) in Python.
23This includes "CREATE TABLE" etc.
25This provides an extra layer on top of SQLAlchemy's classes for these concepts,
26because we need a level of indirection between logical tables and the actual
27SQL, and SQLAlchemy's DDL classes always map 1-1 to SQL.
29We've opted for the rather more obscure "ddl" as the name of this module
30instead of "schema" because the latter is too overloaded; in most SQL
31databases, a "schema" is also another term for a namespace.
32"""
33from __future__ import annotations
35from lsst import sphgeom
37__all__ = (
38 "TableSpec",
39 "FieldSpec",
40 "ForeignKeySpec",
41 "IndexSpec",
42 "Base64Bytes",
43 "Base64Region",
44 "AstropyTimeNsecTai",
45 "GUID",
46)
import functools
import logging
import uuid
from base64 import b64decode, b64encode
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from math import ceil
from typing import TYPE_CHECKING, Any

import astropy.time
import sqlalchemy
from lsst.sphgeom import Region
from lsst.utils.iteration import ensure_iterable
from sqlalchemy.dialects.postgresql import UUID

from . import time_utils
from .config import Config
from .exceptions import ValidationError
from .named import NamedValueSet
from .utils import stripIfNotNone
68if TYPE_CHECKING:
69 from .timespan import TimespanDatabaseRepresentation
72_LOG = logging.getLogger(__name__)
class SchemaValidationError(ValidationError):
    """Exceptions that indicate problems in Registry schema configuration."""

    @classmethod
    def translate(cls, caught: type[Exception], message: str) -> Callable:
        """Return decorator to re-raise exceptions as `SchemaValidationError`.

        Decorated functions must be class or instance methods, with a
        ``config`` parameter as their first argument.  This will be passed
        to ``message.format()`` as a keyword argument, along with ``err``,
        the original exception.

        Parameters
        ----------
        caught : `type` (`Exception` subclass)
            The type of exception to catch.
        message : `str`
            A `str.format` string that may contain named placeholders for
            ``config``, ``err``, or any keyword-only argument accepted by
            the decorated function.

        Returns
        -------
        decorator : `~collections.abc.Callable`
            Decorator that wraps a method so any ``caught`` exception it
            raises is chained into a `SchemaValidationError`.
        """

        def decorate(func: Callable) -> Callable:
            # Preserve the wrapped method's __name__/__doc__ so decorated
            # methods remain introspectable (e.g. in docs and tracebacks).
            @functools.wraps(func)
            def decorated(self: Any, config: Config, *args: Any, **kwargs: Any) -> Any:
                try:
                    return func(self, config, *args, **kwargs)
                except caught as err:
                    # Chain the original exception so the root cause is kept.
                    raise cls(message.format(config=str(config), err=err)) from err

            return decorated

        return decorate
class Base64Bytes(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `bytes`.

    Maps Python `bytes` to a base64-encoded `sqlalchemy.Text` field.
    """

    impl = sqlalchemy.Text

    cache_ok = True

    def __init__(self, nbytes: int | None = None, *args: Any, **kwargs: Any):
        # Base64 inflates the payload: every 3 raw bytes become 4 text
        # characters.  A length is only meaningful if the implementation
        # type is String (e.g. in a subclass); Text takes no length.
        length: int | None = None
        if nbytes is not None and self.impl is sqlalchemy.String:
            length = 4 * ceil(nbytes / 3)
        super().__init__(*args, length=length, **kwargs)
        self.nbytes = nbytes

    def process_bind_param(self, value: bytes | None, dialect: sqlalchemy.engine.Dialect) -> str | None:
        # Store native `bytes` as an ASCII `str` holding its base64 form,
        # because SQLAlchemy expects `str` for string-typed columns.
        if value is None:
            return None
        if not isinstance(value, bytes):
            raise TypeError(
                f"Base64Bytes fields require 'bytes' values; got '{value}' with type {type(value)}."
            )
        return b64encode(value).decode("ascii")

    def process_result_value(self, value: str | None, dialect: sqlalchemy.engine.Dialect) -> bytes | None:
        # Inverse of process_bind_param: ASCII base64 `str` back to `bytes`.
        if value is None:
            return None
        return b64decode(value.encode("ascii"))

    @property
    def python_type(self) -> type[bytes]:
        return bytes
# Alias for `Base64Bytes`, used further below to disambiguate our custom type
# from the built-in SQLAlchemy types when both appear in the same expression.
LocalBase64Bytes = Base64Bytes
class Base64Region(Base64Bytes):
    """A SQLAlchemy custom type for Python `sphgeom.Region`.

    Maps Python `sphgeom.Region` to a base64-encoded `sqlalchemy.String`.
    """

    cache_ok = True  # must be set explicitly in each TypeDecorator subclass

    def process_bind_param(self, value: Region | None, dialect: sqlalchemy.engine.Dialect) -> str | None:
        # Serialize the region to raw bytes, then delegate the base64 step
        # to the base class.
        if value is None:
            return None
        return super().process_bind_param(value.encode(), dialect)

    def process_result_value(self, value: str | None, dialect: sqlalchemy.engine.Dialect) -> Region | None:
        # Base class turns the base64 text back into bytes (None stays
        # None); those bytes are then decoded into a Region.
        decoded = super().process_result_value(value, dialect)
        return None if decoded is None else Region.decode(decoded)

    @property
    def python_type(self) -> type[sphgeom.Region]:
        return sphgeom.Region
class AstropyTimeNsecTai(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `astropy.time.Time`.

    Maps Python `astropy.time.Time` to a number of nanoseconds since Unix
    epoch in TAI scale.
    """

    impl = sqlalchemy.BigInteger

    cache_ok = True

    def process_bind_param(
        self, value: astropy.time.Time | None, dialect: sqlalchemy.engine.Dialect
    ) -> int | None:
        # Convert an astropy Time to an integer count of TAI nanoseconds
        # for storage; only astropy Time instances (or None) are accepted.
        if value is None:
            return None
        if not isinstance(value, astropy.time.Time):
            raise TypeError(f"Unsupported type: {type(value)}, expected astropy.time.Time")
        return time_utils.TimeConverter().astropy_to_nsec(value)

    def process_result_value(
        self, value: int | None, dialect: sqlalchemy.engine.Dialect
    ) -> astropy.time.Time | None:
        # Stored value is nanoseconds since epoch (or None, which
        # round-trips unchanged).
        if value is None:
            return None
        return time_utils.TimeConverter().nsec_to_astropy(value)
# TODO: sqlalchemy 2 has internal support for UUID:
# https://docs.sqlalchemy.org/en/20/core/type_basics.html#sqlalchemy.types.Uuid
class GUID(sqlalchemy.TypeDecorator):
    """Platform-independent GUID type.

    Uses PostgreSQL's UUID type, otherwise uses CHAR(32), storing as
    stringified hex values.
    """

    impl = sqlalchemy.CHAR

    cache_ok = True

    def load_dialect_impl(self, dialect: sqlalchemy.Dialect) -> sqlalchemy.types.TypeEngine:
        # PostgreSQL has a native UUID column type; every other backend
        # falls back to a fixed-width hex string column.
        if dialect.name == "postgresql":
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(sqlalchemy.CHAR(32))

    def process_bind_param(self, value: Any, dialect: sqlalchemy.Dialect) -> str | None:
        if value is None:
            return None

        # Coerce input to UUID type, in general having UUID on input is the
        # only thing that we want but there is code right now that uses ints.
        if isinstance(value, int):
            value = uuid.UUID(int=value)
        elif isinstance(value, bytes):
            value = uuid.UUID(bytes=value)
        elif isinstance(value, str):
            # hexstring
            value = uuid.UUID(hex=value)
        elif not isinstance(value, uuid.UUID):
            raise TypeError(f"Unexpected type of a bind value: {type(value)}")

        # Native UUID columns take the canonical string form; CHAR(32)
        # columns take the zero-padded 32-character hex form.
        return str(value) if dialect.name == "postgresql" else "%.32x" % value.int

    def process_result_value(
        self, value: str | uuid.UUID | None, dialect: sqlalchemy.Dialect
    ) -> uuid.UUID | None:
        # sqlalchemy 2 converts to UUID internally
        if value is None or isinstance(value, uuid.UUID):
            return value
        return uuid.UUID(hex=value)
# Mapping from the column "type" strings accepted in schema configuration
# files to the SQLAlchemy type (or custom TypeDecorator above) that
# implements them; consumed by FieldSpec.fromConfig below.
VALID_CONFIG_COLUMN_TYPES = {
    "string": sqlalchemy.String,
    "int": sqlalchemy.BigInteger,
    "float": sqlalchemy.Float,
    "region": Base64Region,
    "bool": sqlalchemy.Boolean,
    "blob": sqlalchemy.LargeBinary,
    "datetime": AstropyTimeNsecTai,
    "hash": Base64Bytes,
    "uuid": GUID,
}
@dataclass
class FieldSpec:
    """A data class for defining a column in a logical `Registry` table."""

    name: str
    """Name of the column."""

    dtype: type
    """Type of the column; usually a `type` subclass provided by SQLAlchemy
    that defines both a Python type and a corresponding precise SQL type.
    """

    length: int | None = None
    """Length of the type in the database, for variable-length types."""

    nbytes: int | None = None
    """Natural length used for hash and encoded-region columns, to be converted
    into the post-encoding length.
    """

    primaryKey: bool = False
    """Whether this field is (part of) its table's primary key."""

    autoincrement: bool = False
    """Whether the database should insert automatically incremented values when
    no value is provided in an INSERT.
    """

    nullable: bool = True
    """Whether this field is allowed to be NULL. If ``primaryKey`` is
    `True`, during construction this value will be forced to `False`."""

    default: Any = None
    """A server-side default value for this field.

    This is passed directly as the ``server_default`` argument to
    `sqlalchemy.schema.Column`.  It does _not_ go through SQLAlchemy's usual
    type conversion or quoting for Python literals, and should hence be used
    with care.  See the SQLAlchemy documentation for more information.
    """

    doc: str | None = None
    """Documentation for this field."""

    def __post_init__(self) -> None:
        # A primary-key column can never be NULL, so force the flag
        # regardless of what the caller passed.
        if self.primaryKey:
            self.nullable = False

    def __eq__(self, other: Any) -> bool:
        # Within a table, a column's identity is its name alone.
        return self.name == other.name if isinstance(other, FieldSpec) else NotImplemented

    def __hash__(self) -> int:
        return hash(self.name)

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in column config '{config}'.")
    def fromConfig(cls, config: Config, **kwargs: Any) -> FieldSpec:
        """Create a `FieldSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the column.  Nested configuration keys
            correspond to `FieldSpec` attributes.
        **kwargs
            Additional keyword arguments that provide defaults for values
            not present in config.

        Returns
        -------
        spec : `FieldSpec`
            Specification structure for the column.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        dtype = VALID_CONFIG_COLUMN_TYPES.get(config["type"])
        if dtype is None:
            raise SchemaValidationError(f"Invalid field type string: '{config['type']}'.")
        if not config["name"].islower():
            raise SchemaValidationError(f"Column name '{config['name']}' is not all lowercase.")
        # Build with the caller's defaults, then let explicit config keys
        # override them one by one.
        self = cls(name=config["name"], dtype=dtype, **kwargs)
        self.length = config.get("length", self.length)
        self.nbytes = config.get("nbytes", self.nbytes)
        if self.length is not None and self.nbytes is not None:
            raise SchemaValidationError(f"Both length and nbytes provided for field '{self.name}'.")
        self.primaryKey = config.get("primaryKey", self.primaryKey)
        self.autoincrement = config.get("autoincrement", self.autoincrement)
        self.nullable = config.get("nullable", False if self.primaryKey else self.nullable)
        self.doc = stripIfNotNone(config.get("doc", None))
        return self

    @classmethod
    def for_region(cls, name: str = "region", nullable: bool = True, nbytes: int = 2048) -> FieldSpec:
        """Create a `FieldSpec` for a spatial region column.

        Parameters
        ----------
        name : `str`, optional
            Name for the field.
        nullable : `bool`, optional
            Whether NULL values are permitted.
        nbytes : `int`, optional
            Maximum number of bytes for serialized regions.  The actual
            column size will be larger to allow for base-64 encoding.

        Returns
        -------
        spec : `FieldSpec`
            Specification structure for a region column.
        """
        return cls(name, dtype=Base64Region, nullable=nullable, nbytes=nbytes)

    def isStringType(self) -> bool:
        """Indicate that this is a sqlalchemy.String field spec.

        Returns
        -------
        isString : `bool`
            The field refers to a `sqlalchemy.String` and not any other type.
            This can return `False` even if the object was created with a
            string type if it has been decided that it should be implemented
            as a `sqlalchemy.Text` type.
        """
        # Only short strings (<= 32 chars) are kept as sized String columns.
        return bool(self.dtype is sqlalchemy.String and self.length and self.length <= 32)

    def getSizedColumnType(self) -> sqlalchemy.types.TypeEngine | type:
        """Return a sized version of the column type.

        Utilizes either (or neither) of ``self.length`` and ``self.nbytes``.

        Returns
        -------
        dtype : `sqlalchemy.types.TypeEngine`
            A SQLAlchemy column type object.
        """
        if self.length is not None:
            # A String that exceeded the short-string threshold is
            # implemented as an unsized Text column instead.
            if self.dtype is sqlalchemy.String and not self.isStringType():
                return sqlalchemy.Text
            return self.dtype(length=self.length)
        if self.nbytes is not None:
            return self.dtype(nbytes=self.nbytes)
        return self.dtype

    def getPythonType(self) -> type:
        """Return the Python type associated with this field's (SQL) dtype.

        Returns
        -------
        type : `type`
            Python type associated with this field's (SQL) `dtype`.
        """
        # Base64Bytes and its subclasses need the nbytes keyword in order
        # to be constructed.
        if issubclass(self.dtype, LocalBase64Bytes):
            # satisfy mypy for something that must be true
            assert self.nbytes is not None
            return self.dtype(nbytes=self.nbytes).python_type
        return self.dtype().python_type  # type: ignore
@dataclass
class ForeignKeySpec:
    """Definition of a foreign key constraint in a logical `Registry` table."""

    table: str
    """Name of the target table."""

    source: tuple[str, ...]
    """Tuple of source table column names."""

    target: tuple[str, ...]
    """Tuple of target table column names."""

    onDelete: str | None = None
    """SQL clause indicating how to handle deletes to the target table.

    If not `None` (which indicates that a constraint violation exception should
    be raised), should be either "SET NULL" or "CASCADE".
    """

    addIndex: bool = True
    """If `True`, create an index on the columns of this foreign key in the
    source table.
    """

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in foreignKey config '{config}'.")
    def fromConfig(cls, config: Config) -> ForeignKeySpec:
        """Create a `ForeignKeySpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the constraint.  Nested configuration
            keys correspond to `ForeignKeySpec` attributes.

        Returns
        -------
        spec : `ForeignKeySpec`
            Specification structure for the constraint.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # Scalar config values are promoted to one-element tuples.
        source = tuple(ensure_iterable(config["source"]))
        target = tuple(ensure_iterable(config["target"]))
        return cls(
            table=config["table"],
            source=source,
            target=target,
            onDelete=config.get("onDelete", None),
        )
@dataclass(frozen=True)
class IndexSpec:
    """Specification of an index on table columns.

    Parameters
    ----------
    *columns : `str`
        Names of the columns to index.
    **kwargs : `Any`
        Additional keyword arguments to pass directly to
        `sqlalchemy.schema.Index` constructor.  This could be used to provide
        backend-specific options, e.g. to create a ``GIST`` index in
        PostgreSQL one can pass ``postgresql_using="gist"``.
    """

    columns: tuple[str, ...]
    """Column names to include in the index (`Tuple` [ `str` ])."""

    kwargs: dict[str, Any]
    """Additional keyword arguments passed directly to
    `sqlalchemy.schema.Index` constructor (`dict` [ `str`, `Any` ]).
    """

    def __init__(self, *columns: str, **kwargs: Any):
        # The dataclass is frozen, so attributes cannot be assigned
        # directly; bypass the frozen __setattr__ instead.  This custom
        # __init__ takes precedence over the dataclass-generated one.
        object.__setattr__(self, "columns", tuple(columns))
        object.__setattr__(self, "kwargs", kwargs)

    def __hash__(self) -> int:
        # kwargs is an unhashable dict, so hash only on the column tuple;
        # equality (dataclass-generated) still compares both fields.
        return hash(self.columns)
@dataclass
class TableSpec:
    """A data class used to define a table or table-like query interface.

    Parameters
    ----------
    fields : `~collections.abc.Iterable` [ `FieldSpec` ]
        Specifications for the columns in this table.
    unique : `~collections.abc.Iterable` [ `tuple` [ `str` ] ], optional
        Non-primary-key unique constraints for the table.
    indexes : `~collections.abc.Iterable` [ `IndexSpec` ], optional
        Indexes for the table.
    foreignKeys : `~collections.abc.Iterable` [ `ForeignKeySpec` ], optional
        Foreign key constraints for the table.
    exclusion : `~collections.abc.Iterable` [ `tuple` [ `str` or `type` ] ]
        Special constraints that prohibit overlaps between timespans over rows
        where other columns are equal.  These take the same form as unique
        constraints, but each tuple may contain a single
        `TimespanDatabaseRepresentation` subclass representing a timespan
        column.
    recycleIds : `bool`, optional
        If `True`, allow databases that might normally recycle autoincrement
        IDs to do so (usually better for performance) on any autoincrement
        field in this table.
    doc : `str`, optional
        Documentation for the table.
    """

    # Custom __init__ (kept instead of the dataclass-generated one) so that
    # arbitrary iterables can be passed in and normalized to concrete
    # container types.
    def __init__(
        self,
        fields: Iterable[FieldSpec],
        *,
        unique: Iterable[tuple[str, ...]] = (),
        indexes: Iterable[IndexSpec] = (),
        foreignKeys: Iterable[ForeignKeySpec] = (),
        exclusion: Iterable[tuple[str | type[TimespanDatabaseRepresentation], ...]] = (),
        recycleIds: bool = True,
        doc: str | None = None,
    ):
        self.fields = NamedValueSet(fields)
        self.unique = set(unique)
        self.indexes = set(indexes)
        self.foreignKeys = list(foreignKeys)
        self.exclusion = set(exclusion)
        self.recycleIds = recycleIds
        self.doc = doc

    fields: NamedValueSet[FieldSpec]
    """Specifications for the columns in this table."""

    unique: set[tuple[str, ...]]
    """Non-primary-key unique constraints for the table."""

    indexes: set[IndexSpec]
    """Indexes for the table."""

    foreignKeys: list[ForeignKeySpec]
    """Foreign key constraints for the table."""

    exclusion: set[tuple[str | type[TimespanDatabaseRepresentation], ...]]
    """Exclusion constraints for the table.

    Exclusion constraints behave mostly like unique constraints, but may
    contain a database-native Timespan column that is restricted to not overlap
    across rows (for identical combinations of any non-Timespan columns in the
    constraint).
    """

    recycleIds: bool = True
    """If `True`, allow databases that might normally recycle autoincrement IDs
    to do so (usually better for performance) on any autoincrement field in
    this table.
    """

    doc: str | None = None
    """Documentation for the table."""

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in table config '{config}'.")
    def fromConfig(cls, config: Config) -> TableSpec:
        """Create a `TableSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the table.  Nested configuration keys
            correspond to `TableSpec` attributes.

        Returns
        -------
        spec : `TableSpec`
            Specification structure for the table.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # NOTE(review): "indexes" and "exclusion" are not read from config
        # here — only columns, unique, foreignKeys, and doc; presumably
        # intentional, as schema configs do not define them.
        return cls(
            fields=NamedValueSet(FieldSpec.fromConfig(c) for c in config["columns"]),
            unique={tuple(u) for u in config.get("unique", ())},
            foreignKeys=[ForeignKeySpec.fromConfig(c) for c in config.get("foreignKeys", ())],
            doc=stripIfNotNone(config.get("doc")),
        )