Coverage for python/lsst/daf/butler/ddl.py: 55%
233 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-05 10:00 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-05 10:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27"""Classes for representing SQL data-definition language (DDL) in Python.
This includes "CREATE TABLE" etc.
31This provides an extra layer on top of SQLAlchemy's classes for these concepts,
32because we need a level of indirection between logical tables and the actual
33SQL, and SQLAlchemy's DDL classes always map 1-1 to SQL.
35We've opted for the rather more obscure "ddl" as the name of this module
36instead of "schema" because the latter is too overloaded; in most SQL
37databases, a "schema" is also another term for a namespace.
38"""
39from __future__ import annotations
41__all__ = (
42 "TableSpec",
43 "FieldSpec",
44 "ForeignKeySpec",
45 "IndexSpec",
46 "Base64Bytes",
47 "Base64Region",
48 "AstropyTimeNsecTai",
49 "GUID",
50)
52import functools
53import logging
54import uuid
55from base64 import b64decode, b64encode
56from collections.abc import Callable, Iterable
57from dataclasses import dataclass
58from math import ceil
59from typing import TYPE_CHECKING, Any
61import astropy.time
62import sqlalchemy
63from lsst.sphgeom import Region, UnionRegion
64from lsst.utils.iteration import ensure_iterable
65from sqlalchemy.dialects.postgresql import UUID
67from . import time_utils
68from ._config import Config
69from ._exceptions import ValidationError
70from ._named import NamedValueSet
71from .utils import stripIfNotNone
73if TYPE_CHECKING:
74 from .timespan_database_representation import TimespanDatabaseRepresentation
77_LOG = logging.getLogger(__name__)
class SchemaValidationError(ValidationError):
    """Exceptions that indicate problems in Registry schema configuration."""

    @classmethod
    def translate(cls, caught: type[Exception], message: str) -> Callable:
        """Return decorator to re-raise exceptions as `SchemaValidationError`.

        Decorated functions must be class or instance methods, with a
        ``config`` parameter as their first argument.  This will be passed
        to ``message.format()`` as a keyword argument, along with ``err``,
        the original exception.

        Parameters
        ----------
        caught : `type` (`Exception` subclass)
            The type of exception to catch.
        message : `str`
            A `str.format` string that may contain named placeholders for
            ``config``, ``err``, or any keyword-only argument accepted by
            the decorated function.
        """

        def decorate(func: Callable) -> Callable:
            # Preserve the wrapped function's name, docstring, and other
            # metadata so introspection and generated docs still work.
            @functools.wraps(func)
            def decorated(self: Any, config: Config, *args: Any, **kwargs: Any) -> Any:
                try:
                    return func(self, config, *args, **kwargs)
                except caught as err:
                    # Re-raise as SchemaValidationError, chaining the original
                    # exception and formatting the message with its context.
                    raise cls(message.format(config=str(config), err=err)) from err

            return decorated

        return decorate
class Base64Bytes(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `bytes`.

    Maps Python `bytes` to a base64-encoded `sqlalchemy.Text` field.

    Parameters
    ----------
    nbytes : `int` or `None`, optional
        Number of bytes.
    *args : `typing.Any`
        Parameters passed to base class constructor.
    **kwargs : `typing.Any`
        Keyword parameters passed to base class constructor.
    """

    impl = sqlalchemy.Text

    cache_ok = True

    def __init__(self, nbytes: int | None = None, *args: Any, **kwargs: Any):
        # Base64 turns every 3 bytes into 4 characters, so size the column
        # accordingly — but only when the implementation type is a sized
        # String; plain Text takes no length.
        length: int | None = None
        if nbytes is not None and self.impl is sqlalchemy.String:
            length = 4 * ceil(nbytes / 3)
        super().__init__(*args, length=length, **kwargs)
        self.nbytes = nbytes

    def process_bind_param(self, value: bytes | None, dialect: sqlalchemy.engine.Dialect) -> str | None:
        # Encode native `bytes` as base64, then decode to an ASCII `str`,
        # because SQLAlchemy expects `str` for String/Text columns.
        if value is None:
            return None
        if not isinstance(value, bytes):
            raise TypeError(
                f"Base64Bytes fields require 'bytes' values; got '{value}' with type {type(value)}."
            )
        return b64encode(value).decode("ascii")

    def process_result_value(self, value: str | None, dialect: sqlalchemy.engine.Dialect) -> bytes | None:
        # Inverse of process_bind_param: the stored `str` is ASCII (it is
        # base64), so re-encode it and decode the base64 to native `bytes`.
        if value is None:
            return None
        return b64decode(value.encode("ascii"))

    @property
    def python_type(self) -> type[bytes]:
        # The Python-side type this column maps to.
        return bytes
# Create an alias, used below to disambiguate between this class and the
# built-in sqlalchemy type when both names are in play.
LocalBase64Bytes = Base64Bytes
class Base64Region(Base64Bytes):
    """A SQLAlchemy custom type for Python `lsst.sphgeom.Region`.

    Maps Python `sphgeom.Region` to a base64-encoded `sqlalchemy.String`.
    A stored value may contain several base64-encoded regions separated by
    ``:`` (see `union_aggregate`); reading such a value yields the union of
    those regions.
    """

    cache_ok = True  # have to be set explicitly in each class

    def process_bind_param(self, value: Region | None, dialect: sqlalchemy.engine.Dialect) -> str | None:
        # Serialize the region to bytes, then let the base class base64-encode.
        if value is None:
            return None
        return super().process_bind_param(value.encode(), dialect)

    def process_result_value(self, value: str | None, dialect: sqlalchemy.engine.Dialect) -> Region | None:
        if value is None:
            return None
        # ":" is safe as a separator because it is not in the base64 alphabet.
        # Note that reduce() with a single-element list returns that element
        # unchanged, so a value with no ":" decodes to a plain Region, not a
        # UnionRegion.
        return functools.reduce(
            UnionRegion,
            [
                # Zero-argument super() does not work inside a comprehension
                # (the comprehension's own scope lacks the __class__ cell),
                # so call the base class explicitly.
                Region.decode(Base64Bytes.process_result_value(self, union_member, dialect))
                for union_member in value.split(":")
            ],
        )

    @property
    def python_type(self) -> type[Region]:
        # The Python-side type this column maps to.
        return Region

    @classmethod
    def union_aggregate(
        cls, column: sqlalchemy.ColumnElement[Base64Region]
    ) -> sqlalchemy.ColumnElement[Base64Region]:
        """Return a SQLAlchemy aggregate expression that computes the union of
        a set of unions.

        Parameters
        ----------
        column : `sqlalchemy.ColumnElement`
            SQLAlchemy column expression representing the regions to be
            combined.

        Returns
        -------
        union_column : `sqlalchemy.ColumnElement`
            SQLAlchemy column expression representing the union.
        """
        # Concatenate the encoded regions with ":" in the database; the
        # ":"-separated result is decoded as a union by process_result_value.
        return sqlalchemy.cast(sqlalchemy.func.aggregate_strings(column, ":"), type_=Base64Region)
class AstropyTimeNsecTai(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `astropy.time.Time`.

    Maps Python `astropy.time.Time` to a number of nanoseconds since Unix
    epoch in TAI scale.
    """

    impl = sqlalchemy.BigInteger

    cache_ok = True

    def process_bind_param(
        self, value: astropy.time.Time | None, dialect: sqlalchemy.engine.Dialect
    ) -> int | None:
        # Convert an astropy Time into integer nanoseconds since epoch (TAI).
        if value is None:
            return None
        if not isinstance(value, astropy.time.Time):
            raise TypeError(f"Unsupported type: {type(value)}, expected astropy.time.Time")
        return time_utils.TimeConverter().astropy_to_nsec(value)

    def process_result_value(
        self, value: int | None, dialect: sqlalchemy.engine.Dialect
    ) -> astropy.time.Time | None:
        # value is nanoseconds since epoch, or None; convert back to a Time.
        if value is None:
            return None
        return time_utils.TimeConverter().nsec_to_astropy(value)
# TODO: sqlalchemy 2 has internal support for UUID:
# https://docs.sqlalchemy.org/en/20/core/type_basics.html#sqlalchemy.types.Uuid
class GUID(sqlalchemy.TypeDecorator):
    """Platform-independent GUID type.

    Uses PostgreSQL's UUID type, otherwise uses CHAR(32), storing as
    stringified hex values.
    """

    impl = sqlalchemy.CHAR

    cache_ok = True

    def load_dialect_impl(self, dialect: sqlalchemy.Dialect) -> sqlalchemy.types.TypeEngine:
        # Native UUID column on PostgreSQL; fixed-width hex CHAR elsewhere.
        if dialect.name == "postgresql":
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(sqlalchemy.CHAR(32))

    def process_bind_param(self, value: Any, dialect: sqlalchemy.Dialect) -> str | None:
        if value is None:
            return None

        # Coerce input to UUID type; in general having UUID on input is the
        # only thing that we want, but there is code right now that uses
        # ints (and bytes / hex strings) as well.
        if isinstance(value, uuid.UUID):
            guid = value
        elif isinstance(value, int):
            guid = uuid.UUID(int=value)
        elif isinstance(value, bytes):
            guid = uuid.UUID(bytes=value)
        elif isinstance(value, str):
            # hexstring
            guid = uuid.UUID(hex=value)
        else:
            raise TypeError(f"Unexpected type of a bind value: {type(value)}")

        # PostgreSQL stores the canonical string form; other backends store
        # 32 lowercase hex digits in the CHAR(32) column.
        return str(guid) if dialect.name == "postgresql" else "%.32x" % guid.int

    def process_result_value(
        self, value: str | uuid.UUID | None, dialect: sqlalchemy.Dialect
    ) -> uuid.UUID | None:
        # sqlalchemy 2 may already hand us a UUID instance.
        if value is None or isinstance(value, uuid.UUID):
            return value
        return uuid.UUID(hex=value)
# Mapping from the "type" strings accepted in schema configuration files to
# the SQLAlchemy type (or custom TypeDecorator defined above) that implements
# each of them; used by FieldSpec.fromConfig.
VALID_CONFIG_COLUMN_TYPES = {
    "string": sqlalchemy.String,
    "int": sqlalchemy.BigInteger,
    "float": sqlalchemy.Float,
    "region": Base64Region,
    "bool": sqlalchemy.Boolean,
    "blob": sqlalchemy.LargeBinary,
    "datetime": AstropyTimeNsecTai,
    "hash": Base64Bytes,
    "uuid": GUID,
}
@dataclass
class FieldSpec:
    """A data class for defining a column in a logical `Registry` table."""

    name: str
    """Name of the column."""

    dtype: type
    """Type of the column; usually a `type` subclass provided by SQLAlchemy
    that defines both a Python type and a corresponding precise SQL type.
    """

    length: int | None = None
    """Length of the type in the database, for variable-length types."""

    nbytes: int | None = None
    """Natural length used for hash and encoded-region columns, to be converted
    into the post-encoding length.
    """

    primaryKey: bool = False
    """Whether this field is (part of) its table's primary key."""

    autoincrement: bool = False
    """Whether the database should insert automatically incremented values when
    no value is provided in an INSERT.
    """

    nullable: bool = True
    """Whether this field is allowed to be NULL. If ``primaryKey`` is
    `True`, during construction this value will be forced to `False`."""

    default: Any = None
    """A server-side default value for this field.

    This is passed directly as the ``server_default`` argument to
    `sqlalchemy.schema.Column`.  It does _not_ go through SQLAlchemy's usual
    type conversion or quoting for Python literals, and should hence be used
    with care.  See the SQLAlchemy documentation for more information.
    """

    doc: str | None = None
    """Documentation for this field."""

    def __post_init__(self) -> None:
        if self.primaryKey:
            # Change the default to match primaryKey.
            self.nullable = False

    def __eq__(self, other: Any) -> bool:
        # Equality (and hashing, below) is deliberately by column name only,
        # so specs can be looked up by name in name-keyed containers.
        if isinstance(other, FieldSpec):
            return self.name == other.name
        else:
            return NotImplemented

    def __hash__(self) -> int:
        return hash(self.name)

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in column config '{config}'.")
    def fromConfig(cls, config: Config, **kwargs: Any) -> FieldSpec:
        """Create a `FieldSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the column.  Nested configuration keys
            correspond to `FieldSpec` attributes.
        **kwargs
            Additional keyword arguments that provide defaults for values
            not present in config.

        Returns
        -------
        spec: `FieldSpec`
            Specification structure for the column.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        dtype = VALID_CONFIG_COLUMN_TYPES.get(config["type"])
        if dtype is None:
            raise SchemaValidationError(f"Invalid field type string: '{config['type']}'.")
        if not config["name"].islower():
            raise SchemaValidationError(f"Column name '{config['name']}' is not all lowercase.")
        self = cls(name=config["name"], dtype=dtype, **kwargs)
        self.length = config.get("length", self.length)
        self.nbytes = config.get("nbytes", self.nbytes)
        # length and nbytes are alternative ways to size a column, so at most
        # one may be given.
        if self.length is not None and self.nbytes is not None:
            raise SchemaValidationError(f"Both length and nbytes provided for field '{self.name}'.")
        self.primaryKey = config.get("primaryKey", self.primaryKey)
        self.autoincrement = config.get("autoincrement", self.autoincrement)
        self.nullable = config.get("nullable", False if self.primaryKey else self.nullable)
        self.doc = stripIfNotNone(config.get("doc", None))
        return self

    @classmethod
    def for_region(cls, name: str = "region", nullable: bool = True, nbytes: int = 2048) -> FieldSpec:
        """Create a `FieldSpec` for a spatial region column.

        Parameters
        ----------
        name : `str`, optional
            Name for the field.
        nullable : `bool`, optional
            Whether NULL values are permitted.
        nbytes : `int`, optional
            Maximum number of bytes for serialized regions.  The actual column
            size will be larger to allow for base-64 encoding.

        Returns
        -------
        spec : `FieldSpec`
            Specification structure for a region column.
        """
        return cls(name, nullable=nullable, dtype=Base64Region, nbytes=nbytes)

    def isStringType(self) -> bool:
        """Indicate that this is a sqlalchemy.String field spec.

        Returns
        -------
        isString : `bool`
            The field refers to a `sqlalchemy.String` and not any other type.
            This can return `False` even if the object was created with a
            string type if it has been decided that it should be implemented
            as a `sqlalchemy.Text` type.
        """
        # For short strings retain them as strings
        if self.dtype is sqlalchemy.String and self.length and self.length <= 32:
            return True
        return False

    def getSizedColumnType(self) -> sqlalchemy.types.TypeEngine | type:
        """Return a sized version of the column type.

        Utilizes either (or neither) of ``self.length`` and ``self.nbytes``.

        Returns
        -------
        dtype : `sqlalchemy.types.TypeEngine`
            A SQLAlchemy column type object.
        """
        if self.length is not None:
            # Last chance check that we are only looking at possible String:
            # long strings are implemented as unsized Text instead.
            if self.dtype is sqlalchemy.String and not self.isStringType():
                return sqlalchemy.Text
            return self.dtype(length=self.length)
        if self.nbytes is not None:
            return self.dtype(nbytes=self.nbytes)
        # Neither size attribute set: return the bare (unsized) type.
        return self.dtype

    def getPythonType(self) -> type:
        """Return the Python type associated with this field's (SQL) dtype.

        Returns
        -------
        type : `type`
            Python type associated with this field's (SQL) `dtype`.
        """
        # to construct these objects, nbytes keyword is needed
        if issubclass(self.dtype, LocalBase64Bytes):
            # satisfy mypy for something that must be true
            assert self.nbytes is not None
            return self.dtype(nbytes=self.nbytes).python_type
        else:
            return self.dtype().python_type  # type: ignore
@dataclass
class ForeignKeySpec:
    """Definition of a foreign key constraint in a logical `Registry` table."""

    table: str
    """Name of the target table."""

    source: tuple[str, ...]
    """Tuple of source table column names."""

    target: tuple[str, ...]
    """Tuple of target table column names."""

    onDelete: str | None = None
    """SQL clause indicating how to handle deletes to the target table.

    If not `None` (which indicates that a constraint violation exception should
    be raised), should be either "SET NULL" or "CASCADE".
    """

    addIndex: bool = True
    """If `True`, create an index on the columns of this foreign key in the
    source table.
    """

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in foreignKey config '{config}'.")
    def fromConfig(cls, config: Config) -> ForeignKeySpec:
        """Create a `ForeignKeySpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the constraint.  Nested configuration keys
            correspond to `ForeignKeySpec` attributes.

        Returns
        -------
        spec: `ForeignKeySpec`
            Specification structure for the constraint.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # Single-string "source"/"target" values are accepted and wrapped
        # into one-element tuples via ensure_iterable.
        # NOTE(review): ``addIndex`` is never read from config here, so it
        # always keeps its default (True) — confirm that is intended.
        return cls(
            table=config["table"],
            source=tuple(ensure_iterable(config["source"])),
            target=tuple(ensure_iterable(config["target"])),
            onDelete=config.get("onDelete", None),
        )
@dataclass(frozen=True)
class IndexSpec:
    """Specification of an index on table columns.

    Parameters
    ----------
    *columns : `str`
        Names of the columns to index.
    **kwargs : `Any`
        Additional keyword arguments to pass directly to
        `sqlalchemy.schema.Index` constructor.  This could be used to provide
        backend-specific options, e.g. to create a ``GIST`` index in PostgreSQL
        one can pass ``postgresql_using="gist"``.
    """

    def __init__(self, *columns: str, **kwargs: Any):
        # The dataclass is frozen, so this hand-written variadic constructor
        # must assign attributes via object.__setattr__.
        object.__setattr__(self, "columns", tuple(columns))
        object.__setattr__(self, "kwargs", kwargs)

    def __hash__(self) -> int:
        # Hash only on the column tuple; ``kwargs`` is a dict and unhashable.
        return hash(self.columns)

    columns: tuple[str, ...]
    """Column names to include in the index (`Tuple` [ `str` ])."""

    kwargs: dict[str, Any]
    """Additional keyword arguments passed directly to
    `sqlalchemy.schema.Index` constructor (`dict` [ `str`, `Any` ]).
    """
@dataclass
class TableSpec:
    """A data class used to define a table or table-like query interface.

    Parameters
    ----------
    fields : `~collections.abc.Iterable` [ `FieldSpec` ]
        Specifications for the columns in this table.
    unique : `~collections.abc.Iterable` [ `tuple` [ `str` ] ], optional
        Non-primary-key unique constraints for the table.
    indexes : `~collections.abc.Iterable` [ `IndexSpec` ], optional
        Indexes for the table.
    foreignKeys : `~collections.abc.Iterable` [ `ForeignKeySpec` ], optional
        Foreign key constraints for the table.
    exclusion : `~collections.abc.Iterable` [ `tuple` [ `str` or `type` ] ]
        Special constraints that prohibit overlaps between timespans over rows
        where other columns are equal.  These take the same form as unique
        constraints, but each tuple may contain a single
        `TimespanDatabaseRepresentation` subclass representing a timespan
        column.
    recycleIds : `bool`, optional
        If `True`, allow databases that might normally recycle autoincrement
        IDs to do so (usually better for performance) on any autoincrement
        field in this table.
    doc : `str`, optional
        Documentation for the table.
    """

    def __init__(
        self,
        fields: Iterable[FieldSpec],
        *,
        unique: Iterable[tuple[str, ...]] = (),
        indexes: Iterable[IndexSpec] = (),
        foreignKeys: Iterable[ForeignKeySpec] = (),
        exclusion: Iterable[tuple[str | type[TimespanDatabaseRepresentation], ...]] = (),
        recycleIds: bool = True,
        doc: str | None = None,
    ):
        # @dataclass does not replace an __init__ defined in the class body,
        # so this hand-written constructor — which normalizes the iterable
        # arguments into concrete containers — is the one actually used.
        self.fields = NamedValueSet(fields)
        self.unique = set(unique)
        self.indexes = set(indexes)
        self.foreignKeys = list(foreignKeys)
        self.exclusion = set(exclusion)
        self.recycleIds = recycleIds
        self.doc = doc

    fields: NamedValueSet[FieldSpec]
    """Specifications for the columns in this table."""

    unique: set[tuple[str, ...]]
    """Non-primary-key unique constraints for the table."""

    indexes: set[IndexSpec]
    """Indexes for the table."""

    foreignKeys: list[ForeignKeySpec]
    """Foreign key constraints for the table."""

    exclusion: set[tuple[str | type[TimespanDatabaseRepresentation], ...]]
    """Exclusion constraints for the table.

    Exclusion constraints behave mostly like unique constraints, but may
    contain a database-native Timespan column that is restricted to not overlap
    across rows (for identical combinations of any non-Timespan columns in the
    constraint).
    """

    recycleIds: bool = True
    """If `True`, allow databases that might normally recycle autoincrement IDs
    to do so (usually better for performance) on any autoincrement field in
    this table.
    """

    doc: str | None = None
    """Documentation for the table."""

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in table config '{config}'.")
    def fromConfig(cls, config: Config) -> TableSpec:
        """Create a `TableSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the table.  Nested configuration keys
            correspond to `TableSpec` attributes.

        Returns
        -------
        spec: `TableSpec`
            Specification structure for the table.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # NOTE(review): only columns/unique/foreignKeys/doc are read from
        # config; indexes and exclusion constraints cannot be configured here.
        return cls(
            fields=NamedValueSet(FieldSpec.fromConfig(c) for c in config["columns"]),
            unique={tuple(u) for u in config.get("unique", ())},
            foreignKeys=[ForeignKeySpec.fromConfig(c) for c in config.get("foreignKeys", ())],
            doc=stripIfNotNone(config.get("doc")),
        )