Coverage for python/lsst/daf/butler/core/ddl.py: 55%

230 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27"""Classes for representing SQL data-definition language (DDL) in Python. 

28 

29This includes "CREATE TABLE" etc.

30 

31This provides an extra layer on top of SQLAlchemy's classes for these concepts, 

32because we need a level of indirection between logical tables and the actual 

33SQL, and SQLAlchemy's DDL classes always map 1-1 to SQL. 

34 

35We've opted for the rather more obscure "ddl" as the name of this module 

36instead of "schema" because the latter is too overloaded; in most SQL 

37databases, a "schema" is also another term for a namespace. 

38""" 

39from __future__ import annotations 

40 

41from lsst import sphgeom 

42 

43__all__ = ( 

44 "TableSpec", 

45 "FieldSpec", 

46 "ForeignKeySpec", 

47 "IndexSpec", 

48 "Base64Bytes", 

49 "Base64Region", 

50 "AstropyTimeNsecTai", 

51 "GUID", 

52) 

53 

54import logging 

55import uuid 

56from base64 import b64decode, b64encode 

57from collections.abc import Callable, Iterable 

58from dataclasses import dataclass 

59from math import ceil 

60from typing import TYPE_CHECKING, Any 

61 

62import astropy.time 

63import sqlalchemy 

64from lsst.sphgeom import Region 

65from lsst.utils.iteration import ensure_iterable 

66from sqlalchemy.dialects.postgresql import UUID 

67 

68from . import time_utils 

69from .config import Config 

70from .exceptions import ValidationError 

71from .named import NamedValueSet 

72from .utils import stripIfNotNone 

73 

74if TYPE_CHECKING: 

75 from .timespan import TimespanDatabaseRepresentation 

76 

77 

78_LOG = logging.getLogger(__name__) 

79 

80 

class SchemaValidationError(ValidationError):
    """Exceptions that indicate problems in Registry schema configuration."""

    @classmethod
    def translate(cls, caught: type[Exception], message: str) -> Callable:
        """Return decorator to re-raise exceptions as `SchemaValidationError`.

        Decorated functions must be class or instance methods, with a
        ``config`` parameter as their first argument.  This will be passed
        to ``message.format()`` as a keyword argument, along with ``err``,
        the original exception.

        Parameters
        ----------
        caught : `type` (`Exception` subclass)
            The type of exception to catch.
        message : `str`
            A `str.format` string that may contain named placeholders for
            ``config``, ``err``, or any keyword-only argument accepted by
            the decorated function.
        """
        # Function-scope import keeps this fix self-contained; ``wraps``
        # preserves the decorated function's name/docstring/signature for
        # introspection and documentation tools.
        from functools import wraps

        def decorate(func: Callable) -> Callable:
            @wraps(func)
            def decorated(self: Any, config: Config, *args: Any, **kwargs: Any) -> Any:
                try:
                    return func(self, config, *args, **kwargs)
                except caught as err:
                    # Chain the original exception so the full traceback is
                    # preserved for debugging.
                    raise cls(message.format(config=str(config), err=err)) from err

            return decorated

        return decorate

113 

114 

class Base64Bytes(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `bytes`.

    Maps Python `bytes` to a base64-encoded `sqlalchemy.Text` field.
    """

    impl = sqlalchemy.Text

    cache_ok = True

    def __init__(self, nbytes: int | None = None, *args: Any, **kwargs: Any):
        # Base64 expands every 3 raw bytes into 4 encoded characters; a
        # length is only meaningful when the implementation type is String
        # (a subclass may override ``impl``), never for Text.
        length: int | None = None
        if nbytes is not None and self.impl is sqlalchemy.String:
            length = 4 * ceil(nbytes / 3)
        super().__init__(*args, length=length, **kwargs)
        self.nbytes = nbytes

    def process_bind_param(self, value: bytes | None, dialect: sqlalchemy.engine.Dialect) -> str | None:
        """Encode native `bytes` into the ASCII base64 `str` SQLAlchemy
        expects for String/Text fields.
        """
        if value is None:
            return None
        if not isinstance(value, bytes):
            raise TypeError(
                f"Base64Bytes fields require 'bytes' values; got '{value}' with type {type(value)}."
            )
        return b64encode(value).decode("ascii")

    def process_result_value(self, value: str | None, dialect: sqlalchemy.engine.Dialect) -> bytes | None:
        """Decode a stored base64 `str` (necessarily ASCII) back into
        native `bytes`.
        """
        if value is None:
            return None
        return b64decode(value.encode("ascii"))

    @property
    def python_type(self) -> type[bytes]:
        return bytes

154 

155 

# Create an alias, for use below, to disambiguate this class from the
# built-in SQLAlchemy type when ``dtype`` is inspected generically.
LocalBase64Bytes = Base64Bytes

159 

160 

class Base64Region(Base64Bytes):
    """A SQLAlchemy custom type for Python `sphgeom.Region`.

    Maps Python `sphgeom.Region` to a base64-encoded `sqlalchemy.String`.
    """

    # ``cache_ok`` is not inherited; each TypeDecorator subclass must set
    # it explicitly.
    cache_ok = True

    def process_bind_param(self, value: Region | None, dialect: sqlalchemy.engine.Dialect) -> str | None:
        """Serialize the region and delegate base64 encoding to the base
        class.
        """
        if value is None:
            return None
        return super().process_bind_param(value.encode(), dialect)

    def process_result_value(self, value: str | None, dialect: sqlalchemy.engine.Dialect) -> Region | None:
        """Recover a `Region` from its stored base64 serialization."""
        if value is None:
            return None
        raw = super().process_result_value(value, dialect)
        return Region.decode(raw)

    @property
    def python_type(self) -> type[sphgeom.Region]:
        return sphgeom.Region

182 

183 

class AstropyTimeNsecTai(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `astropy.time.Time`.

    Maps Python `astropy.time.Time` to a number of nanoseconds since Unix
    epoch in TAI scale.
    """

    impl = sqlalchemy.BigInteger

    cache_ok = True

    def process_bind_param(
        self, value: astropy.time.Time | None, dialect: sqlalchemy.engine.Dialect
    ) -> int | None:
        """Convert an astropy time to a TAI nanosecond count for storage."""
        if value is None:
            return None
        if not isinstance(value, astropy.time.Time):
            raise TypeError(f"Unsupported type: {type(value)}, expected astropy.time.Time")
        return time_utils.TimeConverter().astropy_to_nsec(value)

    def process_result_value(
        self, value: int | None, dialect: sqlalchemy.engine.Dialect
    ) -> astropy.time.Time | None:
        """Convert stored TAI nanoseconds (or None) back to astropy time."""
        if value is None:
            return None
        return time_utils.TimeConverter().nsec_to_astropy(value)

213 

214 

# TODO: sqlalchemy 2 has internal support for UUID:
# https://docs.sqlalchemy.org/en/20/core/type_basics.html#sqlalchemy.types.Uuid
class GUID(sqlalchemy.TypeDecorator):
    """Platform-independent GUID type.

    Uses PostgreSQL's UUID type, otherwise uses CHAR(32), storing as
    stringified hex values.
    """

    impl = sqlalchemy.CHAR

    cache_ok = True

    def load_dialect_impl(self, dialect: sqlalchemy.Dialect) -> sqlalchemy.types.TypeEngine:
        """Select PostgreSQL's native UUID type, or CHAR(32) elsewhere."""
        if dialect.name == "postgresql":
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(sqlalchemy.CHAR(32))

    def process_bind_param(self, value: Any, dialect: sqlalchemy.Dialect) -> str | None:
        """Coerce a bind value to `uuid.UUID` and render it for the backend."""
        if value is None:
            return value

        # Coerce input to UUID type, in general having UUID on input is the
        # only thing that we want but there is code right now that uses ints.
        if not isinstance(value, uuid.UUID):
            if isinstance(value, int):
                value = uuid.UUID(int=value)
            elif isinstance(value, bytes):
                value = uuid.UUID(bytes=value)
            elif isinstance(value, str):
                # hexstring
                value = uuid.UUID(hex=value)
            else:
                raise TypeError(f"Unexpected type of a bind value: {type(value)}")

        # PostgreSQL takes the canonical string form; CHAR(32) backends get
        # the zero-padded 32-character hex representation.
        if dialect.name == "postgresql":
            return str(value)
        return "%.32x" % value.int

    def process_result_value(
        self, value: str | uuid.UUID | None, dialect: sqlalchemy.Dialect
    ) -> uuid.UUID | None:
        """Convert a stored value back to `uuid.UUID`."""
        # sqlalchemy 2 converts to UUID internally, so the value may already
        # be a UUID instance; NULLs pass through unchanged.
        if value is None or isinstance(value, uuid.UUID):
            return value
        return uuid.UUID(hex=value)

263 

264 

# Mapping from the ``type`` strings accepted in schema configuration files
# to the SQLAlchemy (or custom, defined above) column types they produce.
VALID_CONFIG_COLUMN_TYPES = {
    "string": sqlalchemy.String,
    "int": sqlalchemy.BigInteger,
    "float": sqlalchemy.Float,
    "region": Base64Region,
    "bool": sqlalchemy.Boolean,
    "blob": sqlalchemy.LargeBinary,
    "datetime": AstropyTimeNsecTai,
    "hash": Base64Bytes,
    "uuid": GUID,
}

276 

277 

@dataclass
class FieldSpec:
    """A data class for defining a column in a logical `Registry` table."""

    name: str
    """Name of the column."""

    dtype: type
    """Type of the column; usually a `type` subclass provided by SQLAlchemy
    that defines both a Python type and a corresponding precise SQL type.
    """

    length: int | None = None
    """Length of the type in the database, for variable-length types."""

    nbytes: int | None = None
    """Natural length used for hash and encoded-region columns, to be converted
    into the post-encoding length.
    """

    primaryKey: bool = False
    """Whether this field is (part of) its table's primary key."""

    autoincrement: bool = False
    """Whether the database should insert automatically incremented values when
    no value is provided in an INSERT.
    """

    nullable: bool = True
    """Whether this field is allowed to be NULL. If ``primaryKey`` is
    `True`, during construction this value will be forced to `False`."""

    default: Any = None
    """A server-side default value for this field.

    This is passed directly as the ``server_default`` argument to
    `sqlalchemy.schema.Column`.  It does _not_ go through SQLAlchemy's usual
    type conversion or quoting for Python literals, and should hence be used
    with care.  See the SQLAlchemy documentation for more information.
    """

    doc: str | None = None
    """Documentation for this field."""

    def __post_init__(self) -> None:
        # A primary-key column can never be NULL; force the default.
        if self.primaryKey:
            self.nullable = False

    def __eq__(self, other: Any) -> bool:
        # Equality (like hashing) is by column name only, so a
        # NamedValueSet of specs behaves like a set of names.
        if not isinstance(other, FieldSpec):
            return NotImplemented
        return self.name == other.name

    def __hash__(self) -> int:
        return hash(self.name)

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in column config '{config}'.")
    def fromConfig(cls, config: Config, **kwargs: Any) -> FieldSpec:
        """Create a `FieldSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the column.  Nested configuration keys
            correspond to `FieldSpec` attributes.
        **kwargs
            Additional keyword arguments that provide defaults for values
            not present in config.

        Returns
        -------
        spec : `FieldSpec`
            Specification structure for the column.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        dtype = VALID_CONFIG_COLUMN_TYPES.get(config["type"])
        if dtype is None:
            raise SchemaValidationError(f"Invalid field type string: '{config['type']}'.")
        if not config["name"].islower():
            raise SchemaValidationError(f"Column name '{config['name']}' is not all lowercase.")
        spec = cls(name=config["name"], dtype=dtype, **kwargs)
        # Config values win over any defaults supplied via **kwargs.
        spec.length = config.get("length", spec.length)
        spec.nbytes = config.get("nbytes", spec.nbytes)
        if spec.length is not None and spec.nbytes is not None:
            raise SchemaValidationError(f"Both length and nbytes provided for field '{spec.name}'.")
        spec.primaryKey = config.get("primaryKey", spec.primaryKey)
        spec.autoincrement = config.get("autoincrement", spec.autoincrement)
        spec.nullable = config.get("nullable", False if spec.primaryKey else spec.nullable)
        spec.doc = stripIfNotNone(config.get("doc", None))
        return spec

    @classmethod
    def for_region(cls, name: str = "region", nullable: bool = True, nbytes: int = 2048) -> FieldSpec:
        """Create a `FieldSpec` for a spatial region column.

        Parameters
        ----------
        name : `str`, optional
            Name for the field.
        nullable : `bool`, optional
            Whether NULL values are permitted.
        nbytes : `int`, optional
            Maximum number of bytes for serialized regions.  The actual
            column size will be larger to allow for base-64 encoding.

        Returns
        -------
        spec : `FieldSpec`
            Specification structure for a region column.
        """
        return cls(name, nullable=nullable, dtype=Base64Region, nbytes=nbytes)

    def isStringType(self) -> bool:
        """Indicate that this is a sqlalchemy.String field spec.

        Returns
        -------
        isString : `bool`
            The field refers to a `sqlalchemy.String` and not any other type.
            This can return `False` even if the object was created with a
            string type if it has been decided that it should be implemented
            as a `sqlalchemy.Text` type.
        """
        # Only short strings (length <= 32) are retained as String; longer
        # ones will be implemented as Text instead.
        return bool(self.dtype is sqlalchemy.String and self.length and self.length <= 32)

    def getSizedColumnType(self) -> sqlalchemy.types.TypeEngine | type:
        """Return a sized version of the column type.

        Utilizes either (or neither) of ``self.length`` and ``self.nbytes``.

        Returns
        -------
        dtype : `sqlalchemy.types.TypeEngine`
            A SQLAlchemy column type object.
        """
        if self.length is None:
            # Unsized: either construct with nbytes or return the bare type.
            if self.nbytes is not None:
                return self.dtype(nbytes=self.nbytes)
            return self.dtype
        # Last chance check that we are only looking at possible String;
        # long "string" columns are implemented as Text.
        if self.dtype is sqlalchemy.String and not self.isStringType():
            return sqlalchemy.Text
        return self.dtype(length=self.length)

    def getPythonType(self) -> type:
        """Return the Python type associated with this field's (SQL) dtype.

        Returns
        -------
        type : `type`
            Python type associated with this field's (SQL) `dtype`.
        """
        if not issubclass(self.dtype, LocalBase64Bytes):
            return self.dtype().python_type  # type: ignore
        # Base64 types require the nbytes keyword to be constructed.
        assert self.nbytes is not None  # satisfy mypy for something that must be true
        return self.dtype(nbytes=self.nbytes).python_type

447 

448 

@dataclass
class ForeignKeySpec:
    """Definition of a foreign key constraint in a logical `Registry` table."""

    table: str
    """Name of the target table."""

    source: tuple[str, ...]
    """Tuple of source table column names."""

    target: tuple[str, ...]
    """Tuple of target table column names."""

    onDelete: str | None = None
    """SQL clause indicating how to handle deletes to the target table.

    If not `None` (which indicates that a constraint violation exception should
    be raised), should be either "SET NULL" or "CASCADE".
    """

    addIndex: bool = True
    """If `True`, create an index on the columns of this foreign key in the
    source table.
    """

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in foreignKey config '{config}'.")
    def fromConfig(cls, config: Config) -> ForeignKeySpec:
        """Create a `ForeignKeySpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the constraint.  Nested configuration
            keys correspond to `ForeignKeySpec` attributes.

        Returns
        -------
        spec : `ForeignKeySpec`
            Specification structure for the constraint.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # ``source`` and ``target`` may each be a single name or a sequence
        # of names in the configuration; normalize both to tuples.
        source_columns = tuple(ensure_iterable(config["source"]))
        target_columns = tuple(ensure_iterable(config["target"]))
        return cls(
            table=config["table"],
            source=source_columns,
            target=target_columns,
            onDelete=config.get("onDelete", None),
        )

501 

502 

@dataclass(frozen=True)
class IndexSpec:
    """Specification of an index on table columns.

    Parameters
    ----------
    *columns : `str`
        Names of the columns to index.
    **kwargs : `Any`
        Additional keyword arguments to pass directly to
        `sqlalchemy.schema.Index` constructor.  This could be used to
        provide backend-specific options, e.g. to create a ``GIST`` index
        in PostgreSQL one can pass ``postgresql_using="gist"``.
    """

    def __init__(self, *columns: str, **kwargs: Any):
        # The dataclass is frozen, so attributes must be assigned through
        # ``object.__setattr__`` instead of plain attribute assignment.
        object.__setattr__(self, "kwargs", kwargs)
        object.__setattr__(self, "columns", tuple(columns))

    def __hash__(self) -> int:
        # Hash only the column tuple; ``kwargs`` is a dict and therefore
        # unhashable.
        return hash(self.columns)

    columns: tuple[str, ...]
    """Column names to include in the index (`Tuple` [ `str` ])."""

    kwargs: dict[str, Any]
    """Additional keyword arguments passed directly to
    `sqlalchemy.schema.Index` constructor (`dict` [ `str`, `Any` ]).
    """

532 

533 

@dataclass
class TableSpec:
    """A data class used to define a table or table-like query interface.

    Parameters
    ----------
    fields : `~collections.abc.Iterable` [ `FieldSpec` ]
        Specifications for the columns in this table.
    unique : `~collections.abc.Iterable` [ `tuple` [ `str` ] ], optional
        Non-primary-key unique constraints for the table.
    indexes : `~collections.abc.Iterable` [ `IndexSpec` ], optional
        Indexes for the table.
    foreignKeys : `~collections.abc.Iterable` [ `ForeignKeySpec` ], optional
        Foreign key constraints for the table.
    exclusion : `~collections.abc.Iterable` [ `tuple` [ `str` or `type` ] ]
        Special constraints that prohibit overlaps between timespans over rows
        where other columns are equal.  These take the same form as unique
        constraints, but each tuple may contain a single
        `TimespanDatabaseRepresentation` subclass representing a timespan
        column.
    recycleIds : `bool`, optional
        If `True`, allow databases that might normally recycle autoincrement
        IDs to do so (usually better for performance) on any autoincrement
        field in this table.
    doc : `str`, optional
        Documentation for the table.
    """

    def __init__(
        self,
        fields: Iterable[FieldSpec],
        *,
        unique: Iterable[tuple[str, ...]] = (),
        indexes: Iterable[IndexSpec] = (),
        foreignKeys: Iterable[ForeignKeySpec] = (),
        exclusion: Iterable[tuple[str | type[TimespanDatabaseRepresentation], ...]] = (),
        recycleIds: bool = True,
        doc: str | None = None,
    ):
        # Normalize each iterable argument into the concrete container type
        # documented on the corresponding attribute below.
        self.doc = doc
        self.recycleIds = recycleIds
        self.fields = NamedValueSet(fields)
        self.foreignKeys = list(foreignKeys)
        self.unique = set(unique)
        self.indexes = set(indexes)
        self.exclusion = set(exclusion)

    fields: NamedValueSet[FieldSpec]
    """Specifications for the columns in this table."""

    unique: set[tuple[str, ...]]
    """Non-primary-key unique constraints for the table."""

    indexes: set[IndexSpec]
    """Indexes for the table."""

    foreignKeys: list[ForeignKeySpec]
    """Foreign key constraints for the table."""

    exclusion: set[tuple[str | type[TimespanDatabaseRepresentation], ...]]
    """Exclusion constraints for the table.

    Exclusion constraints behave mostly like unique constraints, but may
    contain a database-native Timespan column that is restricted to not
    overlap across rows (for identical combinations of any non-Timespan
    columns in the constraint).
    """

    recycleIds: bool = True
    """If `True`, allow databases that might normally recycle autoincrement
    IDs to do so (usually better for performance) on any autoincrement field
    in this table.
    """

    doc: str | None = None
    """Documentation for the table."""

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in table config '{config}'.")
    def fromConfig(cls, config: Config) -> TableSpec:
        """Create a `TableSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config : `Config`
            Configuration describing the table.  Nested configuration keys
            correspond to `TableSpec` attributes.

        Returns
        -------
        spec : `TableSpec`
            Specification structure for the table.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        columns = NamedValueSet(FieldSpec.fromConfig(c) for c in config["columns"])
        constraints = [ForeignKeySpec.fromConfig(c) for c in config.get("foreignKeys", ())]
        unique = {tuple(u) for u in config.get("unique", ())}
        return cls(
            fields=columns,
            unique=unique,
            foreignKeys=constraints,
            doc=stripIfNotNone(config.get("doc")),
        )