Coverage for python/lsst/daf/butler/core/ddl.py: 50%

232 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-11-12 02:19 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Classes for representing SQL data-definition language (DDL) in Python. 

22 

23This include "CREATE TABLE" etc. 

24 

25This provides an extra layer on top of SQLAlchemy's classes for these concepts, 

26because we need a level of indirection between logical tables and the actual 

27SQL, and SQLAlchemy's DDL classes always map 1-1 to SQL. 

28 

29We've opted for the rather more obscure "ddl" as the name of this module 

30instead of "schema" because the latter is too overloaded; in most SQL 

31databases, a "schema" is also another term for a namespace. 

32""" 

33from __future__ import annotations 

34 

35from lsst import sphgeom 

36 

37__all__ = ( 

38 "TableSpec", 

39 "FieldSpec", 

40 "ForeignKeySpec", 

41 "IndexSpec", 

42 "Base64Bytes", 

43 "Base64Region", 

44 "AstropyTimeNsecTai", 

45 "GUID", 

46) 

47 

import logging
import uuid
from base64 import b64decode, b64encode
from dataclasses import dataclass
from functools import wraps
from math import ceil
from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Set, Tuple, Type, Union

import astropy.time
import sqlalchemy
from lsst.sphgeom import Region
from lsst.utils.iteration import ensure_iterable
from sqlalchemy.dialects.postgresql import UUID

from . import time_utils
from .config import Config
from .exceptions import ValidationError
from .named import NamedValueSet
from .utils import stripIfNotNone

66 

if TYPE_CHECKING:

68 from .timespan import TimespanDatabaseRepresentation 

69 

70 

71_LOG = logging.getLogger(__name__) 

72 

73 

class SchemaValidationError(ValidationError):
    """Exceptions that indicate problems in Registry schema configuration."""

    @classmethod
    def translate(cls, caught: Type[Exception], message: str) -> Callable:
        """Return decorator to re-raise exceptions as `SchemaValidationError`.

        Decorated functions must be class or instance methods, with a
        ``config`` parameter as their first argument.  This will be passed
        to ``message.format()`` as a keyword argument, along with ``err``,
        the original exception.

        Parameters
        ----------
        caught : `type` (`Exception` subclass)
            The type of exception to catch.
        message : `str`
            A `str.format` string that may contain named placeholders for
            ``config``, ``err``, or any keyword-only argument accepted by
            the decorated function.
        """

        def decorate(func: Callable) -> Callable:
            # Preserve the wrapped function's name/docstring for
            # introspection and error reporting.
            @wraps(func)
            def decorated(self: Any, config: Config, *args: Any, **kwargs: Any) -> Any:
                try:
                    return func(self, config, *args, **kwargs)
                except caught as err:
                    # Chain the original exception so the full traceback
                    # is preserved (previously dropped by a bare raise).
                    raise cls(message.format(config=str(config), err=err)) from err

            return decorated

        return decorate

106 

107 

class Base64Bytes(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `bytes`.

    Maps Python `bytes` to a base64-encoded `sqlalchemy.Text` field.
    """

    impl = sqlalchemy.Text

    cache_ok = True

    def __init__(self, nbytes: int | None = None, *args: Any, **kwargs: Any):
        # Base64 turns every 3 input bytes into 4 output characters; an
        # explicit column length is only computed for sized (String)
        # implementations, which subclasses may select via ``impl``.
        length = None
        if nbytes is not None and self.impl == sqlalchemy.String:
            length = 4 * ceil(nbytes / 3)
        super().__init__(*args, length=length, **kwargs)
        self.nbytes = nbytes

    def process_bind_param(self, value: Optional[bytes], dialect: sqlalchemy.engine.Dialect) -> Optional[str]:
        # Encode native `bytes` to base64 `bytes` and then to an ASCII
        # `str`, which is what SQLAlchemy expects for string-typed columns.
        if value is None:
            return None
        if isinstance(value, bytes):
            return b64encode(value).decode("ascii")
        raise TypeError(
            f"Base64Bytes fields require 'bytes' values; got '{value}' with type {type(value)}."
        )

    def process_result_value(
        self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[bytes]:
        # Reverse the bind transformation: ASCII `str` -> base64 `bytes`
        # -> native `bytes`.
        if value is None:
            return None
        return b64decode(value.encode("ascii"))

    @property
    def python_type(self) -> Type[bytes]:
        return bytes

150 

# Create an alias, for use below to disambiguate this module's Base64Bytes
# from the built-in sqlalchemy types when subclasses override ``dtype``.
LocalBase64Bytes = Base64Bytes

154 

155 

class Base64Region(Base64Bytes):
    """A SQLAlchemy custom type for Python `sphgeom.Region`.

    Maps Python `sphgeom.Region` to a base64-encoded `sqlalchemy.String`.
    """

    cache_ok = True  # have to be set explicitly in each class

    def process_bind_param(
        self, value: Optional[Region], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[str]:
        if value is not None:
            # Serialize the region to bytes, then let the base class do the
            # base64 encoding.
            return super().process_bind_param(value.encode(), dialect)
        return None

    def process_result_value(
        self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[Region]:
        if value is not None:
            # Base class decodes base64 back to bytes; Region.decode
            # reconstructs the region from those bytes.
            return Region.decode(super().process_result_value(value, dialect))
        return None

    @property
    def python_type(self) -> Type[sphgeom.Region]:
        return sphgeom.Region

181 

182 

class AstropyTimeNsecTai(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `astropy.time.Time`.

    Maps Python `astropy.time.Time` to a number of nanoseconds since Unix
    epoch in TAI scale.
    """

    impl = sqlalchemy.BigInteger

    cache_ok = True

    def process_bind_param(
        self, value: Optional[astropy.time.Time], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[int]:
        if value is None:
            return None
        if not isinstance(value, astropy.time.Time):
            raise TypeError(f"Unsupported type: {type(value)}, expected astropy.time.Time")
        # Store as integer nanoseconds since the Unix epoch in TAI.
        return time_utils.TimeConverter().astropy_to_nsec(value)

    def process_result_value(
        self, value: Optional[int], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[astropy.time.Time]:
        # `value` is nanoseconds since the epoch, or None.
        if value is None:
            return None
        return time_utils.TimeConverter().nsec_to_astropy(value)

212 

213 

class GUID(sqlalchemy.TypeDecorator):
    """Platform-independent GUID type.

    Uses PostgreSQL's UUID type, otherwise uses CHAR(32), storing as
    stringified hex values.
    """

    impl = sqlalchemy.CHAR

    cache_ok = True

    def load_dialect_impl(self, dialect: sqlalchemy.Dialect) -> sqlalchemy.TypeEngine:
        # PostgreSQL has a native UUID column type; every other backend
        # falls back to a fixed-width hex string.
        if dialect.name == "postgresql":
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(sqlalchemy.CHAR(32))

    def process_bind_param(self, value: Any, dialect: sqlalchemy.Dialect) -> Optional[str]:
        if value is None:
            return None

        # Coerce input to UUID type; in general having UUID on input is the
        # only thing that we want, but there is code right now that uses
        # ints, bytes, and hex strings as well.
        if not isinstance(value, uuid.UUID):
            if isinstance(value, int):
                value = uuid.UUID(int=value)
            elif isinstance(value, bytes):
                value = uuid.UUID(bytes=value)
            elif isinstance(value, str):
                # hexstring
                value = uuid.UUID(hex=value)
            else:
                raise TypeError(f"Unexpected type of a bind value: {type(value)}")

        return str(value) if dialect.name == "postgresql" else "%.32x" % value.int

    def process_result_value(self, value: Optional[str], dialect: sqlalchemy.Dialect) -> Optional[uuid.UUID]:
        # Stored form is a 32-character hex string (or native UUID rendered
        # as text); reconstruct the UUID object.
        return None if value is None else uuid.UUID(hex=value)

257 

258 

# Mapping from the "type" strings accepted in schema configuration to the
# SQLAlchemy (or custom, defined above) column types they produce; consumed
# by FieldSpec.fromConfig below.
VALID_CONFIG_COLUMN_TYPES = {
    "string": sqlalchemy.String,
    "int": sqlalchemy.BigInteger,
    "float": sqlalchemy.Float,
    "region": Base64Region,
    "bool": sqlalchemy.Boolean,
    "blob": sqlalchemy.LargeBinary,
    "datetime": AstropyTimeNsecTai,
    "hash": Base64Bytes,
    "uuid": GUID,
}

270 

271 

@dataclass
class FieldSpec:
    """A data class for defining a column in a logical `Registry` table."""

    name: str
    """Name of the column."""

    dtype: type
    """Type of the column; usually a `type` subclass provided by SQLAlchemy
    that defines both a Python type and a corresponding precise SQL type.
    """

    length: Optional[int] = None
    """Length of the type in the database, for variable-length types."""

    nbytes: Optional[int] = None
    """Natural length used for hash and encoded-region columns, to be converted
    into the post-encoding length.
    """

    primaryKey: bool = False
    """Whether this field is (part of) its table's primary key."""

    autoincrement: bool = False
    """Whether the database should insert automatically incremented values when
    no value is provided in an INSERT.
    """

    nullable: bool = True
    """Whether this field is allowed to be NULL. If ``primaryKey`` is
    `True`, during construction this value will be forced to `False`."""

    default: Any = None
    """A server-side default value for this field.

    This is passed directly as the ``server_default`` argument to
    `sqlalchemy.schema.Column`. It does _not_ go through SQLAlchemy's usual
    type conversion or quoting for Python literals, and should hence be used
    with care. See the SQLAlchemy documentation for more information.
    """

    doc: Optional[str] = None
    """Documentation for this field."""

    def __post_init__(self) -> None:
        if self.primaryKey:
            # Primary-key columns may never be NULL; override any
            # caller-supplied value.
            self.nullable = False

    def __eq__(self, other: Any) -> bool:
        # Fields compare (and hash) by name only, so containers of them
        # behave like mappings keyed on column name.
        if isinstance(other, FieldSpec):
            return self.name == other.name
        return NotImplemented

    def __hash__(self) -> int:
        return hash(self.name)

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in column config '{config}'.")
    def fromConfig(cls, config: Config, **kwargs: Any) -> FieldSpec:
        """Create a `FieldSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the column.  Nested configuration keys
            correspond to `FieldSpec` attributes.
        **kwargs
            Additional keyword arguments that provide defaults for values
            not present in config.

        Returns
        -------
        spec: `FieldSpec`
            Specification structure for the column.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        dtype = VALID_CONFIG_COLUMN_TYPES.get(config["type"])
        if dtype is None:
            raise SchemaValidationError(f"Invalid field type string: '{config['type']}'.")
        if not config["name"].islower():
            raise SchemaValidationError(f"Column name '{config['name']}' is not all lowercase.")
        self = cls(name=config["name"], dtype=dtype, **kwargs)
        self.length = config.get("length", self.length)
        self.nbytes = config.get("nbytes", self.nbytes)
        # length and nbytes are alternative ways to size a column; a config
        # supplying both is ambiguous.
        if self.length is not None and self.nbytes is not None:
            raise SchemaValidationError(f"Both length and nbytes provided for field '{self.name}'.")
        self.primaryKey = config.get("primaryKey", self.primaryKey)
        self.autoincrement = config.get("autoincrement", self.autoincrement)
        self.nullable = config.get("nullable", False if self.primaryKey else self.nullable)
        self.doc = stripIfNotNone(config.get("doc", None))
        return self

    @classmethod
    def for_region(cls, name: str = "region", nullable: bool = True, nbytes: int = 2048) -> FieldSpec:
        """Create a `FieldSpec` for a spatial region column.

        Parameters
        ----------
        name : `str`, optional
            Name for the field.
        nullable : `bool`, optional
            Whether NULL values are permitted.
        nbytes : `int`, optional
            Maximum number of bytes for serialized regions.  The actual column
            size will be larger to allow for base-64 encoding.

        Returns
        -------
        spec : `FieldSpec`
            Specification structure for a region column.
        """
        return cls(name, nullable=nullable, dtype=Base64Region, nbytes=nbytes)

    def isStringType(self) -> bool:
        """Indicate that this is a sqlalchemy.String field spec.

        Returns
        -------
        isString : `bool`
            The field refers to a `sqlalchemy.String` and not any other type.
            This can return `False` even if the object was created with a
            string type if it has been decided that it should be implemented
            as a `sqlalchemy.Text` type.
        """
        # Only short sized strings are kept as String columns; longer or
        # unsized ones are implemented as Text.  (The original code re-tested
        # dtype == String inside the already-taken branch; that redundant
        # check is removed here with identical behavior.)
        return self.dtype == sqlalchemy.String and bool(self.length) and self.length <= 32

    def getSizedColumnType(self) -> sqlalchemy.types.TypeEngine:
        """Return a sized version of the column type.

        Utilizes either (or neither) of ``self.length`` and ``self.nbytes``.

        Returns
        -------
        dtype : `sqlalchemy.types.TypeEngine`
            A SQLAlchemy column type object.
        """
        if self.length is not None:
            # Last chance check that we are only looking at possible String.
            if self.dtype == sqlalchemy.String and not self.isStringType():
                return sqlalchemy.Text
            return self.dtype(length=self.length)
        if self.nbytes is not None:
            return self.dtype(nbytes=self.nbytes)
        return self.dtype

    def getPythonType(self) -> type:
        """Return the Python type associated with this field's (SQL) dtype.

        Returns
        -------
        type : `type`
            Python type associated with this field's (SQL) `dtype`.
        """
        # To construct these objects, the nbytes keyword is needed.
        if issubclass(self.dtype, LocalBase64Bytes):
            # Satisfy mypy for something that must be true.
            assert self.nbytes is not None
            return self.dtype(nbytes=self.nbytes).python_type
        else:
            return self.dtype().python_type  # type: ignore

442 

443 

@dataclass
class ForeignKeySpec:
    """Definition of a foreign key constraint in a logical `Registry` table."""

    table: str
    """Name of the target table."""

    source: Tuple[str, ...]
    """Tuple of source table column names."""

    target: Tuple[str, ...]
    """Tuple of target table column names."""

    onDelete: Optional[str] = None
    """SQL clause indicating how to handle deletes to the target table.

    If not `None` (which indicates that a constraint violation exception should
    be raised), should be either "SET NULL" or "CASCADE".
    """

    addIndex: bool = True
    """If `True`, create an index on the columns of this foreign key in the
    source table.
    """

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in foreignKey config '{config}'.")
    def fromConfig(cls, config: Config) -> ForeignKeySpec:
        """Create a `ForeignKeySpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the constraint.  Nested configuration keys
            correspond to `ForeignKeySpec` attributes.

        Returns
        -------
        spec: `ForeignKeySpec`
            Specification structure for the constraint.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # Scalar config values are accepted for single-column keys;
        # ensure_iterable normalizes them before conversion to tuples.
        source_columns = tuple(ensure_iterable(config["source"]))
        target_columns = tuple(ensure_iterable(config["target"]))
        return cls(
            table=config["table"],
            source=source_columns,
            target=target_columns,
            onDelete=config.get("onDelete", None),
        )

496 

497 

@dataclass(frozen=True)
class IndexSpec:
    """Specification of an index on table columns.

    Parameters
    ----------
    *columns : `str`
        Names of the columns to index.
    **kwargs: `Any`
        Additional keyword arguments to pass directly to
        `sqlalchemy.schema.Index` constructor.  This could be used to provide
        backend-specific options, e.g. to create a ``GIST`` index in PostgreSQL
        one can pass ``postgresql_using="gist"``.
    """

    def __init__(self, *columns: str, **kwargs: Any):
        # The dataclass is frozen, so regular attribute assignment would raise
        # FrozenInstanceError; object.__setattr__ bypasses that in __init__.
        object.__setattr__(self, "columns", tuple(columns))
        object.__setattr__(self, "kwargs", kwargs)

    def __hash__(self) -> int:
        # Hash only the column tuple: the kwargs attribute is a dict, which
        # is not hashable.
        return hash(self.columns)

    columns: Tuple[str, ...]
    """Column names to include in the index (`Tuple` [ `str` ])."""

    kwargs: dict[str, Any]
    """Additional keyword arguments passed directly to
    `sqlalchemy.schema.Index` constructor (`dict` [ `str`, `Any` ]).
    """

527 

528 

@dataclass
class TableSpec:
    """A data class used to define a table or table-like query interface.

    Parameters
    ----------
    fields : `Iterable` [ `FieldSpec` ]
        Specifications for the columns in this table.
    unique : `Iterable` [ `tuple` [ `str` ] ], optional
        Non-primary-key unique constraints for the table.
    indexes: `Iterable` [ `IndexSpec` ], optional
        Indexes for the table.
    foreignKeys : `Iterable` [ `ForeignKeySpec` ], optional
        Foreign key constraints for the table.
    exclusion : `Iterable` [ `tuple` [ `str` or `type` ] ]
        Special constraints that prohibit overlaps between timespans over rows
        where other columns are equal.  These take the same form as unique
        constraints, but each tuple may contain a single
        `TimespanDatabaseRepresentation` subclass representing a timespan
        column.
    recycleIds : `bool`, optional
        If `True`, allow databases that might normally recycle autoincrement
        IDs to do so (usually better for performance) on any autoincrement
        field in this table.
    doc : `str`, optional
        Documentation for the table.
    """

    def __init__(
        self,
        fields: Iterable[FieldSpec],
        *,
        unique: Iterable[Tuple[str, ...]] = (),
        indexes: Iterable[IndexSpec] = (),
        foreignKeys: Iterable[ForeignKeySpec] = (),
        exclusion: Iterable[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]] = (),
        recycleIds: bool = True,
        doc: Optional[str] = None,
    ):
        # Explicit __init__ (overriding the dataclass-generated one) so that
        # arbitrary iterables are normalized into the concrete container
        # types declared below.
        self.fields = NamedValueSet(fields)
        self.unique = set(unique)
        self.indexes = set(indexes)
        self.foreignKeys = list(foreignKeys)
        self.exclusion = set(exclusion)
        self.recycleIds = recycleIds
        self.doc = doc

    fields: NamedValueSet[FieldSpec]
    """Specifications for the columns in this table."""

    unique: Set[Tuple[str, ...]]
    """Non-primary-key unique constraints for the table."""

    indexes: Set[IndexSpec]
    """Indexes for the table."""

    foreignKeys: List[ForeignKeySpec]
    """Foreign key constraints for the table."""

    exclusion: Set[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]]
    """Exclusion constraints for the table.

    Exclusion constraints behave mostly like unique constraints, but may
    contain a database-native Timespan column that is restricted to not overlap
    across rows (for identical combinations of any non-Timespan columns in the
    constraint).
    """

    recycleIds: bool = True
    """If `True`, allow databases that might normally recycle autoincrement IDs
    to do so (usually better for performance) on any autoincrement field in
    this table.
    """

    doc: Optional[str] = None
    """Documentation for the table."""

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in table config '{config}'.")
    def fromConfig(cls, config: Config) -> TableSpec:
        """Create a `TableSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the table.  Nested configuration keys
            correspond to `TableSpec` attributes.

        Returns
        -------
        spec: `TableSpec`
            Specification structure for the table.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # NOTE(review): "indexes" and "exclusion" are not read from config
        # here -- confirm whether that is intentional.
        return cls(
            fields=NamedValueSet(FieldSpec.fromConfig(c) for c in config["columns"]),
            unique={tuple(u) for u in config.get("unique", ())},
            foreignKeys=[ForeignKeySpec.fromConfig(c) for c in config.get("foreignKeys", ())],
            doc=stripIfNotNone(config.get("doc")),
        )

632 )