Coverage for python/lsst/daf/butler/core/ddl.py: 47%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

192 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Classes for representing SQL data-definition language (DDL) in Python. 

22 

23This include "CREATE TABLE" etc. 

24 

25This provides an extra layer on top of SQLAlchemy's classes for these concepts, 

26because we need a level of indirection between logical tables and the actual 

27SQL, and SQLAlchemy's DDL classes always map 1-1 to SQL. 

28 

29We've opted for the rather more obscure "ddl" as the name of this module 

30instead of "schema" because the latter is too overloaded; in most SQL 

31databases, a "schema" is also another term for a namespace. 

32""" 

33from __future__ import annotations 

34 

35from lsst import sphgeom 

36 

37__all__ = ( 

38 "TableSpec", 

39 "FieldSpec", 

40 "ForeignKeySpec", 

41 "Base64Bytes", 

42 "Base64Region", 

43 "AstropyTimeNsecTai", 

44 "GUID", 

45) 

46 

47import logging 

48import uuid 

49from base64 import b64decode, b64encode 

50from dataclasses import dataclass 

51from math import ceil 

52from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Set, Tuple, Type, Union 

53 

54import astropy.time 

55import sqlalchemy 

56from lsst.sphgeom import Region 

57from lsst.utils.iteration import ensure_iterable 

58from sqlalchemy.dialects.postgresql import UUID 

59 

60from . import time_utils 

61from .config import Config 

62from .exceptions import ValidationError 

63from .named import NamedValueSet 

64from .utils import stripIfNotNone 

65 

66if TYPE_CHECKING: 66 ↛ 67line 66 didn't jump to line 67, because the condition on line 66 was never true

67 from .timespan import TimespanDatabaseRepresentation 

68 

69 

# Module-level logger, named after this module per standard convention.
_LOG = logging.getLogger(__name__)

71 

72 

class SchemaValidationError(ValidationError):
    """Exceptions that indicate problems in Registry schema configuration."""

    @classmethod
    def translate(cls, caught: Type[Exception], message: str) -> Callable:
        """Return decorator to re-raise exceptions as `SchemaValidationError`.

        Decorated functions must be class or instance methods, with a
        ``config`` parameter as their first argument.  This will be passed
        to ``message.format()`` as a keyword argument, along with ``err``,
        the original exception.

        Parameters
        ----------
        caught : `type` (`Exception` subclass)
            The type of exception to catch.
        message : `str`
            A `str.format` string that may contain named placeholders for
            ``config``, ``err``, or any keyword-only argument accepted by
            the decorated function.
        """

        def decorate(func: Callable) -> Callable:
            def decorated(self: Any, config: Config, *args: Any, **kwargs: Any) -> Any:
                try:
                    return func(self, config, *args, **kwargs)
                except caught as err:
                    # Chain the original exception (``from err``) so its
                    # traceback and context are preserved for debugging
                    # instead of being silently discarded.
                    raise cls(message.format(config=str(config), err=err)) from err

            return decorated

        return decorate

105 

106 

class Base64Bytes(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `bytes`.

    Maps Python `bytes` to a base64-encoded `sqlalchemy.Text` field.
    """

    impl = sqlalchemy.Text

    cache_ok = True

    def __init__(self, nbytes: int, *args: Any, **kwargs: Any):
        # Base64 inflates data by 4/3, rounded up to a whole 4-character
        # group; only String-backed impls carry an explicit length.
        if self.impl == sqlalchemy.String:
            length = 4 * ceil(nbytes / 3)
        else:
            length = None
        super().__init__(*args, length=length, **kwargs)
        self.nbytes = nbytes

    def process_bind_param(self, value: Optional[bytes], dialect: sqlalchemy.engine.Dialect) -> Optional[str]:
        # Encode native `bytes` to base64 and then to an ASCII `str`,
        # since `str` is what SQLAlchemy expects for String fields.
        if value is None:
            return None
        if not isinstance(value, bytes):
            raise TypeError(
                f"Base64Bytes fields require 'bytes' values; got '{value}' with type {type(value)}."
            )
        return b64encode(value).decode("ascii")

    def process_result_value(
        self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[bytes]:
        # The stored value is ASCII base64 text; turn it back into the
        # original native `bytes` (passing through None).
        if value is None:
            return None
        return b64decode(value.encode("ascii"))

    @property
    def python_type(self) -> Type[bytes]:
        return bytes

145 

146 

# Alias for this module's Base64Bytes, used below to disambiguate it from
# similarly named built-in SQLAlchemy types.
LocalBase64Bytes = Base64Bytes

150 

151 

class Base64Region(Base64Bytes):
    """A SQLAlchemy custom type for Python `sphgeom.Region`.

    Maps Python `sphgeom.Region` to a base64-encoded `sqlalchemy.String`.
    """

    cache_ok = True  # have to be set explicitly in each class

    def process_bind_param(
        self, value: Optional[Region], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[str]:
        # Serialize the region to bytes first, then delegate the base64
        # encoding to the base class.
        if value is None:
            return None
        return super().process_bind_param(value.encode(), dialect)

    def process_result_value(
        self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[Region]:
        # Undo the base64 encoding via the base class, then deserialize
        # the resulting bytes back into a Region.
        if value is None:
            return None
        raw = super().process_result_value(value, dialect)
        return Region.decode(raw)

    @property
    def python_type(self) -> Type[sphgeom.Region]:
        return sphgeom.Region

177 

178 

class AstropyTimeNsecTai(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `astropy.time.Time`.

    Maps Python `astropy.time.Time` to a number of nanoseconds since Unix
    epoch in TAI scale.
    """

    impl = sqlalchemy.BigInteger

    cache_ok = True

    def process_bind_param(
        self, value: Optional[astropy.time.Time], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[int]:
        if value is None:
            return None
        if not isinstance(value, astropy.time.Time):
            raise TypeError(f"Unsupported type: {type(value)}, expected astropy.time.Time")
        # Convert the astropy Time to integer nanoseconds (TAI, Unix epoch).
        return time_utils.TimeConverter().astropy_to_nsec(value)

    def process_result_value(
        self, value: Optional[int], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[astropy.time.Time]:
        # The stored value is nanoseconds since epoch, or None.
        if value is None:
            return None
        return time_utils.TimeConverter().nsec_to_astropy(value)

208 

209 

class GUID(sqlalchemy.TypeDecorator):
    """Platform-independent GUID type.

    Uses PostgreSQL's UUID type, otherwise uses CHAR(32), storing as
    stringified hex values.
    """

    impl = sqlalchemy.CHAR

    cache_ok = True

    def load_dialect_impl(self, dialect: sqlalchemy.Dialect) -> sqlalchemy.TypeEngine:
        # PostgreSQL has a native UUID column type; every other backend
        # falls back to a fixed-width hex string.
        if dialect.name == "postgresql":
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(sqlalchemy.CHAR(32))

    def process_bind_param(self, value: Any, dialect: sqlalchemy.Dialect) -> Optional[str]:
        if value is None:
            return None

        # Coerce input to UUID type, in general having UUID on input is the
        # only thing that we want but there is code right now that uses ints.
        if isinstance(value, uuid.UUID):
            coerced = value
        elif isinstance(value, int):
            coerced = uuid.UUID(int=value)
        elif isinstance(value, bytes):
            coerced = uuid.UUID(bytes=value)
        elif isinstance(value, str):
            # hexstring
            coerced = uuid.UUID(hex=value)
        else:
            raise TypeError(f"Unexpected type of a bind value: {type(value)}")

        if dialect.name == "postgresql":
            return str(coerced)
        return "%.32x" % coerced.int

    def process_result_value(self, value: Optional[str], dialect: sqlalchemy.Dialect) -> Optional[uuid.UUID]:
        if value is None:
            return None
        return uuid.UUID(hex=value)

253 

254 

# Mapping from the type-name strings accepted in schema configuration files
# to the SQLAlchemy column types (or the custom TypeDecorators defined above)
# used to implement them; consulted by FieldSpec.fromConfig.
VALID_CONFIG_COLUMN_TYPES = {
    "string": sqlalchemy.String,
    "int": sqlalchemy.BigInteger,
    "float": sqlalchemy.Float,
    "region": Base64Region,
    "bool": sqlalchemy.Boolean,
    "blob": sqlalchemy.LargeBinary,
    "datetime": AstropyTimeNsecTai,
    "hash": Base64Bytes,
    "uuid": GUID,
}

266 

267 

@dataclass
class FieldSpec:
    """A data class for defining a column in a logical `Registry` table."""

    name: str
    """Name of the column."""

    dtype: type
    """Type of the column; usually a `type` subclass provided by SQLAlchemy
    that defines both a Python type and a corresponding precise SQL type.
    """

    length: Optional[int] = None
    """Length of the type in the database, for variable-length types."""

    nbytes: Optional[int] = None
    """Natural length used for hash and encoded-region columns, to be converted
    into the post-encoding length.
    """

    primaryKey: bool = False
    """Whether this field is (part of) its table's primary key."""

    autoincrement: bool = False
    """Whether the database should insert automatically incremented values when
    no value is provided in an INSERT.
    """

    nullable: bool = True
    """Whether this field is allowed to be NULL."""

    default: Any = None
    """A server-side default value for this field.

    This is passed directly as the ``server_default`` argument to
    `sqlalchemy.schema.Column`.  It does _not_ go through SQLAlchemy's usual
    type conversion or quoting for Python literals, and should hence be used
    with care.  See the SQLAlchemy documentation for more information.
    """

    doc: Optional[str] = None
    """Documentation for this field."""

    def __eq__(self, other: Any) -> bool:
        # Equality (and hashing, below) considers only the column name, so
        # specs can live in name-keyed containers such as NamedValueSet.
        if isinstance(other, FieldSpec):
            return self.name == other.name
        else:
            return NotImplemented

    def __hash__(self) -> int:
        return hash(self.name)

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in column config '{config}'.")
    def fromConfig(cls, config: Config, **kwargs: Any) -> FieldSpec:
        """Create a `FieldSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the column.  Nested configuration keys
            correspond to `FieldSpec` attributes.
        **kwargs
            Additional keyword arguments that provide defaults for values
            not present in config.

        Returns
        -------
        spec: `FieldSpec`
            Specification structure for the column.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        dtype = VALID_CONFIG_COLUMN_TYPES.get(config["type"])
        if dtype is None:
            raise SchemaValidationError(f"Invalid field type string: '{config['type']}'.")
        if not config["name"].islower():
            raise SchemaValidationError(f"Column name '{config['name']}' is not all lowercase.")
        self = cls(name=config["name"], dtype=dtype, **kwargs)
        self.length = config.get("length", self.length)
        self.nbytes = config.get("nbytes", self.nbytes)
        # length and nbytes are alternative ways to size a column; a config
        # that supplies both is ambiguous.
        if self.length is not None and self.nbytes is not None:
            raise SchemaValidationError(f"Both length and nbytes provided for field '{self.name}'.")
        self.primaryKey = config.get("primaryKey", self.primaryKey)
        self.autoincrement = config.get("autoincrement", self.autoincrement)
        # Primary key columns default to NOT NULL unless the config says
        # otherwise explicitly.
        self.nullable = config.get("nullable", False if self.primaryKey else self.nullable)
        self.doc = stripIfNotNone(config.get("doc", None))
        return self

    def isStringType(self) -> bool:
        """Indicate that this is a sqlalchemy.String field spec.

        Returns
        -------
        isString : `bool`
            The field refers to a `sqlalchemy.String` and not any other type.
            This can return `False` even if the object was created with a
            string type if it has been decided that it should be implemented
            as a `sqlalchemy.Text` type.
        """
        # Only short strings (length <= 32) are retained as String; longer
        # or unsized ones are implemented as Text by getSizedColumnType.
        # (The original nested a redundant repeat of the dtype check here.)
        return bool(self.dtype == sqlalchemy.String and self.length and self.length <= 32)

    def getSizedColumnType(self) -> sqlalchemy.types.TypeEngine:
        """Return a sized version of the column type.

        Utilizes either (or neither) of ``self.length`` and ``self.nbytes``.

        Returns
        -------
        dtype : `sqlalchemy.types.TypeEngine`
            A SQLAlchemy column type object.
        """
        if self.length is not None:
            # Last chance check that we are only looking at possible String;
            # isStringType() is the single source of truth for the cutoff.
            if self.dtype == sqlalchemy.String and not self.isStringType():
                return sqlalchemy.Text
            return self.dtype(length=self.length)
        if self.nbytes is not None:
            return self.dtype(nbytes=self.nbytes)
        return self.dtype

    def getPythonType(self) -> type:
        """Return the Python type associated with this field's (SQL) dtype.

        Returns
        -------
        type : `type`
            Python type associated with this field's (SQL) `dtype`.
        """
        # to construct these objects, nbytes keyword is needed
        if issubclass(self.dtype, LocalBase64Bytes):
            # satisfy mypy for something that must be true
            assert self.nbytes is not None
            return self.dtype(nbytes=self.nbytes).python_type
        else:
            return self.dtype().python_type  # type: ignore

411 

412 

@dataclass
class ForeignKeySpec:
    """Definition of a foreign key constraint in a logical `Registry` table."""

    table: str
    """Name of the target table."""

    source: Tuple[str, ...]
    """Tuple of source table column names."""

    target: Tuple[str, ...]
    """Tuple of target table column names."""

    onDelete: Optional[str] = None
    """SQL clause indicating how to handle deletes to the target table.

    If not `None` (which indicates that a constraint violation exception should
    be raised), should be either "SET NULL" or "CASCADE".
    """

    addIndex: bool = True
    """If `True`, create an index on the columns of this foreign key in the
    source table.
    """

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in foreignKey config '{config}'.")
    def fromConfig(cls, config: Config) -> ForeignKeySpec:
        """Create a `ForeignKeySpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the constraint.  Nested configuration
            keys correspond to `ForeignKeySpec` attributes.

        Returns
        -------
        spec: `ForeignKeySpec`
            Specification structure for the constraint.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # A bare column name in the config is accepted as shorthand for a
        # one-element tuple.
        source_columns = tuple(ensure_iterable(config["source"]))
        target_columns = tuple(ensure_iterable(config["target"]))
        return cls(
            table=config["table"],
            source=source_columns,
            target=target_columns,
            onDelete=config.get("onDelete", None),
        )

465 

466 

@dataclass
class TableSpec:
    """A data class used to define a table or table-like query interface.

    Parameters
    ----------
    fields : `Iterable` [ `FieldSpec` ]
        Specifications for the columns in this table.
    unique : `Iterable` [ `tuple` [ `str` ] ], optional
        Non-primary-key unique constraints for the table.
    indexes: `Iterable` [ `tuple` [ `str` ] ], optional
        Indexes for the table.
    foreignKeys : `Iterable` [ `ForeignKeySpec` ], optional
        Foreign key constraints for the table.
    exclusion : `Iterable` [ `tuple` [ `str` or `type` ] ]
        Special constraints that prohibit overlaps between timespans over rows
        where other columns are equal.  These take the same form as unique
        constraints, but each tuple may contain a single
        `TimespanDatabaseRepresentation` subclass representing a timespan
        column.
    recycleIds : `bool`, optional
        If `True`, allow databases that might normally recycle autoincrement
        IDs to do so (usually better for performance) on any autoincrement
        field in this table.
    doc : `str`, optional
        Documentation for the table.
    """

    # An explicit __init__ is provided (the dataclass-generated one is then
    # not installed) so arbitrary iterables can be accepted and normalized
    # into the concrete container types declared below.
    def __init__(
        self,
        fields: Iterable[FieldSpec],
        *,
        unique: Iterable[Tuple[str, ...]] = (),
        indexes: Iterable[Tuple[str, ...]] = (),
        foreignKeys: Iterable[ForeignKeySpec] = (),
        exclusion: Iterable[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]] = (),
        recycleIds: bool = True,
        doc: Optional[str] = None,
    ):
        self.fields = NamedValueSet(fields)
        self.unique = set(unique)
        self.indexes = set(indexes)
        self.foreignKeys = list(foreignKeys)
        self.exclusion = set(exclusion)
        self.recycleIds = recycleIds
        self.doc = doc

    fields: NamedValueSet[FieldSpec]
    """Specifications for the columns in this table."""

    unique: Set[Tuple[str, ...]]
    """Non-primary-key unique constraints for the table."""

    indexes: Set[Tuple[str, ...]]
    """Indexes for the table."""

    foreignKeys: List[ForeignKeySpec]
    """Foreign key constraints for the table."""

    exclusion: Set[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]]
    """Exclusion constraints for the table.

    Exclusion constraints behave mostly like unique constraints, but may
    contain a database-native Timespan column that is restricted to not overlap
    across rows (for identical combinations of any non-Timespan columns in the
    constraint).
    """

    recycleIds: bool = True
    """If `True`, allow databases that might normally recycle autoincrement IDs
    to do so (usually better for performance) on any autoincrement field in
    this table.
    """

    doc: Optional[str] = None
    """Documentation for the table."""

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in table config '{config}'.")
    def fromConfig(cls, config: Config) -> TableSpec:
        """Create a `TableSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the table.  Nested configuration keys
            correspond to `TableSpec` attributes.

        Returns
        -------
        spec: `TableSpec`
            Specification structure for the table.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # NOTE(review): only "columns", "unique", "foreignKeys", and "doc"
        # are read here; "indexes"/"exclusion" are not configurable from
        # this path — presumably intentional, but worth confirming.
        return cls(
            fields=NamedValueSet(FieldSpec.fromConfig(c) for c in config["columns"]),
            unique={tuple(u) for u in config.get("unique", ())},
            foreignKeys=[ForeignKeySpec.fromConfig(c) for c in config.get("foreignKeys", ())],
            doc=stripIfNotNone(config.get("doc")),
        )