Coverage for python/lsst/daf/butler/core/ddl.py: 46%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

191 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Classes for representing SQL data-definition language (DDL) in Python. 

22 

This includes "CREATE TABLE" etc.

24 

25This provides an extra layer on top of SQLAlchemy's classes for these concepts, 

26because we need a level of indirection between logical tables and the actual 

27SQL, and SQLAlchemy's DDL classes always map 1-1 to SQL. 

28 

29We've opted for the rather more obscure "ddl" as the name of this module 

30instead of "schema" because the latter is too overloaded; in most SQL 

31databases, a "schema" is also another term for a namespace. 

32""" 

33from __future__ import annotations 

34 

35from lsst import sphgeom 

36 

37__all__ = ( 

38 "TableSpec", 

39 "FieldSpec", 

40 "ForeignKeySpec", 

41 "Base64Bytes", 

42 "Base64Region", 

43 "AstropyTimeNsecTai", 

44 "GUID", 

45) 

46 

47import logging 

48import uuid 

49from base64 import b64decode, b64encode 

50from dataclasses import dataclass 

51from math import ceil 

52from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Set, Tuple, Type, Union 

53 

54import astropy.time 

55import sqlalchemy 

56from lsst.sphgeom import Region 

57from lsst.utils.iteration import ensure_iterable 

58from sqlalchemy.dialects.postgresql import UUID 

59 

60from . import time_utils 

61from .config import Config 

62from .exceptions import ValidationError 

63from .named import NamedValueSet 

64from .utils import stripIfNotNone 

65 

if TYPE_CHECKING:

67 from .timespan import TimespanDatabaseRepresentation 

68 

69 

70_LOG = logging.getLogger(__name__) 

71 

72 

class SchemaValidationError(ValidationError):
    """Exceptions that indicate problems in Registry schema configuration."""

    @classmethod
    def translate(cls, caught: Type[Exception], message: str) -> Callable:
        """Return decorator to re-raise exceptions as `SchemaValidationError`.

        Decorated functions must be class or instance methods, with a
        ``config`` parameter as their first argument.  This will be passed
        to ``message.format()`` as a keyword argument, along with ``err``,
        the original exception.

        Parameters
        ----------
        caught : `type` (`Exception` subclass)
            The type of exception to catch.
        message : `str`
            A `str.format` string that may contain named placeholders for
            ``config``, ``err``, or any keyword-only argument accepted by
            the decorated function.
        """

        def decorate(func: Callable) -> Callable:
            def decorated(self: Any, config: Config, *args: Any, **kwargs: Any) -> Any:
                try:
                    return func(self, config, *args, **kwargs)
                except caught as err:
                    # Chain the original exception (`from err`) so the
                    # underlying cause and its traceback are preserved for
                    # debugging instead of being replaced silently.
                    raise cls(message.format(config=str(config), err=err)) from err

            return decorated

        return decorate

105 

106 

class Base64Bytes(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `bytes`.

    Maps Python `bytes` to a base64-encoded `sqlalchemy.Text` field.
    """

    impl = sqlalchemy.Text

    cache_ok = True

    def __init__(self, nbytes: int, *args: Any, **kwargs: Any):
        # Base64 expands every 3 raw bytes into 4 characters; only a
        # String implementation (a subclass may override ``impl``) needs
        # an explicit post-encoding length, Text does not.
        if self.impl == sqlalchemy.String:
            length = 4 * ceil(nbytes / 3)
        else:
            length = None
        super().__init__(*args, length=length, **kwargs)
        self.nbytes = nbytes

    def process_bind_param(self, value: Optional[bytes], dialect: sqlalchemy.engine.Dialect) -> Optional[str]:
        # Encode native `bytes` to base64 `bytes`, then to an ASCII `str`,
        # because `str` is what SQLAlchemy expects for String fields.
        if value is None:
            return None
        if not isinstance(value, bytes):
            raise TypeError(
                f"Base64Bytes fields require 'bytes' values; got '{value}' with type {type(value)}."
            )
        return b64encode(value).decode("ascii")

    def process_result_value(
        self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[bytes]:
        # The stored `str` must be ASCII because it is base64-encoded;
        # transform it back into native `bytes`.
        if value is None:
            return None
        return b64decode(value.encode("ascii"))

    @property
    def python_type(self) -> Type[bytes]:
        return bytes

145 

146 

# Create an alias for Base64Bytes, used below (in FieldSpec.getPythonType)
# to disambiguate this module's type from the built-in sqlalchemy types.
LocalBase64Bytes = Base64Bytes

150 

151 

class Base64Region(Base64Bytes):
    """A SQLAlchemy custom type for Python `sphgeom.Region`.

    Maps Python `sphgeom.Region` to a base64-encoded `sqlalchemy.String`.
    """

    def process_bind_param(
        self, value: Optional[Region], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[str]:
        # Serialize the region to bytes, then delegate the base64 encoding
        # to the base class.
        return None if value is None else super().process_bind_param(value.encode(), dialect)

    def process_result_value(
        self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[Region]:
        # Reverse of process_bind_param: base64-decode via the base class,
        # then reconstruct the region from the serialized bytes.
        if value is None:
            return None
        serialized = super().process_result_value(value, dialect)
        return Region.decode(serialized)

    @property
    def python_type(self) -> Type[sphgeom.Region]:
        return sphgeom.Region

175 

176 

class AstropyTimeNsecTai(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `astropy.time.Time`.

    Maps Python `astropy.time.Time` to a number of nanoseconds since Unix
    epoch in TAI scale.
    """

    impl = sqlalchemy.BigInteger

    cache_ok = True

    def process_bind_param(
        self, value: Optional[astropy.time.Time], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[int]:
        if value is None:
            return None
        if not isinstance(value, astropy.time.Time):
            raise TypeError(f"Unsupported type: {type(value)}, expected astropy.time.Time")
        # Store as integer nanoseconds since epoch in TAI scale.
        return time_utils.TimeConverter().astropy_to_nsec(value)

    def process_result_value(
        self, value: Optional[int], dialect: sqlalchemy.engine.Dialect
    ) -> Optional[astropy.time.Time]:
        # Stored value is nanoseconds since epoch, or None.
        if value is None:
            return None
        return time_utils.TimeConverter().nsec_to_astropy(value)

206 

207 

class GUID(sqlalchemy.TypeDecorator):
    """Platform-independent GUID type.

    Uses PostgreSQL's UUID type, otherwise uses CHAR(32), storing as
    stringified hex values.
    """

    impl = sqlalchemy.CHAR

    cache_ok = True

    def load_dialect_impl(self, dialect: sqlalchemy.Dialect) -> sqlalchemy.TypeEngine:
        # PostgreSQL has a native UUID column type; everywhere else fall
        # back to a fixed 32-character hex representation.
        if dialect.name == "postgresql":
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(sqlalchemy.CHAR(32))

    def process_bind_param(self, value: Any, dialect: sqlalchemy.Dialect) -> Optional[str]:
        if value is None:
            return value

        # Coerce input to UUID type; in general having UUID on input is the
        # only thing that we want, but there is code right now that uses
        # ints, bytes, and hex strings as well.
        if isinstance(value, uuid.UUID):
            coerced = value
        elif isinstance(value, int):
            coerced = uuid.UUID(int=value)
        elif isinstance(value, bytes):
            coerced = uuid.UUID(bytes=value)
        elif isinstance(value, str):
            # hexstring
            coerced = uuid.UUID(hex=value)
        else:
            raise TypeError(f"Unexpected type of a bind value: {type(value)}")

        if dialect.name == "postgresql":
            return str(coerced)
        return "%.32x" % coerced.int

    def process_result_value(self, value: Optional[str], dialect: sqlalchemy.Dialect) -> Optional[uuid.UUID]:
        # Both backends hand the value back as a hex string.
        return value if value is None else uuid.UUID(hex=value)

251 

252 

# Mapping from the "type" strings permitted in schema configuration to the
# SQLAlchemy (or custom, defined above) column types they correspond to.
# Used by FieldSpec.fromConfig to validate and resolve column types.
VALID_CONFIG_COLUMN_TYPES = {
    "string": sqlalchemy.String,
    "int": sqlalchemy.BigInteger,
    "float": sqlalchemy.Float,
    "region": Base64Region,
    "bool": sqlalchemy.Boolean,
    "blob": sqlalchemy.LargeBinary,
    "datetime": AstropyTimeNsecTai,
    "hash": Base64Bytes,
    "uuid": GUID,
}

264 

265 

@dataclass
class FieldSpec:
    """A data class for defining a column in a logical `Registry` table."""

    name: str
    """Name of the column."""

    dtype: type
    """Type of the column; usually a `type` subclass provided by SQLAlchemy
    that defines both a Python type and a corresponding precise SQL type.
    """

    length: Optional[int] = None
    """Length of the type in the database, for variable-length types."""

    nbytes: Optional[int] = None
    """Natural length used for hash and encoded-region columns, to be converted
    into the post-encoding length.
    """

    primaryKey: bool = False
    """Whether this field is (part of) its table's primary key."""

    autoincrement: bool = False
    """Whether the database should insert automatically incremented values when
    no value is provided in an INSERT.
    """

    nullable: bool = True
    """Whether this field is allowed to be NULL."""

    default: Any = None
    """A server-side default value for this field.

    This is passed directly as the ``server_default`` argument to
    `sqlalchemy.schema.Column`.  It does _not_ go through SQLAlchemy's usual
    type conversion or quoting for Python literals, and should hence be used
    with care.  See the SQLAlchemy documentation for more information.
    """

    doc: Optional[str] = None
    """Documentation for this field."""

    def __eq__(self, other: Any) -> bool:
        # Equality (and hashing, below) is by name only, so collections of
        # fields treat the column name as the key.
        if isinstance(other, FieldSpec):
            return self.name == other.name
        else:
            return NotImplemented

    def __hash__(self) -> int:
        return hash(self.name)

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in column config '{config}'.")
    def fromConfig(cls, config: Config, **kwargs: Any) -> FieldSpec:
        """Create a `FieldSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the column.  Nested configuration keys
            correspond to `FieldSpec` attributes.
        **kwargs
            Additional keyword arguments that provide defaults for values
            not present in config.

        Returns
        -------
        spec: `FieldSpec`
            Specification structure for the column.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        dtype = VALID_CONFIG_COLUMN_TYPES.get(config["type"])
        if dtype is None:
            raise SchemaValidationError(f"Invalid field type string: '{config['type']}'.")
        if not config["name"].islower():
            raise SchemaValidationError(f"Column name '{config['name']}' is not all lowercase.")
        self = cls(name=config["name"], dtype=dtype, **kwargs)
        self.length = config.get("length", self.length)
        self.nbytes = config.get("nbytes", self.nbytes)
        # length and nbytes are alternative ways to size a column; both at
        # once is ambiguous.
        if self.length is not None and self.nbytes is not None:
            raise SchemaValidationError(f"Both length and nbytes provided for field '{self.name}'.")
        self.primaryKey = config.get("primaryKey", self.primaryKey)
        self.autoincrement = config.get("autoincrement", self.autoincrement)
        # Primary key columns default to NOT NULL unless explicitly set.
        self.nullable = config.get("nullable", False if self.primaryKey else self.nullable)
        self.doc = stripIfNotNone(config.get("doc", None))
        return self

    def isStringType(self) -> bool:
        """Indicate that this is a sqlalchemy.String field spec.

        Returns
        -------
        isString : `bool`
            The field refers to a `sqlalchemy.String` and not any other type.
            This can return `False` even if the object was created with a
            string type if it has been decided that it should be implemented
            as a `sqlalchemy.Text` type.
        """
        # Only short strings (truthy length <= 32) are retained as
        # sqlalchemy.String; anything else is implemented as Text.  The
        # original code tested ``dtype == sqlalchemy.String`` twice; the
        # redundant inner check has been removed.
        return bool(self.dtype == sqlalchemy.String and self.length and self.length <= 32)

    def getSizedColumnType(self) -> sqlalchemy.types.TypeEngine:
        """Return a sized version of the column type.

        Utilizes either (or neither) of ``self.length`` and ``self.nbytes``.

        Returns
        -------
        dtype : `sqlalchemy.types.TypeEngine`
            A SQLAlchemy column type object.
        """
        if self.length is not None:
            # Last-chance check that we are only looking at a possible String;
            # long strings are implemented as (unsized) Text.
            if self.dtype == sqlalchemy.String and not self.isStringType():
                return sqlalchemy.Text
            return self.dtype(length=self.length)
        if self.nbytes is not None:
            return self.dtype(nbytes=self.nbytes)
        return self.dtype

    def getPythonType(self) -> type:
        """Return the Python type associated with this field's (SQL) dtype.

        Returns
        -------
        type : `type`
            Python type associated with this field's (SQL) `dtype`.
        """
        # To construct these objects, the nbytes keyword is needed.
        if issubclass(self.dtype, LocalBase64Bytes):
            # satisfy mypy for something that must be true
            assert self.nbytes is not None
            return self.dtype(nbytes=self.nbytes).python_type
        else:
            return self.dtype().python_type  # type: ignore

409 

410 

@dataclass
class ForeignKeySpec:
    """Definition of a foreign key constraint in a logical `Registry` table."""

    table: str
    """Name of the target table."""

    source: Tuple[str, ...]
    """Tuple of source table column names."""

    target: Tuple[str, ...]
    """Tuple of target table column names."""

    onDelete: Optional[str] = None
    """SQL clause indicating how to handle deletes to the target table.

    If not `None` (which indicates that a constraint violation exception should
    be raised), should be either "SET NULL" or "CASCADE".
    """

    addIndex: bool = True
    """If `True`, create an index on the columns of this foreign key in the
    source table.
    """

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in foreignKey config '{config}'.")
    def fromConfig(cls, config: Config) -> ForeignKeySpec:
        """Create a `ForeignKeySpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the constraint.  Nested configuration keys
            correspond to `ForeignKeySpec` attributes.

        Returns
        -------
        spec: `ForeignKeySpec`
            Specification structure for the constraint.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # "source" and "target" may each be a single name or a list of names;
        # normalize both to tuples of column names.
        source_columns = tuple(ensure_iterable(config["source"]))
        target_columns = tuple(ensure_iterable(config["target"]))
        return cls(
            table=config["table"],
            source=source_columns,
            target=target_columns,
            onDelete=config.get("onDelete", None),
        )

463 

464 

@dataclass
class TableSpec:
    """A data class used to define a table or table-like query interface.

    Parameters
    ----------
    fields : `Iterable` [ `FieldSpec` ]
        Specifications for the columns in this table.
    unique : `Iterable` [ `tuple` [ `str` ] ], optional
        Non-primary-key unique constraints for the table.
    indexes: `Iterable` [ `tuple` [ `str` ] ], optional
        Indexes for the table.
    foreignKeys : `Iterable` [ `ForeignKeySpec` ], optional
        Foreign key constraints for the table.
    exclusion : `Iterable` [ `tuple` [ `str` or `type` ] ]
        Special constraints that prohibit overlaps between timespans over rows
        where other columns are equal.  These take the same form as unique
        constraints, but each tuple may contain a single
        `TimespanDatabaseRepresentation` subclass representing a timespan
        column.
    recycleIds : `bool`, optional
        If `True`, allow databases that might normally recycle autoincrement
        IDs to do so (usually better for performance) on any autoincrement
        field in this table.
    doc : `str`, optional
        Documentation for the table.
    """

    # NOTE(review): although this class is decorated with @dataclass, it
    # defines its own __init__; the decorator does not overwrite an __init__
    # present in the class body, so this hand-written one (which normalizes
    # each argument into the container type declared below) is the one used.
    def __init__(
        self,
        fields: Iterable[FieldSpec],
        *,
        unique: Iterable[Tuple[str, ...]] = (),
        indexes: Iterable[Tuple[str, ...]] = (),
        foreignKeys: Iterable[ForeignKeySpec] = (),
        exclusion: Iterable[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]] = (),
        recycleIds: bool = True,
        doc: Optional[str] = None,
    ):
        self.fields = NamedValueSet(fields)
        self.unique = set(unique)
        self.indexes = set(indexes)
        self.foreignKeys = list(foreignKeys)
        self.exclusion = set(exclusion)
        self.recycleIds = recycleIds
        self.doc = doc

    fields: NamedValueSet[FieldSpec]
    """Specifications for the columns in this table."""

    unique: Set[Tuple[str, ...]]
    """Non-primary-key unique constraints for the table."""

    indexes: Set[Tuple[str, ...]]
    """Indexes for the table."""

    foreignKeys: List[ForeignKeySpec]
    """Foreign key constraints for the table."""

    exclusion: Set[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]]
    """Exclusion constraints for the table.

    Exclusion constraints behave mostly like unique constraints, but may
    contain a database-native Timespan column that is restricted to not overlap
    across rows (for identical combinations of any non-Timespan columns in the
    constraint).
    """

    recycleIds: bool = True
    """If `True`, allow databases that might normally recycle autoincrement IDs
    to do so (usually better for performance) on any autoincrement field in
    this table.
    """

    doc: Optional[str] = None
    """Documentation for the table."""

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in table config '{config}'.")
    def fromConfig(cls, config: Config) -> TableSpec:
        """Create a `TableSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the table.  Nested configuration keys
            correspond to `TableSpec` attributes.

        Returns
        -------
        spec: `TableSpec`
            Specification structure for the table.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # NOTE(review): "indexes" and "exclusion" are not read from config
        # here — presumably they never appear in schema configuration;
        # confirm against the config schema before relying on that.
        return cls(
            fields=NamedValueSet(FieldSpec.fromConfig(c) for c in config["columns"]),
            unique={tuple(u) for u in config.get("unique", ())},
            foreignKeys=[ForeignKeySpec.fromConfig(c) for c in config.get("foreignKeys", ())],
            doc=stripIfNotNone(config.get("doc")),
        )