Coverage for python/lsst/daf/butler/core/ddl.py: 46%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

191 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21"""Classes for representing SQL data-definition language (DDL) in Python. 

22 

23This includes "CREATE TABLE" etc. 

24 

25This provides an extra layer on top of SQLAlchemy's classes for these concepts, 

26because we need a level of indirection between logical tables and the actual 

27SQL, and SQLAlchemy's DDL classes always map 1-1 to SQL. 

28 

29We've opted for the rather more obscure "ddl" as the name of this module 

30instead of "schema" because the latter is too overloaded; in most SQL 

31databases, a "schema" is also another term for a namespace. 

32""" 

33from __future__ import annotations 

34 

35from lsst import sphgeom 

36 

37__all__ = ("TableSpec", "FieldSpec", "ForeignKeySpec", "Base64Bytes", "Base64Region", 

38 "AstropyTimeNsecTai", "GUID") 

39 

40from base64 import b64encode, b64decode 

41import logging 

42from math import ceil 

43from dataclasses import dataclass 

44from typing import Any, Callable, Iterable, List, Optional, Set, Tuple, Type, TYPE_CHECKING, Union 

45import uuid 

46 

47import sqlalchemy 

48from sqlalchemy.dialects.postgresql import UUID 

49import astropy.time 

50 

51from lsst.utils.iteration import ensure_iterable 

52from lsst.sphgeom import Region 

53from .config import Config 

54from .exceptions import ValidationError 

55from . import time_utils 

56from .utils import stripIfNotNone 

57from .named import NamedValueSet 

58 

if TYPE_CHECKING:

60 from .timespan import TimespanDatabaseRepresentation 

61 

62 

63_LOG = logging.getLogger(__name__) 

64 

65 

class SchemaValidationError(ValidationError):
    """Exceptions that indicate problems in Registry schema configuration."""

    @classmethod
    def translate(cls, caught: Type[Exception], message: str) -> Callable:
        """Return decorator to re-raise exceptions as `SchemaValidationError`.

        Decorated functions must be class or instance methods, with a
        ``config`` parameter as their first argument.  This will be passed
        to ``message.format()`` as a keyword argument, along with ``err``,
        the original exception.

        Parameters
        ----------
        caught : `type` (`Exception` subclass)
            The type of exception to catch.
        message : `str`
            A `str.format` string that may contain named placeholders for
            ``config``, ``err``, or any keyword-only argument accepted by
            the decorated function.
        """
        def decorate(func: Callable) -> Callable:
            def decorated(self: Any, config: Config, *args: Any, **kwargs: Any) -> Any:
                try:
                    return func(self, config, *args, **kwargs)
                except caught as err:
                    # Chain the original exception so tracebacks show the
                    # underlying cause, not just the translated error.
                    raise cls(message.format(config=str(config), err=err)) from err
            return decorated
        return decorate

95 

96 

class Base64Bytes(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `bytes`.

    Maps Python `bytes` to a base64-encoded `sqlalchemy.Text` field.
    """

    impl = sqlalchemy.Text

    cache_ok = True

    def __init__(self, nbytes: int, *args: Any, **kwargs: Any):
        # Base64 encoding expands every 3 input bytes into 4 output
        # characters, so size String columns accordingly; Text columns
        # are unsized.
        if self.impl == sqlalchemy.String:
            length = 4 * ceil(nbytes / 3)
        else:
            length = None
        super().__init__(*args, length=length, **kwargs)
        self.nbytes = nbytes

    def process_bind_param(self, value: Optional[bytes], dialect: sqlalchemy.engine.Dialect
                           ) -> Optional[str]:
        # Encode native `bytes` to base64 `bytes`, then decode that to an
        # ASCII `str`, because `str` is what SQLAlchemy expects for
        # string-like columns.
        if value is None:
            return None
        if not isinstance(value, bytes):
            raise TypeError(
                f"Base64Bytes fields require 'bytes' values; got '{value}' with type {type(value)}."
            )
        return b64encode(value).decode("ascii")

    def process_result_value(self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
                             ) -> Optional[bytes]:
        # Reverse of process_bind_param: ASCII `str` -> base64 `bytes`
        # -> native `bytes`.
        if value is None:
            return None
        return b64decode(value.encode("ascii"))

    @property
    def python_type(self) -> Type[bytes]:
        return bytes

135 

136 

# Create an alias, for use below, to disambiguate from the built-in
# sqlalchemy types (used by FieldSpec.getPythonType to detect fields that
# must be constructed with an ``nbytes`` argument).
LocalBase64Bytes = Base64Bytes

140 

141 

class Base64Region(Base64Bytes):
    """A SQLAlchemy custom type for Python `sphgeom.Region`.

    Maps Python `sphgeom.Region` to a base64-encoded `sqlalchemy.String`.
    """

    def process_bind_param(self, value: Optional[Region], dialect: sqlalchemy.engine.Dialect
                           ) -> Optional[str]:
        # Serialize the region to bytes, then delegate base64 encoding to
        # the parent class.
        return None if value is None else super().process_bind_param(value.encode(), dialect)

    def process_result_value(self, value: Optional[str], dialect: sqlalchemy.engine.Dialect
                             ) -> Optional[Region]:
        # Delegate base64 decoding to the parent class, then reconstruct
        # the region from the resulting bytes.
        decoded = super().process_result_value(value, dialect)
        return None if decoded is None else Region.decode(decoded)

    @property
    def python_type(self) -> Type[sphgeom.Region]:
        return sphgeom.Region

163 

164 

class AstropyTimeNsecTai(sqlalchemy.TypeDecorator):
    """A SQLAlchemy custom type for Python `astropy.time.Time`.

    Maps Python `astropy.time.Time` to a number of nanoseconds since Unix
    epoch in TAI scale.
    """

    impl = sqlalchemy.BigInteger

    cache_ok = True

    def process_bind_param(self, value: Optional[astropy.time.Time], dialect: sqlalchemy.engine.Dialect
                           ) -> Optional[int]:
        # Convert an astropy Time into integer nanoseconds since epoch.
        if value is None:
            return None
        if not isinstance(value, astropy.time.Time):
            raise TypeError(f"Unsupported type: {type(value)}, expected astropy.time.Time")
        return time_utils.TimeConverter().astropy_to_nsec(value)

    def process_result_value(self, value: Optional[int], dialect: sqlalchemy.engine.Dialect
                             ) -> Optional[astropy.time.Time]:
        # 'value' is nanoseconds since epoch (or None); convert back to an
        # astropy Time object.
        if value is None:
            return None
        return time_utils.TimeConverter().nsec_to_astropy(value)

192 

193 

class GUID(sqlalchemy.TypeDecorator):
    """Platform-independent GUID type.

    Uses PostgreSQL's UUID type, otherwise uses CHAR(32), storing as
    stringified hex values.
    """

    impl = sqlalchemy.CHAR

    cache_ok = True

    def load_dialect_impl(self, dialect: sqlalchemy.Dialect) -> sqlalchemy.TypeEngine:
        # Use the backend-native UUID column where available.
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(sqlalchemy.CHAR(32))

    def process_bind_param(self, value: Any, dialect: sqlalchemy.Dialect) -> Optional[str]:
        if value is None:
            return value

        # Coerce input to UUID type, in general having UUID on input is the
        # only thing that we want but there is code right now that uses ints.
        if isinstance(value, uuid.UUID):
            coerced = value
        elif isinstance(value, int):
            coerced = uuid.UUID(int=value)
        elif isinstance(value, bytes):
            coerced = uuid.UUID(bytes=value)
        elif isinstance(value, str):
            # hexstring
            coerced = uuid.UUID(hex=value)
        else:
            raise TypeError(f"Unexpected type of a bind value: {type(value)}")

        # PostgreSQL's native type takes the canonical string form; CHAR(32)
        # stores the zero-padded 32-digit hex representation.
        if dialect.name == 'postgresql':
            return str(coerced)
        return "%.32x" % coerced.int

    def process_result_value(self, value: Optional[str], dialect: sqlalchemy.Dialect) -> Optional[uuid.UUID]:
        # Both storage forms round-trip through the hex constructor.
        return uuid.UUID(hex=value) if value is not None else None

237 

238 

# Mapping from "type" strings accepted in schema configuration to the
# SQLAlchemy (or custom, defined above) column types they correspond to;
# used by FieldSpec.fromConfig to resolve column dtypes.
VALID_CONFIG_COLUMN_TYPES = {
    "string": sqlalchemy.String,
    "int": sqlalchemy.BigInteger,
    "float": sqlalchemy.Float,
    "region": Base64Region,
    "bool": sqlalchemy.Boolean,
    "blob": sqlalchemy.LargeBinary,
    "datetime": AstropyTimeNsecTai,
    "hash": Base64Bytes,
    "uuid": GUID,
}

250 

251 

@dataclass
class FieldSpec:
    """A data class for defining a column in a logical `Registry` table."""

    name: str
    """Name of the column."""

    dtype: type
    """Type of the column; usually a `type` subclass provided by SQLAlchemy
    that defines both a Python type and a corresponding precise SQL type.
    """

    length: Optional[int] = None
    """Length of the type in the database, for variable-length types."""

    nbytes: Optional[int] = None
    """Natural length used for hash and encoded-region columns, to be converted
    into the post-encoding length.
    """

    primaryKey: bool = False
    """Whether this field is (part of) its table's primary key."""

    autoincrement: bool = False
    """Whether the database should insert automatically incremented values when
    no value is provided in an INSERT.
    """

    nullable: bool = True
    """Whether this field is allowed to be NULL."""

    default: Any = None
    """A server-side default value for this field.

    This is passed directly as the ``server_default`` argument to
    `sqlalchemy.schema.Column`.  It does _not_ go through SQLAlchemy's usual
    type conversion or quoting for Python literals, and should hence be used
    with care.  See the SQLAlchemy documentation for more information.
    """

    doc: Optional[str] = None
    """Documentation for this field."""

    def __eq__(self, other: Any) -> bool:
        # Equality (and hashing, below) considers only the column name, so
        # containers of FieldSpec behave like containers of column names.
        if isinstance(other, FieldSpec):
            return self.name == other.name
        else:
            return NotImplemented

    def __hash__(self) -> int:
        return hash(self.name)

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in column config '{config}'.")
    def fromConfig(cls, config: Config, **kwargs: Any) -> FieldSpec:
        """Create a `FieldSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the column.  Nested configuration keys
            correspond to `FieldSpec` attributes.
        **kwargs
            Additional keyword arguments that provide defaults for values
            not present in config.

        Returns
        -------
        spec: `FieldSpec`
            Specification structure for the column.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        dtype = VALID_CONFIG_COLUMN_TYPES.get(config["type"])
        if dtype is None:
            raise SchemaValidationError(f"Invalid field type string: '{config['type']}'.")
        if not config["name"].islower():
            raise SchemaValidationError(f"Column name '{config['name']}' is not all lowercase.")
        self = cls(name=config["name"], dtype=dtype, **kwargs)
        self.length = config.get("length", self.length)
        self.nbytes = config.get("nbytes", self.nbytes)
        # length and nbytes are alternative ways to size a column; exactly
        # one (or neither) may be given.
        if self.length is not None and self.nbytes is not None:
            raise SchemaValidationError(f"Both length and nbytes provided for field '{self.name}'.")
        self.primaryKey = config.get("primaryKey", self.primaryKey)
        self.autoincrement = config.get("autoincrement", self.autoincrement)
        # Primary key columns default to NOT NULL unless explicitly configured.
        self.nullable = config.get("nullable", False if self.primaryKey else self.nullable)
        self.doc = stripIfNotNone(config.get("doc", None))
        return self

    def isStringType(self) -> bool:
        """Indicate that this is a sqlalchemy.String field spec.

        Returns
        -------
        isString : `bool`
            The field refers to a `sqlalchemy.String` and not any other type.
            This can return `False` even if the object was created with a
            string type if it has been decided that it should be implemented
            as a `sqlalchemy.Text` type.
        """
        # Only short strings (length given and <= 32) are kept as
        # sqlalchemy.String; longer ones become sqlalchemy.Text in
        # getSizedColumnType.  (The original nested the same dtype check
        # twice; this single expression is equivalent.)
        return bool(self.dtype == sqlalchemy.String and self.length and self.length <= 32)

    def getSizedColumnType(self) -> sqlalchemy.types.TypeEngine:
        """Return a sized version of the column type.

        Utilizes either (or neither) of ``self.length`` and ``self.nbytes``.

        Returns
        -------
        dtype : `sqlalchemy.types.TypeEngine`
            A SQLAlchemy column type object.
        """
        if self.length is not None:
            # Last chance check that we are only looking at possible String
            if self.dtype == sqlalchemy.String and not self.isStringType():
                return sqlalchemy.Text
            return self.dtype(length=self.length)
        if self.nbytes is not None:
            return self.dtype(nbytes=self.nbytes)
        return self.dtype

    def getPythonType(self) -> type:
        """Return the Python type associated with this field's (SQL) dtype.

        Returns
        -------
        type : `type`
            Python type associated with this field's (SQL) `dtype`.
        """
        # to construct these objects, nbytes keyword is needed
        if issubclass(self.dtype, LocalBase64Bytes):
            # satisfy mypy for something that must be true
            assert self.nbytes is not None
            return self.dtype(nbytes=self.nbytes).python_type
        else:
            return self.dtype().python_type  # type: ignore

395 

396 

@dataclass
class ForeignKeySpec:
    """Definition of a foreign key constraint in a logical `Registry` table."""

    table: str
    """Name of the target table."""

    source: Tuple[str, ...]
    """Tuple of source table column names."""

    target: Tuple[str, ...]
    """Tuple of target table column names."""

    onDelete: Optional[str] = None
    """SQL clause indicating how to handle deletes to the target table.

    If not `None` (which indicates that a constraint violation exception should
    be raised), should be either "SET NULL" or "CASCADE".
    """

    addIndex: bool = True
    """If `True`, create an index on the columns of this foreign key in the
    source table.
    """

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in foreignKey config '{config}'.")
    def fromConfig(cls, config: Config) -> ForeignKeySpec:
        """Create a `ForeignKeySpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the constraint.  Nested configuration keys
            correspond to `ForeignKeySpec` attributes.

        Returns
        -------
        spec: `ForeignKeySpec`
            Specification structure for the constraint.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # Scalar column names in the config are accepted and normalized to
        # one-element tuples by ensure_iterable.
        sourceColumns = tuple(ensure_iterable(config["source"]))
        targetColumns = tuple(ensure_iterable(config["target"]))
        return cls(
            table=config["table"],
            source=sourceColumns,
            target=targetColumns,
            onDelete=config.get("onDelete", None),
        )

447 

448 

@dataclass
class TableSpec:
    """A data class used to define a table or table-like query interface.

    Parameters
    ----------
    fields : `Iterable` [ `FieldSpec` ]
        Specifications for the columns in this table.
    unique : `Iterable` [ `tuple` [ `str` ] ], optional
        Non-primary-key unique constraints for the table.
    indexes: `Iterable` [ `tuple` [ `str` ] ], optional
        Indexes for the table.
    foreignKeys : `Iterable` [ `ForeignKeySpec` ], optional
        Foreign key constraints for the table.
    exclusion : `Iterable` [ `tuple` [ `str` or `type` ] ]
        Special constraints that prohibit overlaps between timespans over rows
        where other columns are equal.  These take the same form as unique
        constraints, but each tuple may contain a single
        `TimespanDatabaseRepresentation` subclass representing a timespan
        column.
    recycleIds : `bool`, optional
        If `True`, allow databases that might normally recycle autoincrement
        IDs to do so (usually better for performance) on any autoincrement
        field in this table.
    doc : `str`, optional
        Documentation for the table.
    """

    # An explicit __init__ is defined so each iterable argument is normalized
    # to a concrete container type; per the dataclasses docs, @dataclass does
    # not generate __init__ when the class already defines one.
    def __init__(
        self, fields: Iterable[FieldSpec], *,
        unique: Iterable[Tuple[str, ...]] = (),
        indexes: Iterable[Tuple[str, ...]] = (),
        foreignKeys: Iterable[ForeignKeySpec] = (),
        exclusion: Iterable[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]] = (),
        recycleIds: bool = True,
        doc: Optional[str] = None,
    ):
        self.fields = NamedValueSet(fields)
        self.unique = set(unique)
        self.indexes = set(indexes)
        self.foreignKeys = list(foreignKeys)
        self.exclusion = set(exclusion)
        self.recycleIds = recycleIds
        self.doc = doc

    fields: NamedValueSet[FieldSpec]
    """Specifications for the columns in this table."""

    unique: Set[Tuple[str, ...]]
    """Non-primary-key unique constraints for the table."""

    indexes: Set[Tuple[str, ...]]
    """Indexes for the table."""

    foreignKeys: List[ForeignKeySpec]
    """Foreign key constraints for the table."""

    exclusion: Set[Tuple[Union[str, Type[TimespanDatabaseRepresentation]], ...]]
    """Exclusion constraints for the table.

    Exclusion constraints behave mostly like unique constraints, but may
    contain a database-native Timespan column that is restricted to not overlap
    across rows (for identical combinations of any non-Timespan columns in the
    constraint).
    """

    recycleIds: bool = True
    """If `True`, allow databases that might normally recycle autoincrement IDs
    to do so (usually better for performance) on any autoincrement field in
    this table.
    """

    doc: Optional[str] = None
    """Documentation for the table."""

    @classmethod
    @SchemaValidationError.translate(KeyError, "Missing key {err} in table config '{config}'.")
    def fromConfig(cls, config: Config) -> TableSpec:
        """Create a `TableSpec` from a subset of a `SchemaConfig`.

        Parameters
        ----------
        config: `Config`
            Configuration describing the constraint.  Nested configuration keys
            correspond to `TableSpec` attributes.

        Returns
        -------
        spec: `TableSpec`
            Specification structure for the table.

        Raises
        ------
        SchemaValidationError
            Raised if configuration keys are missing or have invalid values.
        """
        # NOTE(review): only "columns", "unique", "foreignKeys", and "doc" are
        # read from config here; "indexes" and "exclusion" are not — confirm
        # whether that is intentional.
        return cls(
            fields=NamedValueSet(FieldSpec.fromConfig(c) for c in config["columns"]),
            unique={tuple(u) for u in config.get("unique", ())},
            foreignKeys=[ForeignKeySpec.fromConfig(c) for c in config.get("foreignKeys", ())],
            doc=stripIfNotNone(config.get("doc")),
        )