# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "addDatasetForeignKey",
    "makeCalibTableName",
    "makeCalibTableSpec",
    "makeStaticTableSpecs",
    "makeTagTableName",
    "makeTagTableSpec",
    "StaticDatasetTablesTuple",
)

from typing import (
    Any,
    Generic,
    List,
    Optional,
    Type,
    TypeVar,
    Union,
)

from collections import namedtuple

import sqlalchemy

from lsst.daf.butler import (
    DatasetType,
    ddl,
    DimensionUniverse,
    GovernorDimension,
    NamedKeyDict,
    NamedKeyMapping,
)
from lsst.daf.butler import addDimensionForeignKey, TimespanDatabaseRepresentation
from lsst.daf.butler.registry.interfaces import (
    CollectionManager,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)


DATASET_TYPE_NAME_LENGTH = 128


StaticDatasetTablesTuple = namedtuple(
    "StaticDatasetTablesTuple",
    [
        "dataset_type",
        "dataset",
    ]
)


def addDatasetForeignKey(tableSpec: ddl.TableSpec, *,
                         name: str = "dataset",
                         onDelete: Optional[str] = None,
                         constraint: bool = True,
                         **kwargs: Any) -> ddl.FieldSpec:
    """Add a foreign key column for datasets and (optionally) a constraint to
    a table.

    This is an internal interface for the ``byDimensions`` package; external
    code should use `DatasetRecordStorageManager.addDatasetForeignKey` instead.

    Parameters
    ----------
    tableSpec : `ddl.TableSpec`
        Specification for the table that should reference the dataset
        table. Will be modified in place.
    name : `str`, optional
        A name to use for the prefix of the new field; the full name is
        ``{name}_id``.
    onDelete : `str`, optional
        One of "CASCADE" or "SET NULL", indicating what should happen to
        the referencing row if the dataset row is deleted. `None`
        indicates that this should be an integrity error.
    constraint : `bool`, optional
        If `False` (`True` is default), add a field that can be joined to
        the dataset primary key, but do not add a foreign key constraint.
    **kwargs
        Additional keyword arguments are forwarded to the `ddl.FieldSpec`
        constructor (only the ``name`` and ``dtype`` arguments are
        otherwise provided).

    Returns
    -------
    idSpec : `ddl.FieldSpec`
        Specification for the ID field.
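
    Examples
    --------
    A minimal sketch, assuming an empty `ddl.TableSpec`; the ``spec`` name
    and the keyword choices here are illustrative::

        spec = ddl.TableSpec(fields=[])
        idSpec = addDatasetForeignKey(spec, nullable=False, onDelete="CASCADE")
        # idSpec.name == "dataset_id"; spec now also carries a foreign key
        # constraint to dataset.id.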

    """
    idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=sqlalchemy.BigInteger, **kwargs)
    tableSpec.fields.add(idFieldSpec)
    if constraint:
        tableSpec.foreignKeys.append(ddl.ForeignKeySpec("dataset", source=(idFieldSpec.name,),
                                                        target=("id",), onDelete=onDelete))
    return idFieldSpec


def makeStaticTableSpecs(collections: Type[CollectionManager],
                         universe: DimensionUniverse,
                         ) -> StaticDatasetTablesTuple:
    """Construct all static tables used by the classes in this package.

    Static tables are those that are present in all Registries and do not
    depend on what DatasetTypes have been registered.

    Parameters
    ----------
    collections : `type` [ `CollectionManager` ]
        Manager class for the collections in this `Registry`.
    universe : `DimensionUniverse`
        Universe graph containing all dimensions known to this `Registry`.

    Returns
    -------
    specs : `StaticDatasetTablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
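
    Examples
    --------
    A minimal sketch (``MyCollectionManager`` stands in for a concrete
    `CollectionManager` subclass and ``universe`` for a `DimensionUniverse`;
    both names are illustrative)::

        specs = makeStaticTableSpecs(MyCollectionManager, universe)
        specs.dataset_type  # `ddl.TableSpec` for the dataset_type table
        specs.dataset       # `ddl.TableSpec` for the dataset table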

    """
    specs = StaticDatasetTablesTuple(
        dataset_type=ddl.TableSpec(
            fields=[
                ddl.FieldSpec(
                    name="id",
                    dtype=sqlalchemy.BigInteger,
                    autoincrement=True,
                    primaryKey=True,
                    doc=(
                        "Autoincrement ID that uniquely identifies a dataset "
                        "type in other tables. Python code outside the "
                        "`Registry` class should never interact with this; "
                        "its existence is considered an implementation detail."
                    ),
                ),
                ddl.FieldSpec(
                    name="name",
                    dtype=sqlalchemy.String,
                    length=DATASET_TYPE_NAME_LENGTH,
                    nullable=False,
                    doc="String name that uniquely identifies a dataset type.",
                ),
                ddl.FieldSpec(
                    name="storage_class",
                    dtype=sqlalchemy.String,
                    length=64,
                    nullable=False,
                    doc=(
                        "Name of the storage class associated with all "
                        "datasets of this type. Storage classes are "
                        "generally associated with a Python class, and are "
                        "enumerated in butler configuration."
                    )
                ),
                ddl.FieldSpec(
                    name="dimensions_key",
                    dtype=sqlalchemy.BigInteger,
                    nullable=False,
                    doc=(
                        "Unique key for the set of dimensions that identifies "
                        "datasets of this type."
                    ),
                ),
                ddl.FieldSpec(
                    name="tag_association_table",
                    dtype=sqlalchemy.String,
                    length=128,
                    nullable=False,
                    doc=(
                        "Name of the table that holds associations between "
                        "datasets of this type and most types of collections."
                    ),
                ),
                ddl.FieldSpec(
                    name="calibration_association_table",
                    dtype=sqlalchemy.String,
                    length=128,
                    nullable=True,
                    doc=(
                        "Name of the table that holds associations between "
                        "datasets of this type and CALIBRATION collections. "
                        "NULL values indicate dataset types with "
                        "isCalibration=False."
                    ),
                ),
            ],
            unique=[("name",)],
        ),
        dataset=ddl.TableSpec(
            fields=[
                ddl.FieldSpec(
                    name="id",
                    dtype=sqlalchemy.BigInteger,
                    autoincrement=True,
                    primaryKey=True,
                    doc="A unique autoincrement field used as the primary key for dataset.",
                ),
                ddl.FieldSpec(
                    name="dataset_type_id",
                    dtype=sqlalchemy.BigInteger,
                    nullable=False,
                    doc=(
                        "Reference to the associated entry in the dataset_type "
                        "table."
                    ),
                ),
                ddl.FieldSpec(
                    name="ingest_date",
                    dtype=sqlalchemy.TIMESTAMP,
                    default=sqlalchemy.sql.func.now(),
                    nullable=False,
                    doc="Time of dataset ingestion.",
                ),
                # Foreign key field/constraint to run added below.
            ],
            foreignKeys=[
                ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",)),
            ]
        ),
    )
    # Add foreign key fields programmatically.
    collections.addRunForeignKey(specs.dataset, onDelete="CASCADE", nullable=False)
    return specs


_T = TypeVar("_T")


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """
    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[sqlalchemy.schema.Table]:
        """Create all summary tables (or check that they have been created).

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `sqlalchemy.schema.Table` ]
            Structure containing table objects.
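
        Examples
        --------
        A minimal sketch, run inside a `Database.declareStaticTables` context
        (the names ``db``, ``context``, ``manager``, and ``dims`` are
        illustrative)::

            tables = CollectionSummaryTables.initialize(
                db, context, collections=manager, dimensions=dims,
            )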

        """
        specs = cls.makeTableSpecs(collections, dimensions)
        return CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict({
                dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                for dimension, spec in specs.dimensions.items()
            }).freeze(),
        )

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
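
        Examples
        --------
        A minimal sketch (``manager`` stands in for a concrete
        `CollectionManager` and ``dimensions`` for a
        `DimensionRecordStorageManager`; both names are illustrative)::

            tables = CollectionSummaryTables.makeTableSpecs(manager, dimensions)
            tables.datasetType            # spec for collection_summary_dataset_type
            tables.dimensions[dimension]  # spec for collection_summary_<dimension>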

        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",),
                               onDelete="CASCADE")
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.getGovernorDimensions():
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )


def makeTagTableName(datasetType: DatasetType, dimensionsKey: int) -> str:
    """Construct the name for a dynamic (DatasetType-dependent) tag table used
    by the classes in this package.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type to construct a name for. Multiple dataset types may
        share the same table.
    dimensionsKey : `int`
        Integer key used to save ``datasetType.dimensions`` to the database.

    Returns
    -------
    name : `str`
        Name for the table.
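
    Examples
    --------
    The name depends only on ``dimensionsKey``; the ``datasetType`` argument
    is part of the interface but does not affect the result::

        makeTagTableName(datasetType, 12)
        # -> "dataset_tags_00000012"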

    """
    return f"dataset_tags_{dimensionsKey:08d}"


def makeCalibTableName(datasetType: DatasetType, dimensionsKey: int) -> str:
    """Construct the name for a dynamic (DatasetType-dependent) tag + validity
    range table used by the classes in this package.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type to construct a name for. Multiple dataset types may
        share the same table.
    dimensionsKey : `int`
        Integer key used to save ``datasetType.dimensions`` to the database.

    Returns
    -------
    name : `str`
        Name for the table.
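
    Examples
    --------
    ``datasetType`` must be a calibration dataset type (this is asserted);
    the name itself depends only on ``dimensionsKey``::

        makeCalibTableName(datasetType, 12)
        # -> "dataset_calibs_00000012"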

    """
    assert datasetType.isCalibration()
    return f"dataset_calibs_{dimensionsKey:08d}"


def makeTagTableSpec(datasetType: DatasetType, collections: Type[CollectionManager]) -> ddl.TableSpec:
    """Construct the specification for a dynamic (DatasetType-dependent) tag
    table used by the classes in this package.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type to construct a spec for. Multiple dataset types may
        share the same table.
    collections : `type` [ `CollectionManager` ]
        `CollectionManager` subclass that can be used to construct foreign keys
        to the run and/or collection tables.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for the table.
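
    Examples
    --------
    A minimal sketch (``MyCollectionManager`` is an illustrative
    `CollectionManager` subclass)::

        spec = makeTagTableSpec(datasetType, MyCollectionManager)
        # spec has dataset_type_id, dataset_id, and collection foreign key
        # fields, plus one field per required dimension, with a unique
        # constraint over (dataset type, collection, data ID).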

    """
    tableSpec = ddl.TableSpec(
        fields=[
            # Foreign key fields to dataset, collection, and usually dimension
            # tables added below.
            # The dataset_type_id field here would be redundant with the one
            # in the main monolithic dataset table, but we need it here for an
            # important unique constraint.
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, nullable=False),
        ],
        foreignKeys=[
            ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",)),
        ]
    )
    # We'll also have a unique constraint on dataset type, collection, and
    # data ID. We only include the required part of the data ID, as that's
    # sufficient and saves us from worrying about nulls in the constraint.
    constraint = ["dataset_type_id"]
    # Add foreign key fields to dataset table (part of the primary key).
    addDatasetForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
    # Add foreign key fields to collection table (part of the primary key and
    # the data ID unique constraint).
    collectionFieldSpec = collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
    constraint.append(collectionFieldSpec.name)
    # Add foreign key constraint to the collection_summary_dataset_type table.
    tableSpec.foreignKeys.append(
        ddl.ForeignKeySpec(
            "collection_summary_dataset_type",
            source=(collectionFieldSpec.name, "dataset_type_id"),
            target=(collectionFieldSpec.name, "dataset_type_id"),
        )
    )
    for dimension in datasetType.dimensions.required:
        fieldSpec = addDimensionForeignKey(tableSpec, dimension=dimension, nullable=False, primaryKey=False)
        constraint.append(fieldSpec.name)
        # If this is a governor dimension, add a foreign key constraint to the
        # collection_summary_<dimension> table.
        if isinstance(dimension, GovernorDimension):
            tableSpec.foreignKeys.append(
                ddl.ForeignKeySpec(
                    f"collection_summary_{dimension.name}",
                    source=(collectionFieldSpec.name, fieldSpec.name),
                    target=(collectionFieldSpec.name, fieldSpec.name),
                )
            )
    # Actually add the unique constraint.
    tableSpec.unique.add(tuple(constraint))
    return tableSpec


def makeCalibTableSpec(datasetType: DatasetType, collections: Type[CollectionManager],
                       TimespanReprClass: Type[TimespanDatabaseRepresentation]) -> ddl.TableSpec:
    """Construct the specification for a dynamic (DatasetType-dependent) tag +
    validity range table used by the classes in this package.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type to construct a spec for. Multiple dataset types may
        share the same table.
    collections : `type` [ `CollectionManager` ]
        `CollectionManager` subclass that can be used to construct foreign keys
        to the run and/or collection tables.
    TimespanReprClass : `type` [ `TimespanDatabaseRepresentation` ]
        Class that defines how validity-range timespans are represented as
        fields in this database.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for the table.
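
    Examples
    --------
    A minimal sketch (``MyCollectionManager`` and ``MyTimespanRepr`` are
    illustrative stand-ins for concrete subclasses)::

        spec = makeCalibTableSpec(datasetType, MyCollectionManager, MyTimespanRepr)
        # spec resembles the tag table spec, plus validity-range fields and
        # a temporal lookup index (or exclusion constraint, if supported).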

    """
    tableSpec = ddl.TableSpec(
        fields=[
            # This table has no natural primary key, compound or otherwise, so
            # we add an autoincrement key. We may use this field a bit
            # internally, but its presence is an implementation detail and it
            # shouldn't appear as a foreign key in any other tables.
            ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, autoincrement=True, primaryKey=True),
            # Foreign key fields to dataset, collection, and usually dimension
            # tables added below. The dataset_type_id field here is redundant
            # with the one in the main monolithic dataset table, but this bit
            # of denormalization lets us define what should be a much more
            # useful index.
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, nullable=False),
        ],
        foreignKeys=[
            ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",)),
        ]
    )
    # Record fields that should go in the temporal lookup index/constraint,
    # starting with the dataset type.
    index: List[Union[str, Type[TimespanDatabaseRepresentation]]] = ["dataset_type_id"]
    # Add foreign key fields to dataset table (not part of the temporal
    # lookup/constraint).
    addDatasetForeignKey(tableSpec, nullable=False, onDelete="CASCADE")
    # Add foreign key fields to collection table (part of the temporal lookup
    # index/constraint).
    collectionFieldSpec = collections.addCollectionForeignKey(tableSpec, nullable=False, onDelete="CASCADE")
    index.append(collectionFieldSpec.name)
    # Add foreign key constraint to the collection_summary_dataset_type table.
    tableSpec.foreignKeys.append(
        ddl.ForeignKeySpec(
            "collection_summary_dataset_type",
            source=(collectionFieldSpec.name, "dataset_type_id"),
            target=(collectionFieldSpec.name, "dataset_type_id"),
        )
    )
    # Add dimension fields (part of the temporal lookup index/constraint).
    for dimension in datasetType.dimensions.required:
        fieldSpec = addDimensionForeignKey(tableSpec, dimension=dimension, nullable=False, primaryKey=False)
        index.append(fieldSpec.name)
        # If this is a governor dimension, add a foreign key constraint to the
        # collection_summary_<dimension> table.
        if isinstance(dimension, GovernorDimension):
            tableSpec.foreignKeys.append(
                ddl.ForeignKeySpec(
                    f"collection_summary_{dimension.name}",
                    source=(collectionFieldSpec.name, fieldSpec.name),
                    target=(collectionFieldSpec.name, fieldSpec.name),
                )
            )
    # Add validity-range field(s) (part of the temporal lookup
    # index/constraint).
    tsFieldSpecs = TimespanReprClass.makeFieldSpecs(nullable=False)
    for fieldSpec in tsFieldSpecs:
        tableSpec.fields.add(fieldSpec)
    if TimespanReprClass.hasExclusionConstraint():
        # This database's timespan representation can define a database-level
        # constraint that prevents overlapping validity ranges for entries
        # with the same DatasetType, collection, and data ID.
        # This also creates an index.
        index.append(TimespanReprClass)
        tableSpec.exclusion.add(tuple(index))
    else:
        # No database-level constraint is possible. We'll have to simulate
        # that in our DatasetRecordStorage.certify() implementation, and just
        # create a regular index here in the hope that it helps with lookups.
        index.extend(fieldSpec.name for fieldSpec in tsFieldSpecs)
        tableSpec.indexes.add(tuple(index))  # type: ignore
    return tableSpec