Coverage for python / lsst / daf / butler / registry / datasets / byDimensions / tables.py: 0%
118 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-28 08:36 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-28 08:36 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = (
31 "StaticDatasetTableSpecTuple",
32 "StaticDatasetTablesTuple",
33 "addDatasetForeignKey",
34 "makeCalibTableName",
35 "makeCalibTableSpec",
36 "makeStaticTableSpecs",
37 "makeTagTableName",
38 "makeTagTableSpec",
39)
41from typing import Any, NamedTuple, TypeAlias
43import sqlalchemy
45from lsst.utils.classes import immutable
47from .... import ddl
48from ...._utilities.thread_safe_cache import ThreadSafeCache
49from ....dimensions import DimensionGroup, DimensionUniverse, GovernorDimension, addDimensionForeignKey
50from ....timespan_database_representation import TimespanDatabaseRepresentation
51from ...interfaces import CollectionManager, Database, VersionTuple
# Maximum number of characters in a dataset type name; used as the length of
# the ``name`` column in the ``dataset_type`` table specification.
DATASET_TYPE_NAME_LENGTH = 128
class MissingDatabaseTableError(RuntimeError):
    """Error signalling that an expected table could not be found in the
    database.
    """
class StaticDatasetTableSpecTuple(NamedTuple):
    """Named tuple bundling the specifications of the static dataset
    tables.
    """

    # Specification for the ``dataset_type`` table.
    dataset_type: ddl.TableSpec
    # Specification for the ``dataset`` table.
    dataset: ddl.TableSpec
class StaticDatasetTablesTuple(NamedTuple):
    """Named tuple bundling the SQLAlchemy table objects of the static
    dataset tables.
    """

    # SQLAlchemy object for the ``dataset_type`` table.
    dataset_type: sqlalchemy.Table
    # SQLAlchemy object for the ``dataset`` table.
    dataset: sqlalchemy.Table
def addDatasetForeignKey(
    tableSpec: ddl.TableSpec,
    *,
    name: str = "dataset",
    onDelete: str | None = None,
    constraint: bool = True,
    **kwargs: Any,
) -> ddl.FieldSpec:
    """Add a dataset ID column, and optionally a foreign key constraint on
    it, to a table specification.

    This is an internal interface for the ``byDimensions`` package; external
    code should use `DatasetRecordStorageManager.addDatasetForeignKey` instead.

    Parameters
    ----------
    tableSpec : `ddl.TableSpec`
        Specification for the table that should reference the dataset
        table.  Modified in place.
    name : `str`, optional
        Prefix for the name of the new field; the full field name is
        ``{name}_id``.
    onDelete : `str`, optional
        One of "CASCADE" or "SET NULL", indicating what should happen to
        the referencing row if the referenced dataset row is deleted.
        `None` indicates that this should be an integrity error.
    constraint : `bool`, optional
        If `False` (`True` is default), add a field that can be joined to
        the dataset primary key, but do not add a foreign key constraint.
    **kwargs
        Additional keyword arguments forwarded to the `ddl.FieldSpec`
        constructor (only the ``name`` and ``dtype`` arguments are
        otherwise provided).

    Returns
    -------
    idSpec : `ddl.FieldSpec`
        Specification for the ID field.
    """
    id_field = ddl.FieldSpec("{}_id".format(name), dtype=ddl.GUID, **kwargs)
    tableSpec.fields.add(id_field)
    if not constraint:
        # Caller only wants a joinable column, not a database constraint.
        return id_field
    dataset_key = ddl.ForeignKeySpec(
        "dataset",
        source=(id_field.name,),
        target=("id",),
        onDelete=onDelete,
    )
    tableSpec.foreignKeys.append(dataset_key)
    return id_field
def makeStaticTableSpecs(
    collections: type[CollectionManager],
    universe: DimensionUniverse,
    schema_version: VersionTuple,
) -> StaticDatasetTableSpecTuple:
    """Build the specifications for all static tables used by the classes in
    this package.

    Static tables are those that are present in all Registries and do not
    depend on what DatasetTypes have been registered.

    Parameters
    ----------
    collections : `type` [ `CollectionManager` ]
        Manager class for collections; used to add the run foreign key to the
        ``dataset`` table.
    universe : `DimensionUniverse`
        Universe containing all dimensions known to this `Registry`.  Not
        referenced by the body of this function.
    schema_version : `VersionTuple`
        The version of this schema; its major number selects the column type
        used for ``ingest_date``.

    Returns
    -------
    specs : `StaticDatasetTableSpecTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    ingest_date_type: type
    ingest_date_default: Any
    if schema_version.major <= 1:
        ingest_date_type = sqlalchemy.TIMESTAMP
        # New code provides explicit values for ingest_date, but the
        # database-side default is kept to stay consistent with the existing
        # schema.
        ingest_date_default = sqlalchemy.sql.func.now()
    else:
        ingest_date_type = ddl.AstropyTimeNsecTai
        ingest_date_default = None

    dataset_type_spec = ddl.TableSpec(
        fields=[
            ddl.FieldSpec(
                name="id",
                dtype=sqlalchemy.BigInteger,
                autoincrement=True,
                primaryKey=True,
                doc=(
                    "Autoincrement ID that uniquely identifies a dataset "
                    "type in other tables. Python code outside the "
                    "`Registry` class should never interact with this; "
                    "its existence is considered an implementation detail."
                ),
            ),
            ddl.FieldSpec(
                name="name",
                dtype=sqlalchemy.String,
                length=DATASET_TYPE_NAME_LENGTH,
                nullable=False,
                doc="String name that uniquely identifies a dataset type.",
            ),
            ddl.FieldSpec(
                name="storage_class",
                dtype=sqlalchemy.String,
                length=64,
                nullable=False,
                doc=(
                    "Name of the storage class associated with all "
                    "datasets of this type. Storage classes are "
                    "generally associated with a Python class, and are "
                    "enumerated in butler configuration."
                ),
            ),
            ddl.FieldSpec(
                name="dimensions_key",
                dtype=sqlalchemy.BigInteger,
                nullable=False,
                doc="Unique key for the set of dimensions that identifies datasets of this type.",
            ),
            ddl.FieldSpec(
                name="tag_association_table",
                dtype=sqlalchemy.String,
                length=128,
                nullable=False,
                doc=(
                    "Name of the table that holds associations between "
                    "datasets of this type and most types of collections."
                ),
            ),
            ddl.FieldSpec(
                name="calibration_association_table",
                dtype=sqlalchemy.String,
                length=128,
                nullable=True,
                doc=(
                    "Name of the table that holds associations between "
                    "datasets of this type and CALIBRATION collections. "
                    "NULL values indicate dataset types with "
                    "isCalibration=False."
                ),
            ),
        ],
        unique=[("name",)],
    )

    dataset_spec = ddl.TableSpec(
        fields=[
            ddl.FieldSpec(
                name="id",
                dtype=ddl.GUID,
                primaryKey=True,
                doc="A unique field used as the primary key for dataset.",
            ),
            ddl.FieldSpec(
                name="dataset_type_id",
                dtype=sqlalchemy.BigInteger,
                nullable=False,
                doc="Reference to the associated entry in the dataset_type table.",
            ),
            ddl.FieldSpec(
                name="ingest_date",
                dtype=ingest_date_type,
                default=ingest_date_default,
                nullable=False,
                doc="Time of dataset ingestion.",
            ),
        ],
        foreignKeys=[
            ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",)),
        ],
    )
    # The run foreign key field/constraint is delegated to the collection
    # manager rather than being spelled out in the field list above.
    collections.addRunForeignKey(dataset_spec, onDelete="CASCADE", nullable=False)

    return StaticDatasetTableSpecTuple(dataset_type=dataset_type_spec, dataset=dataset_spec)
def makeTagTableName(dimensionsKey: int) -> str:
    """Return the name of the dynamic (DatasetType-dependent) tag table used
    by the classes in this package.

    Parameters
    ----------
    dimensionsKey : `int`
        Integer key used to save ``datasetType.dimensions`` to the database.

    Returns
    -------
    name : `str`
        Name for the table: a fixed prefix followed by the key zero-padded to
        at least eight digits.
    """
    padded_key = format(dimensionsKey, "08d")
    return "dataset_tags_" + padded_key
def makeCalibTableName(dimensionsKey: int) -> str:
    """Return the name of the dynamic (DatasetType-dependent) tag + validity
    range table used by the classes in this package.

    Parameters
    ----------
    dimensionsKey : `int`
        Integer key used to save ``datasetType.dimensions`` to the database.

    Returns
    -------
    name : `str`
        Name for the table: a fixed prefix followed by the key zero-padded to
        at least eight digits.
    """
    padded_key = format(dimensionsKey, "08d")
    return "dataset_calibs_" + padded_key
def makeTagTableSpec(
    dimensions: DimensionGroup, collections: type[CollectionManager], *, constraints: bool = True
) -> ddl.TableSpec:
    """Build the specification for a dynamic (DatasetType-dependent) tag
    table used by the classes in this package.

    Parameters
    ----------
    dimensions : `DimensionGroup`
        Dimensions of the dataset type.
    collections : `type` [ `CollectionManager` ]
        `CollectionManager` subclass that can be used to construct foreign keys
        to the run and/or collection tables.
    constraints : `bool`, optional
        If `False` (`True` is default), do not define foreign key constraints.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for the table.
    """
    # Only dataset_type_id is declared up front; the dataset, collection and
    # dimension columns are added by the helper calls below.  dataset_type_id
    # duplicates the column in the main dataset table, but it is needed here
    # for the unique constraint.
    spec = ddl.TableSpec(
        fields=[ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, nullable=False)]
    )
    if constraints:
        spec.foreignKeys.append(
            ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",))
        )

    # Dataset column (part of the primary key).
    addDatasetForeignKey(spec, primaryKey=True, onDelete="CASCADE", constraint=constraints)

    # Collection column (part of the primary key and of the unique
    # constraint).
    collection_column = collections.addCollectionForeignKey(
        spec, primaryKey=True, onDelete="CASCADE", constraint=constraints
    ).name

    # Columns of the unique constraint: dataset type, collection, then the
    # required part of the data ID.  Using only required dimensions is
    # sufficient and avoids nulls in the constraint.
    unique_columns = ["dataset_type_id", collection_column]

    if constraints:
        # Tie (collection, dataset type) to the collection summary table.
        summary_key = (collection_column, "dataset_type_id")
        spec.foreignKeys.append(
            ddl.ForeignKeySpec("collection_summary_dataset_type", source=summary_key, target=summary_key)
        )

    for required_name in dimensions.required:
        dimension = dimensions.universe.dimensions[required_name]
        dimension_column = addDimensionForeignKey(
            spec, dimension=dimension, nullable=False, primaryKey=False, constraint=constraints
        ).name
        unique_columns.append(dimension_column)
        # Governor dimensions additionally reference their
        # collection_summary_<dimension> table.
        if constraints and isinstance(dimension, GovernorDimension):
            governor_key = (collection_column, dimension_column)
            spec.foreignKeys.append(
                ddl.ForeignKeySpec(
                    f"collection_summary_{dimension.name}", source=governor_key, target=governor_key
                )
            )

    spec.unique.add(tuple(unique_columns))
    return spec
def makeCalibTableSpec(
    dimensions: DimensionGroup,
    collections: type[CollectionManager],
    TimespanReprClass: type[TimespanDatabaseRepresentation],
) -> ddl.TableSpec:
    """Build the specification for a dynamic (DatasetType-dependent) tag +
    validity range table used by the classes in this package.

    Parameters
    ----------
    dimensions : `DimensionGroup`
        Dimensions of the dataset type.
    collections : `type` [ `CollectionManager` ]
        `CollectionManager` subclass that can be used to construct foreign keys
        to the run and/or collection tables.
    TimespanReprClass : `type` of `TimespanDatabaseRepresentation`
        The Python type to use to represent a timespan.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for the table.
    """
    spec = ddl.TableSpec(
        fields=[
            # There is no natural primary key for this table, so an
            # autoincrement one is used.  It is an implementation detail and
            # should not appear as a foreign key in other tables.
            ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, autoincrement=True, primaryKey=True),
            # dataset_type_id duplicates the column in the main dataset
            # table; the denormalization allows a more useful index.  The
            # dataset, collection and dimension columns are added below.
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, nullable=False),
        ],
        foreignKeys=[
            ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",)),
        ],
    )

    # Dataset column: not part of the temporal lookup index/constraint.
    addDatasetForeignKey(spec, nullable=False, onDelete="CASCADE")

    # Collection column: part of the temporal lookup index/constraint.
    collection_column = collections.addCollectionForeignKey(spec, nullable=False, onDelete="CASCADE").name

    # Entries of the temporal lookup index/constraint, in order: dataset
    # type, collection, required dimensions, then the timespan.
    lookup: list[str | type[TimespanDatabaseRepresentation]] = ["dataset_type_id", collection_column]

    # Tie (collection, dataset type) to the collection summary table.
    summary_key = (collection_column, "dataset_type_id")
    spec.foreignKeys.append(
        ddl.ForeignKeySpec("collection_summary_dataset_type", source=summary_key, target=summary_key)
    )

    for required_name in dimensions.required:
        dimension = dimensions.universe.dimensions[required_name]
        dimension_column = addDimensionForeignKey(
            spec, dimension=dimension, nullable=False, primaryKey=False
        ).name
        lookup.append(dimension_column)
        # Governor dimensions additionally reference their
        # collection_summary_<dimension> table.
        if isinstance(dimension, GovernorDimension):
            governor_key = (collection_column, dimension_column)
            spec.foreignKeys.append(
                ddl.ForeignKeySpec(
                    f"collection_summary_{dimension.name}", source=governor_key, target=governor_key
                )
            )

    # Validity-range column(s): part of the temporal lookup index/constraint.
    timespan_fields = TimespanReprClass.makeFieldSpecs(nullable=False)
    for timespan_field in timespan_fields:
        spec.fields.add(timespan_field)

    if not TimespanReprClass.hasExclusionConstraint():
        # No database-level constraint is possible, so overlap checks must be
        # simulated in the DatasetRecordStorage.certify() implementation; a
        # regular index is created here in the hope that it helps lookups.
        lookup.extend(timespan_field.name for timespan_field in timespan_fields)
        spec.indexes.add(ddl.IndexSpec(*lookup))  # type: ignore
        return spec

    # The timespan representation supports a database-level constraint that
    # prevents overlapping validity ranges for entries with the same
    # DatasetType, collection, and data ID.  This also creates an index.
    lookup.append(TimespanReprClass)
    spec.exclusion.add(tuple(lookup))
    return spec
# Cache of SQLAlchemy table objects keyed by a string; the `DynamicTables`
# methods look tables up in it by table name.
TableCache: TypeAlias = ThreadSafeCache[str, sqlalchemy.Table]
@immutable
class DynamicTables:
    """A struct that holds the "dynamic" tables common to dataset types that
    share the same dimensions.

    Objects of this class may be shared between multiple threads, so it must be
    immutable to prevent concurrency issues.

    Parameters
    ----------
    dimensions : `DimensionGroup`
        Dimensions of the dataset types that use these tables.
    dimensions_key : `int`
        Integer key used to persist this dimension group in the database and
        name the associated tables.
    tags_name : `str`
        Name of the "tags" table that associates datasets with data IDs in
        RUN and TAGGED collections.
    calibs_name : `str` or `None`
        Name of the "calibs" table that associates datasets with data IDs and
        timespans in CALIBRATION collections.  This is `None` if none of the
        dataset types (or at least none of those seen by this client) are
        calibrations.
    """

    def __init__(
        self, dimensions: DimensionGroup, dimensions_key: int, tags_name: str, calibs_name: str | None
    ):
        self._dimensions = dimensions
        self.dimensions_key = dimensions_key
        self.tags_name = tags_name
        self.calibs_name = calibs_name

    def copy(self, calibs_name: str) -> DynamicTables:
        """Return a new instance that differs only in its calibs table name.

        Parameters
        ----------
        calibs_name : `str`
            Name of the "calibs" table for the new instance.

        Returns
        -------
        dynamic_tables : `DynamicTables`
            New struct with the same dimensions, dimensions key and tags
            table name as this one.
        """
        return DynamicTables(self._dimensions, self.dimensions_key, self.tags_name, calibs_name)

    @classmethod
    def from_dimensions_key(
        cls, dimensions: DimensionGroup, dimensions_key: int, is_calibration: bool
    ) -> DynamicTables:
        """Construct with table names generated from the dimension key.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Dimensions of the dataset types that use these tables.
        dimensions_key : `int`
            Integer key used to persist this dimension group in the database
            and name the associated tables.
        is_calibration : `bool`
            Whether any of the dataset types that use these tables are
            calibrations.

        Returns
        -------
        dynamic_tables : `DynamicTables`
            Struct that holds tables for a group of dataset types.
        """
        tags_name = makeTagTableName(dimensions_key)
        # A calibs table name is only generated for calibration dataset types.
        calibs_name: str | None = None
        if is_calibration:
            calibs_name = makeCalibTableName(dimensions_key)
        return cls(
            dimensions,
            dimensions_key=dimensions_key,
            tags_name=tags_name,
            calibs_name=calibs_name,
        )

    def create(self, db: Database, collections: type[CollectionManager], cache: TableCache) -> None:
        """Create the tables if they don't already exist.

        Parameters
        ----------
        db : `Database`
            Database interface.
        collections : `type` [ `CollectionManager` ]
            Manager class for collections; used to create foreign key columns
            for collections.
        cache : `TableCache`
            Cache used to store sqlalchemy Table objects.
        """
        tags_name = self.tags_name
        if cache.get(tags_name) is None:
            tags_spec = makeTagTableSpec(self._dimensions, collections)
            cache.set_or_get(tags_name, db.ensureTableExists(tags_name, tags_spec))

        calibs_name = self.calibs_name
        if calibs_name is None or cache.get(calibs_name) is not None:
            # Either there is no calibs table for these dimensions, or it is
            # already in the cache.
            return
        calibs_spec = makeCalibTableSpec(self._dimensions, collections, db.getTimespanRepresentation())
        cache.set_or_get(calibs_name, db.ensureTableExists(calibs_name, calibs_spec))

    def add_calibs(
        self, db: Database, collections: type[CollectionManager], cache: TableCache
    ) -> DynamicTables:
        """Create a calibs table for a dataset type whose dimensions already
        have a tags table.

        Parameters
        ----------
        db : `Database`
            Database interface.
        collections : `type` [ `CollectionManager` ]
            Manager class for collections; used to create foreign key columns
            for collections.
        cache : `TableCache`
            Cache used to store sqlalchemy Table objects.

        Returns
        -------
        dynamic_tables : `DynamicTables`
            Copy of this struct with the calibs table name filled in.
        """
        calibs_name = makeCalibTableName(self.dimensions_key)
        calibs_spec = makeCalibTableSpec(self._dimensions, collections, db.getTimespanRepresentation())
        calibs_table = db.ensureTableExists(calibs_name, calibs_spec)
        cache.set_or_get(calibs_name, calibs_table)
        return self.copy(calibs_name=calibs_name)

    def tags(self, db: Database, collections: type[CollectionManager], cache: TableCache) -> sqlalchemy.Table:
        """Return the "tags" table that associates datasets with data IDs in
        TAGGED and RUN collections.

        This method caches its result the first time it is called (and assumes
        the arguments it is given never change).

        Parameters
        ----------
        db : `Database`
            Database interface.
        collections : `type` [ `CollectionManager` ]
            Manager class for collections; used to create foreign key columns
            for collections.
        cache : `TableCache`
            Cache used to store sqlalchemy Table objects.

        Returns
        -------
        table : `sqlalchemy.Table`
            SQLAlchemy table object.

        Raises
        ------
        MissingDatabaseTableError
            Raised if the table is neither cached nor found in the database.
        """
        if (cached := cache.get(self.tags_name)) is not None:
            return cached

        tags_spec = makeTagTableSpec(self._dimensions, collections)
        existing = db.getExistingTable(self.tags_name, tags_spec)
        if existing is None:
            raise MissingDatabaseTableError(f"Table {self.tags_name!r} is missing from database schema.")
        return cache.set_or_get(self.tags_name, existing)

    def calibs(
        self, db: Database, collections: type[CollectionManager], cache: TableCache
    ) -> sqlalchemy.Table:
        """Return the "calibs" table that associates datasets with data IDs and
        timespans in CALIBRATION collections.

        This method caches its result the first time it is called (and assumes
        the arguments it is given never change).  It may only be called if the
        dataset type is calibration.

        Parameters
        ----------
        db : `Database`
            Database interface.
        collections : `type` [ `CollectionManager` ]
            Manager class for collections; used to create foreign key columns
            for collections.
        cache : `TableCache`
            Cache used to store sqlalchemy Table objects.

        Returns
        -------
        table : `sqlalchemy.Table`
            SQLAlchemy table object.

        Raises
        ------
        MissingDatabaseTableError
            Raised if the table is neither cached nor found in the database.
        """
        assert self.calibs_name is not None, (
            "Dataset type should be checked to be calibration by calling code."
        )
        if (cached := cache.get(self.calibs_name)) is not None:
            return cached

        calibs_spec = makeCalibTableSpec(self._dimensions, collections, db.getTimespanRepresentation())
        existing = db.getExistingTable(self.calibs_name, calibs_spec)
        if existing is None:
            raise MissingDatabaseTableError(f"Table {self.calibs_name!r} is missing from database schema.")
        return cache.set_or_get(self.calibs_name, existing)