Coverage for python/lsst/daf/butler/dimensions/_schema.py: 25%
176 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-05-02 03:16 -0700
« prev ^ index » next coverage.py v7.5.0, created at 2024-05-02 03:16 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = ("addDimensionForeignKey", "DimensionRecordSchema")
31import copy
32from collections.abc import Mapping, Set
33from typing import TYPE_CHECKING
35from lsst.utils.classes import cached_getter, immutable
37from .. import arrow_utils, ddl
38from .._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag
39from .._named import NamedValueAbstractSet, NamedValueSet
40from ..column_spec import RegionColumnSpec, TimespanColumnSpec
41from ..timespan_database_representation import TimespanDatabaseRepresentation
43if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
44 from lsst.daf.relation import ColumnTag
46 from ._elements import Dimension, DimensionElement, KeyColumnSpec, MetadataColumnSpec
47 from ._group import DimensionGroup
@immutable
class DimensionRecordSchema:
    """Describe the columns that make up a dimension element's records.

    Instances are normally obtained from `DimensionElement.schema`, where
    they are cached on first use, rather than constructed directly.

    Parameters
    ----------
    element : `DimensionElement`
        Element this object describes.
    """

    def __init__(self, element: DimensionElement):
        self.element = element
        # Build every set as a local first; each attribute is then assigned
        # exactly once at the end of the constructor.
        required: NamedValueSet[KeyColumnSpec] = NamedValueSet()
        implied: NamedValueSet[KeyColumnSpec] = NamedValueSet()
        dimensions: NamedValueSet[KeyColumnSpec] = NamedValueSet()
        for dimension in element.required:
            # A Dimension instance is in its own required dependency graph
            # (always at the end, because of topological ordering).  Its own
            # key keeps its name; every other key is renamed after the
            # dimension it refers to.
            key_spec = (
                dimension.primary_key.model_copy(update={"name": dimension.name})
                if dimension != element
                else element.primary_key  # type: ignore
            )
            required.add(key_spec)
            dimensions.add(key_spec)
        for dimension in element.implied:
            key_spec = dimension.primary_key.model_copy(update={"name": dimension.name})
            implied.add(key_spec)
            dimensions.add(key_spec)
        # Everything that is not a dimension key: non-primary unique keys,
        # then other metadata columns, then any region and/or timespan.
        remainder: NamedValueSet = NamedValueSet()
        remainder.update(element.alternate_keys)
        remainder.update(element.metadata_columns)
        if element.spatial:
            remainder.add(RegionColumnSpec(nullable=True))
        if element.temporal:
            remainder.add(TimespanColumnSpec(nullable=True))
        # The full column list is the dimension keys followed by the rest.
        everything: NamedValueSet = NamedValueSet()
        everything.update(dimensions)
        everything.update(remainder)
        for column_set in (required, implied, dimensions, remainder, everything):
            column_set.freeze()
        self.required = required
        self.implied = implied
        self.dimensions = dimensions
        self.remainder = remainder
        self.all = everything

    element: DimensionElement
    """The dimension element whose records these columns describe.

    (`DimensionElement`)
    """

    required: NamedValueAbstractSet[KeyColumnSpec]
    """Columns for the required dimensions of this element's records.

    These match `DimensionElement.required` one-to-one, in the same order.
    """

    implied: NamedValueAbstractSet[KeyColumnSpec]
    """Columns for the implied dimensions of this element's records.

    These match `DimensionElement.implied` one-to-one, in the same order.
    """

    dimensions: NamedValueAbstractSet[KeyColumnSpec]
    """Columns for both the required and implied dimensions.

    These match `DimensionElement.dimensions` one-to-one, in the same order.
    """

    remainder: NamedValueAbstractSet[MetadataColumnSpec | RegionColumnSpec | TimespanColumnSpec]
    """Columns of this table that are not dimension keys.

    Alternate keys, metadata columns, and any region or timespan live here.
    """

    all: NamedValueAbstractSet[MetadataColumnSpec | RegionColumnSpec | TimespanColumnSpec]
    """Every column of this dimension element's records, in order."""

    @property
    def names(self) -> Set[str]:
        """The names of all columns, in order."""
        return self.all.names

    def __str__(self) -> str:
        header = f"{self.element.name}: "
        body = [line for column_spec in self.all for line in column_spec.display(level=1)]
        return "\n".join([header, *body])

    def to_arrow(
        self, remainder_only: bool = False, dimensions: DimensionGroup | None = None
    ) -> list[arrow_utils.ToArrow]:
        """Build the Arrow converters for this schema's columns.

        Parameters
        ----------
        remainder_only : `bool`, optional
            If `True`, leave out the columns in `dimensions` and convert only
            the ones in `remainder`.
        dimensions : `DimensionGroup`, optional
            Full set of dimensions over which the rows of the table are unique
            or close to unique.  It decides whether Arrow's dictionary
            encoding is used to compress duplicate values.  Defaults to this
            element's `~DimensionElement.minimal_group`, which is appropriate
            for tables of just the records of this element.

        Returns
        -------
        converters : `list` [ `arrow_utils.ToArrow` ]
            Objects that convert `DimensionRecord` attribute values to Arrow
            records, corresponding exactly to either `all` or `remainder`,
            depending on ``remainder_only``.
        """
        group = self.element.minimal_group if dimensions is None else dimensions
        converters: list[arrow_utils.ToArrow] = []
        if not remainder_only:
            converters.extend(
                dimension.to_arrow(group, key_spec)
                for dimension, key_spec in zip(self.element.dimensions, self.dimensions)
            )
        for spec in self.remainder:
            # String columns are dictionary-encoded when they are metadata
            # columns, or whenever the table spans more than this element's
            # minimal group.
            use_dictionary = spec.type == "string" and (
                spec.name in self.element.metadata_columns.names or group != self.element.minimal_group
            )
            converter = spec.to_arrow()
            if use_dictionary:
                converter = converter.dictionary_encoded()
            converters.append(converter)
        return converters
def _makeForeignKeySpec(dimension: Dimension) -> ddl.ForeignKeySpec:
    """Build the `ddl.ForeignKeySpec` that points at a dimension's table.

    Most callers should use the higher-level `addDimensionForeignKey` function
    instead.

    Parameters
    ----------
    dimension : `Dimension`
        The dimension to be referenced.  Caller guarantees that it is actually
        associated with a table.

    Returns
    -------
    spec : `ddl.ForeignKeySpec`
        A database-agnostic foreign key specification.
    """
    # The referencing columns are always named after the required dimensions.
    source = tuple(other.name for other in dimension.required)
    # The referenced columns use the same names, except for the dimension
    # itself, whose column in its own table is named after its primary key.
    target = tuple(
        dimension.primaryKey.name if other == dimension else other.name for other in dimension.required
    )
    return ddl.ForeignKeySpec(table=dimension.name, source=source, target=target)
def addDimensionForeignKey(
    tableSpec: ddl.TableSpec,
    dimension: Dimension,
    *,
    primaryKey: bool,
    nullable: bool = False,
    constraint: bool = True,
) -> ddl.FieldSpec:
    """Add a dimension key field, and optionally its foreign key, to a table.

    The new field references the table for the given `Dimension`.

    Parameters
    ----------
    tableSpec : `ddl.TableSpec`
        Specification the field and foreign key are to be added to.
    dimension : `Dimension`
        Dimension to be referenced.  If this dimension has required
        dependencies, those must have already been added to the table.  A
        field that corresponds to this dimension's primary key is added, and
        a foreign key constraint is added only if the dimension is associated
        with a table of its own.
    primaryKey : `bool`
        If `True`, the new field will be added as part of a compound primary
        key for the table.
    nullable : `bool`, optional
        If `False` (default) the new field will be added with a NOT NULL
        constraint.
    constraint : `bool`
        If `False` (`True` is default), just add the field, not the foreign
        key constraint.

    Returns
    -------
    fieldSpec : `ddl.FieldSpec`
        Specification for the field just added.
    """
    # Start from a shallow copy of the dimension's primary key field, named
    # after the dimension so it is unique and more meaningful in this table.
    fieldSpec = copy.copy(dimension.primaryKey)
    fieldSpec.name = dimension.name
    fieldSpec.nullable = nullable
    fieldSpec.primaryKey = primaryKey
    tableSpec.fields.add(fieldSpec)
    # No constraint when the dimension has no table of its own, or when the
    # caller asked for none.
    if not (dimension.has_own_table and constraint):
        return fieldSpec
    tableSpec.foreignKeys.append(_makeForeignKeySpec(dimension))
    return fieldSpec
class DimensionElementFields:
    """Construct the table schema for a `DimensionElement`.

    The object builds the table specification for a `DimensionElement` and
    offers a categorized view of its fields.

    Parameters
    ----------
    element : `DimensionElement`
        Element for which to make a table specification.

    Notes
    -----
    The specification combines the foreign key fields from dependencies,
    unique keys for true `Dimension` instances, metadata fields, and
    region/timespan fields for spatial/temporal elements.

    Callers should use `DimensionUniverse.makeSchemaSpec` if they want to
    account for elements that have no table or reference another table; this
    class simply creates a specification for the table an element _would_ have
    without checking whether it does have one.  That can be useful in contexts
    (e.g. `DimensionRecord`) where we want to simulate the existence of such a
    table.
    """

    def __init__(self, element: DimensionElement):
        self.element = element
        self._tableSpec = ddl.TableSpec(fields=())
        self.required = NamedValueSet()
        self.dimensions = NamedValueSet()
        self.facts = NamedValueSet()
        self.standard = NamedValueSet()
        # Primary key fields of required dimensions come first.  These
        # continue to be primary keys in the table for this dimension.
        dependencies = []
        for dimension in element.required:
            if dimension != element:
                fieldSpec = addDimensionForeignKey(self._tableSpec, dimension, primaryKey=True)
                dependencies.append(fieldSpec.name)
            else:
                # A Dimension instance is in its own required dependency
                # graph (always at the end, because of topological
                # ordering).  In this case we don't want to rename the field.
                fieldSpec = element.primaryKey  # type: ignore
                self._tableSpec.fields.add(fieldSpec)
            for field_set in (self.required, self.dimensions, self.standard):
                field_set.add(fieldSpec)
        # Fields and foreign keys for implied dimensions come next.  These
        # are primary keys in their own table, but should not be here.  As
        # with required dependencies, the fields are renamed with the
        # dimension name.  We use element.implied instead of
        # element.graph.implied because we don't want *recursive* implied
        # dependencies.
        self.implied = NamedValueSet()
        for dimension in element.implied:
            fieldSpec = addDimensionForeignKey(self._tableSpec, dimension, primaryKey=False, nullable=False)
            for field_set in (self.implied, self.dimensions, self.standard):
                field_set.add(fieldSpec)
        # Non-primary unique keys, each with a unique constraint that is
        # scoped by the required dependencies.
        dependency_names = tuple(dependencies)
        for fieldSpec in getattr(element, "alternateKeys", ()):
            self._tableSpec.fields.add(fieldSpec)
            self._tableSpec.unique.add(dependency_names + (fieldSpec.name,))
            self.standard.add(fieldSpec)
            self.facts.add(fieldSpec)
        # Other metadata fields.
        for fieldSpec in element.metadata:
            self._tableSpec.fields.add(fieldSpec)
            self.standard.add(fieldSpec)
            self.facts.add(fieldSpec)
        # Region and timespan are exposed by name only; their database
        # fields are added later, in makeTableSpec.
        names = [*self.standard.names]
        if element.spatial is not None:
            names.append("region")
        if element.temporal is not None:
            names.append(TimespanDatabaseRepresentation.NAME)
        self.names = tuple(names)

    def makeTableSpec(
        self,
        TimespanReprClass: type[TimespanDatabaseRepresentation],
    ) -> ddl.TableSpec:
        """Construct a complete specification for a table.

        The table could hold the records of this element.

        Parameters
        ----------
        TimespanReprClass : `type` [ `TimespanDatabaseRepresentation` ]
            Class object that specifies how timespans are represented in the
            database.

        Returns
        -------
        spec : `ddl.TableSpec`
            Specification for a table.
        """
        is_temporal = self.element.temporal is not None
        is_spatial = self.element.spatial is not None
        if not (is_temporal or is_spatial):
            # Nothing to add beyond the standard fields.
            return self._tableSpec
        # Copy the field set so the extra columns do not leak back into the
        # stored specification; the other collections are passed through.
        spec = ddl.TableSpec(
            fields=NamedValueSet(self._tableSpec.fields),
            unique=self._tableSpec.unique,
            indexes=self._tableSpec.indexes,
            foreignKeys=self._tableSpec.foreignKeys,
        )
        if is_spatial:
            spec.fields.add(ddl.FieldSpec.for_region())
        if is_temporal:
            spec.fields.update(TimespanReprClass.makeFieldSpecs(nullable=True))
        return spec

    def __str__(self) -> str:
        lines = [f"{self.element.name}: "]
        for field in self.standard:
            lines.append(f" {field.name}: {field.getPythonType().__name__}")
        if self.element.spatial is not None:
            lines.append(" region: lsst.sphgeom.Region")
        if self.element.temporal is not None:
            lines.append(" timespan: lsst.daf.butler.Timespan")
        return "\n".join(lines)

    @property
    @cached_getter
    def columns(self) -> Mapping[ColumnTag, str]:
        """A mapping from `ColumnTag` to field name for all fields in this
        element's records (`~collections.abc.Mapping`).
        """
        element_name = self.element.name
        # Dimension keys first, pairing each dimension with its field.
        result: dict[ColumnTag, str] = {
            DimensionKeyColumnTag(dimension_name): field_name
            for dimension_name, field_name in zip(
                self.element.dimensions.names, self.dimensions.names, strict=True
            )
        }
        # Then the non-dimension fields, tagged with the element name.
        for field_name in self.facts.names:
            result[DimensionRecordColumnTag(element_name, field_name)] = field_name
        if self.element.spatial:
            result[DimensionRecordColumnTag(element_name, "region")] = "region"
        if self.element.temporal:
            result[DimensionRecordColumnTag(element_name, "timespan")] = "timespan"
        return result

    element: DimensionElement
    """The dimension element whose table these fields describe.

    (`DimensionElement`)
    """

    required: NamedValueSet[ddl.FieldSpec]
    """The required dimension fields of this table.

    They match the element's required dimensions, in that order, i.e.
    `DimensionElement.required` (`NamedValueSet` [ `ddl.FieldSpec` ]).
    """

    implied: NamedValueSet[ddl.FieldSpec]
    """The implied dimension fields of this table.

    They match the element's implied dimensions, in that order, i.e.
    `DimensionElement.implied` (`NamedValueSet` [ `ddl.FieldSpec` ]).
    """

    dimensions: NamedValueSet[ddl.FieldSpec]
    """The direct and implied dimension fields of this table.

    They match the element's direct required and implied dimensions, in that
    order, i.e. `DimensionElement.dimensions`
    (`NamedValueSet` [ `ddl.FieldSpec` ]).
    """

    facts: NamedValueSet[ddl.FieldSpec]
    """The standard fields of this table that do not correspond to dimensions.

    (`NamedValueSet` [ `ddl.FieldSpec` ]).

    This is equivalent to ``standard - dimensions`` (but possibly in a
    different order).
    """

    standard: NamedValueSet[ddl.FieldSpec]
    """All standard fields that are expected to have the same form.

    They are expected to have the same form in all databases; this is all
    fields other than those that represent a region and/or timespan
    (`NamedValueSet` [ `ddl.FieldSpec` ]).
    """

    names: tuple[str, ...]
    """The names of all fields in the specification (`tuple` [ `str` ]).

    This includes "region" and/or "timespan" if `element` is spatial and/or
    temporal (respectively).  The actual database representation of these
    quantities may involve multiple fields (or even fields only on a different
    table), but the Python representation of those rows (i.e. `DimensionRecord`
    instances) will always contain exactly these fields.
    """