Coverage for python/lsst/daf/butler/dimensions/_elements.py: 72%
176 statements
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-01 11:20 +0000
« prev ^ index » next coverage.py v7.4.1, created at 2024-02-01 11:20 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = (
31 "Dimension",
32 "DimensionCombination",
33 "DimensionElement",
34)
36from abc import abstractmethod
37from typing import TYPE_CHECKING, Annotated, Any, ClassVar, TypeAlias, Union, cast
39import pydantic
40from lsst.utils.classes import cached_getter
42from .. import arrow_utils, column_spec, ddl
43from .._named import NamedValueAbstractSet, NamedValueSet
44from .._topology import TopologicalRelationshipEndpoint
45from ..json import from_json_generic, to_json_generic
47if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
48 from ..registry import Registry
49 from ._governor import GovernorDimension
50 from ._graph import DimensionGraph
51 from ._group import DimensionGroup
52 from ._records import DimensionRecord
53 from ._schema import DimensionRecordSchema
54 from ._universe import DimensionUniverse
# Column specification types accepted for dimension key columns.
#
# The ``pydantic.Field(discriminator="type")`` annotation makes this a
# discriminated (tagged) union: pydantic uses the ``type`` field of the input
# to select which member class to validate against.
KeyColumnSpec: TypeAlias = Annotated[
    Union[
        column_spec.IntColumnSpec,
        column_spec.StringColumnSpec,
        column_spec.HashColumnSpec,
    ],
    pydantic.Field(discriminator="type"),
]
# Column specification types accepted for dimension metadata (non-key)
# columns.
#
# The ``pydantic.Field(discriminator="type")`` annotation makes this a
# discriminated (tagged) union: pydantic uses the ``type`` field of the input
# to select which member class to validate against.
MetadataColumnSpec: TypeAlias = Annotated[
    Union[
        column_spec.IntColumnSpec,
        column_spec.StringColumnSpec,
        column_spec.FloatColumnSpec,
        column_spec.HashColumnSpec,
        column_spec.BoolColumnSpec,
    ],
    pydantic.Field(discriminator="type"),
]
class DimensionElement(TopologicalRelationshipEndpoint):
    """A label and/or metadata in the dimensions system.

    A named data-organization concept that defines a label and/or metadata
    in the dimensions system.

    A `DimensionElement` instance typically corresponds to a _logical_ table in
    the `Registry`: either an actual database table or a way of generating rows
    on-the-fly that can similarly participate in queries.  The rows in that
    table are represented by instances of a `DimensionRecord` subclass.  Most
    `DimensionElement` instances are instances of its `Dimension` subclass,
    which is used for elements that can be used as data ID keys.

    Notes
    -----
    `DimensionElement` instances should always be constructed by and retrieved
    from a `DimensionUniverse`.  They are immutable after they are fully
    constructed, and should never be copied.

    Pickling a `DimensionElement` just records its name and universe;
    unpickling one actually just looks up the element via the singleton
    dictionary of all universes.  This allows pickle to be used to transfer
    elements between processes, but only when each process initializes its own
    instance of the same `DimensionUniverse`.
    """

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.name})"

    def __eq__(self, other: Any) -> bool:
        try:
            return self.name == other.name
        except AttributeError:
            # TODO: try removing this fallback; it's not really consistent with
            # base class intent, and it could be confusing
            return self.name == other

    def __hash__(self) -> int:
        return hash(self.name)

    # TODO: try removing comparison operators; DimensionUniverse.sorted should
    # be adequate.
    #
    # Each operator orders elements by their index in ``self.universe``.  An
    # operand whose name is unknown to the universe (`KeyError`) or that has
    # no ``name`` attribute at all (`AttributeError`) yields `NotImplemented`,
    # so Python can try the reflected operation and ultimately raise
    # `TypeError` instead of leaking an unrelated exception.

    def __lt__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) < self.universe.getElementIndex(other.name)
        except (KeyError, AttributeError):
            return NotImplemented

    def __le__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) <= self.universe.getElementIndex(other.name)
        except (KeyError, AttributeError):
            return NotImplemented

    def __gt__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) > self.universe.getElementIndex(other.name)
        except (KeyError, AttributeError):
            return NotImplemented

    def __ge__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) >= self.universe.getElementIndex(other.name)
        except (KeyError, AttributeError):
            return NotImplemented

    @classmethod
    def _unpickle(cls, universe: DimensionUniverse, name: str) -> DimensionElement:
        """Callable used for unpickling.

        For internal use only.
        """
        return universe[name]

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.universe, self.name))

    def __deepcopy__(self, memo: dict) -> DimensionElement:
        # DimensionElement is recursively immutable; see note in @immutable
        # decorator.
        return self

    def to_simple(self, minimal: bool = False) -> str:
        """Convert this class to a simple python type.

        This is suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Has no effect for this class.

        Returns
        -------
        simple : `str`
            The object converted to a single string.
        """
        return self.name

    @classmethod
    def from_simple(
        cls, simple: str, universe: DimensionUniverse | None = None, registry: Registry | None = None
    ) -> DimensionElement:
        """Construct a new object from the simplified form.

        Usually the data is returned from the `to_simple` method.

        Parameters
        ----------
        simple : `str`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.  Can be `None` if
            ``registry`` is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry from which a universe can be extracted.  Can be `None`
            if universe is provided explicitly.

        Returns
        -------
        element : `DimensionElement`
            The element looked up in the universe by name.

        Raises
        ------
        ValueError
            Raised if both ``universe`` and ``registry`` are `None`.
        """
        if universe is None:
            if registry is None:
                raise ValueError(
                    "One of universe or registry is required to convert a str to a DimensionElement"
                )
            # An explicit universe always wins; the registry is only a
            # fallback source for it.
            universe = registry.dimensions
        return universe[simple]

    to_json = to_json_generic
    from_json: ClassVar = classmethod(from_json_generic)

    def hasTable(self) -> bool:
        """Indicate if this element is associated with a table.

        Return `True` if this element is associated with a table
        (even if that table "belongs" to another element).
        """
        return self.has_own_table or self.implied_union_target is not None

    universe: DimensionUniverse
    """The universe of all compatible dimensions with which this element is
    associated (`DimensionUniverse`).
    """

    @property
    @cached_getter
    def governor(self) -> GovernorDimension | None:
        """Return the governor dimension.

        This is the `GovernorDimension` that is a required dependency of this
        element, or `None` if there is no such dimension (`GovernorDimension`
        or `None`).

        Raises
        ------
        RuntimeError
            Raised if this element's minimal group has more than one governor.
        """
        governors = self.minimal_group.governors
        if len(governors) == 1:
            (result,) = governors
            return cast("GovernorDimension", self.universe[result])
        elif len(governors) > 1:
            raise RuntimeError(f"Dimension element {self.name} has multiple governors: {governors}.")
        else:
            return None

    @property
    @abstractmethod
    def required(self) -> NamedValueAbstractSet[Dimension]:
        """Return the required dimensions.

        Dimensions that are necessary to uniquely identify a record of this
        dimension element.

        For elements with a database representation, these dimensions are
        exactly those used to form the (possibly compound) primary key, and all
        dimensions here that are not ``self`` are also used to form foreign
        keys.

        For `Dimension` instances, this should be exactly the same as
        ``graph.required``, but that may not be true for `DimensionElement`
        instances in general.  When they differ, there are multiple
        combinations of dimensions that uniquely identify this element, but
        this one is more direct.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def implied(self) -> NamedValueAbstractSet[Dimension]:
        """Return the implied dimensions.

        Other dimensions that are uniquely identified directly by a record
        of this dimension element.

        For elements with a database representation, these are exactly the
        dimensions used to form foreign key constraints whose fields are not
        (wholly) also part of the primary key.

        Unlike ``self.graph.implied``, this set is not expanded recursively.
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def dimensions(self) -> NamedValueAbstractSet[Dimension]:
        """Return all dimensions.

        The union of `required` and `implied`, with all elements in
        `required` before any elements in `implied`.

        This differs from ``self.graph.dimensions`` both in order and in
        content:

        - as in ``self.implied``, implied dimensions are not expanded
          recursively here;
        - implied dimensions appear after required dimensions here, instead of
          being topologically ordered.

        As a result, this set is ordered consistently with
        ``self.RecordClass.fields``.
        """
        return NamedValueSet(list(self.required) + list(self.implied)).freeze()

    # Deprecated via a warning from its implementation.
    # TODO: remove on DM-41326.
    @property
    def graph(self) -> DimensionGraph:
        """Return minimal graph that includes this element (`DimensionGraph`).

        ``self.graph.required`` includes all dimensions whose primary key
        values are sufficient (often necessary) to uniquely identify ``self``
        (including ``self`` if ``isinstance(self, Dimension)``).
        ``self.graph.implied`` includes all dimensions also identified
        (possibly recursively) by this set.
        """
        return self.minimal_group._as_graph()

    @property
    @cached_getter
    def minimal_group(self) -> DimensionGroup:
        """Return minimal dimension group that includes this element.

        ``self.minimal_group.required`` includes all dimensions whose primary
        key values are sufficient (often necessary) to uniquely identify
        ``self`` (including ``self`` if ``isinstance(self, Dimension)``).
        ``self.minimal_group.implied`` includes all dimensions also identified
        (possibly recursively) by this set.
        """
        return self.universe.conform(self.dimensions.names)

    @property
    @cached_getter
    def RecordClass(self) -> type[DimensionRecord]:
        """Return the record subclass for this element.

        The `DimensionRecord` subclass used to hold records for this element
        (`type`).

        Because `DimensionRecord` subclasses are generated dynamically, this
        type cannot be imported directly and hence can only be obtained from
        this attribute.
        """
        # Imported here rather than at module scope; the module-level import
        # of `DimensionRecord` is guarded by TYPE_CHECKING because it may be
        # circular.
        from ._records import _subclassDimensionRecord

        return _subclassDimensionRecord(self)

    @property
    def alternate_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        """Additional unique key fields for this dimension element that are not
        the primary key (`NamedValueAbstractSet` of `KeyColumnSpec`).

        This is always empty for elements that are not dimensions.

        If this dimension has required dependencies, the keys of those
        dimensions are also included in the unique constraints defined for
        these alternate keys.
        """
        return NamedValueSet().freeze()

    @property
    @abstractmethod
    def metadata_columns(self) -> NamedValueAbstractSet[MetadataColumnSpec]:
        """Additional metadata fields included in this element's table.

        (`NamedValueSet` of `MetadataColumnSpec`).
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def metadata(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Additional metadata fields included in this element's table.

        (`NamedValueSet` of `FieldSpec`).
        """
        # The loop variable is deliberately not called ``column_spec`` so it
        # does not shadow the imported ``column_spec`` module.
        return NamedValueSet([spec.to_sql_spec() for spec in self.metadata_columns]).freeze()

    @property
    def viewOf(self) -> str | None:
        """Name of another table this element's records are drawn from.

        (`str` or `None`).
        """
        return self.implied_union_target.name if self.implied_union_target is not None else None

    @property
    def alwaysJoin(self) -> bool:
        """Indicate if the element should always be included.

        If `True`, always include this element in any query or data ID in
        which its ``required`` dimensions appear, because it defines a
        relationship between those dimensions that must always be satisfied.
        """
        return False

    @property
    def has_own_table(self) -> bool:
        """Whether this element should have its own table in the database."""
        return self.implied_union_target is None

    @property
    def implied_union_target(self) -> DimensionElement | None:
        """If not `None`, another element whose implied values for this element
        form the set of allowable values.

        For example, in the default dimension universe, the allowed values for
        ``band`` is the union of all ``band`` values in the ``physical_filter``
        table, so the `implied_union_target` for ``band`` is
        ``physical_filter``.
        """
        return None

    @property
    def defines_relationships(self) -> bool:
        """Whether this element's records define one or more relationships that
        must be satisfied in rows over dimensions that include it.
        """
        return bool(self.implied)

    @property
    def is_cached(self) -> bool:
        """Whether this element's records should be aggressively cached,
        because they are small in number and rarely inserted.
        """
        return False

    @property
    @abstractmethod
    def populated_by(self) -> Dimension | None:
        """The dimension that this element's records are always inserted,
        exported, and imported alongside.

        Notes
        -----
        When this is `None` (as it will be, at least at first, for any data
        repositories created before this attribute was added), records for
        this element will often need to be exported manually when datasets
        associated with some other related dimension are exported, in order for
        the post-import data repository to function as expected.
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def schema(self) -> DimensionRecordSchema:
        """A description of the columns in this element's records and (at least
        conceptual) table.
        """
        # Imported here rather than at module scope; the module-level import
        # is guarded by TYPE_CHECKING because it may be circular.
        from ._schema import DimensionRecordSchema

        return DimensionRecordSchema(self)

    @property
    @abstractmethod
    def documentation(self) -> str:
        """Extended description of this dimension element."""
        raise NotImplementedError()
class Dimension(DimensionElement):
    """A dimension.

    A named data-organization concept that can be used as a key in a data
    ID.
    """

    @property
    @abstractmethod
    def unique_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        """Descriptions of unique identifiers for this dimension.

        All fields that can individually be used to identify records of this
        element, given the primary keys of all required dependencies
        (`NamedValueAbstractSet` of `KeyColumnSpec`).
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def primary_key(self) -> KeyColumnSpec:
        """The primary key field for this dimension (`KeyColumnSpec`).

        Note that the database primary keys for dimension tables are in general
        compound; this field is the only field in the database primary key that
        is not also a foreign key (to a required dependency dimension table).
        """
        # The first entry of ``unique_keys`` is the primary key; unpacking
        # fails loudly if ``unique_keys`` is empty.
        first_key, *_ = self.unique_keys
        return first_key

    @property
    @cached_getter
    def alternate_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        # Docstring inherited.
        _, *alternate_keys = self.unique_keys
        return NamedValueSet(alternate_keys).freeze()

    @property
    @cached_getter
    def uniqueKeys(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Return the unique fields.

        All fields that can individually be used to identify records of this
        element, given the primary keys of all required dependencies
        (`NamedValueAbstractSet` of `FieldSpec`).
        """
        # Only the first unique key is flagged as the primary key.  The result
        # is cached, so it is frozen (like the other cached sets on this
        # class) to keep callers from mutating shared state.
        return NamedValueSet(
            [key_spec.to_sql_spec(primaryKey=(n == 0)) for n, key_spec in enumerate(self.unique_keys)]
        ).freeze()

    @property
    @cached_getter
    def primaryKey(self) -> ddl.FieldSpec:
        """Return primary key field for this dimension (`FieldSpec`).

        Note that the database primary keys for dimension tables are in general
        compound; this field is the only field in the database primary key that
        is not also a foreign key (to a required dependency dimension table).
        """
        primaryKey, *_ = self.uniqueKeys
        return primaryKey

    @property
    @cached_getter
    def alternateKeys(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Return alternate keys.

        Additional unique key fields for this dimension that are not the
        primary key (`NamedValueAbstractSet` of `FieldSpec`).

        If this dimension has required dependencies, the keys of those
        dimensions are also included in the unique constraints defined for
        these alternate keys.
        """
        _, *alternateKeys = self.uniqueKeys
        return NamedValueSet(alternateKeys).freeze()

    @property
    def populated_by(self) -> Dimension:
        # Docstring inherited.
        return self

    def to_arrow(self, dimensions: DimensionGroup, spec: KeyColumnSpec | None = None) -> arrow_utils.ToArrow:
        """Return an object that converts the primary key value for this
        dimension to a column in an Arrow table.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Full set of dimensions over which the rows of the table are unique
            or close to unique.  This is used to determine whether to use
            Arrow's dictionary encoding to compress duplicate values.
        spec : `KeyColumnSpec`, optional
            Column specification for this dimension.  If not provided, a copy
            of `primary_key` with the field name replaced with the dimension
            name will be used, which is appropriate for when this dimension
            appears in data ID or the dimension record tables of other
            dimension elements.

        Returns
        -------
        converter : `arrow_utils.ToArrow`
            Converter for this dimension's primary key.
        """
        if spec is None:
            spec = self.primary_key.model_copy(update={"name": self.name})
        if dimensions != self.minimal_group and spec.type != "int":
            # Values are large and will be duplicated in rows that are unique
            # over these dimensions, so dictionary encoding may help a lot.
            return spec.to_arrow().dictionary_encoded()
        else:
            return spec.to_arrow()
class DimensionCombination(DimensionElement):
    """Element with extra information.

    A `DimensionElement` that provides extra metadata and/or relationship
    endpoint information for a combination of dimensions.

    Notes
    -----
    Unlike `Dimension`, this class does not derive from `Dimension`, so it
    does not declare the key-related members (``unique_keys``,
    ``primary_key``, ...) that `Dimension` adds on top of `DimensionElement`.
    """