Coverage for python/lsst/daf/butler/dimensions/_elements.py: 72%
188 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-05 10:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
# Public names exported by this module.
__all__ = ("Dimension", "DimensionCombination", "DimensionElement")
36from abc import abstractmethod
37from typing import TYPE_CHECKING, Annotated, Any, ClassVar, TypeAlias, Union, cast
39import pydantic
40from lsst.utils.classes import cached_getter
41from pydantic_core import core_schema
43from .. import arrow_utils, column_spec, ddl, pydantic_utils
44from .._named import NamedValueAbstractSet, NamedValueSet
45from .._topology import TopologicalRelationshipEndpoint
46from ..json import from_json_generic, to_json_generic
48if TYPE_CHECKING: # Imports needed only for type annotations; may be circular.
49 from ..registry import Registry
50 from ._governor import GovernorDimension
51 from ._graph import DimensionGraph
52 from ._group import DimensionGroup
53 from ._records import DimensionRecord
54 from ._schema import DimensionRecordSchema
55 from ._universe import DimensionUniverse
# Column specification types accepted for key columns (see
# `Dimension.unique_keys` and `DimensionElement.alternate_keys`).  Pydantic
# selects the concrete member of the union from the value of its ``type``
# field.
KeyColumnSpec: TypeAlias = Annotated[
    Union[column_spec.IntColumnSpec, column_spec.StringColumnSpec, column_spec.HashColumnSpec],
    pydantic.Field(discriminator="type"),
]
# Column specification types accepted for metadata columns (see
# `DimensionElement.metadata_columns`).  Pydantic selects the concrete member
# of the union from the value of its ``type`` field.
_MetadataColumnSpecUnion = Union[
    column_spec.IntColumnSpec,
    column_spec.StringColumnSpec,
    column_spec.FloatColumnSpec,
    column_spec.HashColumnSpec,
    column_spec.BoolColumnSpec,
]
MetadataColumnSpec: TypeAlias = Annotated[_MetadataColumnSpecUnion, pydantic.Field(discriminator="type")]
class DimensionElement(TopologicalRelationshipEndpoint):
    """A label and/or metadata in the dimensions system.

    A named data-organization concept that defines a label and/or metadata
    in the dimensions system.

    A `DimensionElement` instance typically corresponds to a _logical_ table in
    the `Registry`: either an actual database table or a way of generating rows
    on-the-fly that can similarly participate in queries.  The rows in that
    table are represented by instances of a `DimensionRecord` subclass.  Most
    `DimensionElement` instances are instances of its `Dimension` subclass,
    which is used for elements that can be used as data ID keys.

    Notes
    -----
    `DimensionElement` instances should always be constructed by and retrieved
    from a `DimensionUniverse`.  They are immutable after they are fully
    constructed, and should never be copied.

    Pickling a `DimensionElement` just records its name and universe;
    unpickling one actually just looks up the element via the singleton
    dictionary of all universes.  This allows pickle to be used to transfer
    elements between processes, but only when each process initializes its own
    instance of the same `DimensionUniverse`.
    """

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.name})"

    def __eq__(self, other: Any) -> bool:
        try:
            return self.name == other.name
        except AttributeError:
            # TODO: try removing this fallback; it's not really consistent with
            # base class intent, and it could be confusing
            return self.name == other

    def __hash__(self) -> int:
        # Hash on the name only, consistent with __eq__.
        return hash(self.name)

    # TODO: try removing comparison operators; DimensionUniverse.sorted should
    # be adequate.
    #
    # All four operators order elements by the index the universe assigns to
    # their names; a KeyError from that lookup is reported as "not
    # comparable" by returning NotImplemented.

    def __lt__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) < self.universe.getElementIndex(other.name)
        except KeyError:
            return NotImplemented

    def __le__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) <= self.universe.getElementIndex(other.name)
        except KeyError:
            return NotImplemented

    def __gt__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) > self.universe.getElementIndex(other.name)
        except KeyError:
            return NotImplemented

    def __ge__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) >= self.universe.getElementIndex(other.name)
        except KeyError:
            return NotImplemented

    @classmethod
    def _unpickle(cls, universe: DimensionUniverse, name: str) -> DimensionElement:
        """Callable used for unpickling.

        For internal use only.
        """
        return universe[name]

    def __reduce__(self) -> tuple:
        # Only the universe and the name are pickled; `_unpickle` looks the
        # element up again on the receiving side.
        return (self._unpickle, (self.universe, self.name))

    def __deepcopy__(self, memo: dict) -> DimensionElement:
        # DimensionElement is recursively immutable; see note in @immutable
        # decorator.
        return self

    def to_simple(self, minimal: bool = False) -> str:
        """Convert this class to a simple python type.

        This is suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Has no effect for this class.

        Returns
        -------
        simple : `str`
            The object converted to a single string.
        """
        return self.name

    @classmethod
    def from_simple(
        cls, simple: str, universe: DimensionUniverse | None = None, registry: Registry | None = None
    ) -> DimensionElement:
        """Construct a new object from the simplified form.

        Usually the data is returned from the `to_simple` method.

        Parameters
        ----------
        simple : `str`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.  Can be `None` if
            ``registry`` is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry from which a universe can be extracted. Can be `None`
            if universe is provided explicitly.

        Returns
        -------
        element : `DimensionElement`
            The element that ``simple`` names in the universe.

        Raises
        ------
        ValueError
            Raised if neither ``universe`` nor ``registry`` is provided, or
            if no universe can be determined from them.
        """
        if universe is None and registry is None:
            raise ValueError(
                "One of universe or registry is required to convert a str to a DimensionElement"
            )
        if universe is None and registry is not None:
            universe = registry.dimensions
        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")

        return universe[simple]

    to_json = to_json_generic
    from_json: ClassVar = classmethod(from_json_generic)

    def hasTable(self) -> bool:
        """Indicate if this element is associated with a table.

        Return `True` if this element is associated with a table
        (even if that table "belongs" to another element).
        """
        return self.has_own_table or self.implied_union_target is not None

    universe: DimensionUniverse
    """The universe of all compatible dimensions with which this element is
    associated (`DimensionUniverse`).
    """

    @property
    @cached_getter
    def governor(self) -> GovernorDimension | None:
        """Return the governor dimension.

        This is the `GovernorDimension` that is a required dependency of this
        element, or `None` if there is no such dimension (`GovernorDimension`
        or `None`).

        Raises
        ------
        RuntimeError
            Raised if the element's minimal group has more than one governor.
        """
        governors = self.minimal_group.governors
        if len(governors) == 1:
            (result,) = governors
            return cast("GovernorDimension", self.universe[result])
        elif len(governors) > 1:
            raise RuntimeError(f"Dimension element {self.name} has multiple governors: {governors}.")
        else:
            return None

    @property
    @abstractmethod
    def required(self) -> NamedValueAbstractSet[Dimension]:
        """Return the required dimensions.

        Dimensions that are necessary to uniquely identify a record of this
        dimension element.

        For elements with a database representation, these dimensions are
        exactly those used to form the (possibly compound) primary key, and all
        dimensions here that are not ``self`` are also used to form foreign
        keys.

        For `Dimension` instances, this should be exactly the same as
        ``graph.required``, but that may not be true for `DimensionElement`
        instances in general.  When they differ, there are multiple
        combinations of dimensions that uniquely identify this element, but
        this one is more direct.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def implied(self) -> NamedValueAbstractSet[Dimension]:
        """Return the implied dimensions.

        Other dimensions that are uniquely identified directly by a record
        of this dimension element.

        For elements with a database representation, these are exactly the
        dimensions used to form foreign key constraints whose fields are not
        (wholly) also part of the primary key.

        Unlike ``self.graph.implied``, this set is not expanded recursively.
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def dimensions(self) -> NamedValueAbstractSet[Dimension]:
        """Return all dimensions.

        The union of `required` and `implied`, with all elements in
        `required` before any elements in `implied`.

        This differs from ``self.graph.dimensions`` both in order and in
        content:

        - as in ``self.implied``, implied dimensions are not expanded
          recursively here;
        - implied dimensions appear after required dimensions here, instead of
          being topologically ordered.

        As a result, this set is ordered consistently with
        ``self.RecordClass.fields``.
        """
        return NamedValueSet(list(self.required) + list(self.implied)).freeze()

    # Deprecated via a warning from its implementation.
    # TODO: remove on DM-41326.
    @property
    def graph(self) -> DimensionGraph:
        """Return minimal graph that includes this element (`DimensionGraph`).

        ``self.graph.required`` includes all dimensions whose primary key
        values are sufficient (often necessary) to uniquely identify ``self``
        (including ``self`` if ``isinstance(self, Dimension)``).
        ``self.graph.implied`` includes all dimensions also identified
        (possibly recursively) by this set.
        """
        return self.minimal_group._as_graph()

    @property
    @cached_getter
    def minimal_group(self) -> DimensionGroup:
        """Return minimal dimension group that includes this element.

        ``self.minimal_group.required`` includes all dimensions whose primary
        key values are sufficient (often necessary) to uniquely identify
        ``self`` (including ``self`` if ``isinstance(self, Dimension)``).
        ``self.minimal_group.implied`` includes all dimensions also identified
        (possibly recursively) by this set.
        """
        return self.universe.conform(self.dimensions.names)

    @property
    @cached_getter
    def RecordClass(self) -> type[DimensionRecord]:
        """Return the record subclass for this element.

        The `DimensionRecord` subclass used to hold records for this element
        (`type`).

        Because `DimensionRecord` subclasses are generated dynamically, this
        type cannot be imported directly and hence can only be obtained from
        this attribute.
        """
        # Imported here rather than at module scope; ``_records`` is only
        # imported under TYPE_CHECKING at the top of the file.
        from ._records import _subclassDimensionRecord

        return _subclassDimensionRecord(self)

    @property
    def alternate_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        """Additional unique key fields for this dimension element that are not
        the primary key (`NamedValueAbstractSet` of `KeyColumnSpec`).

        This is always empty for elements that are not dimensions.

        If this dimension has required dependencies, the keys of those
        dimensions are also included in the unique constraints defined for
        these alternate keys.
        """
        return NamedValueSet().freeze()

    @property
    @abstractmethod
    def metadata_columns(self) -> NamedValueAbstractSet[MetadataColumnSpec]:
        """Additional metadata fields included in this element's table.

        (`NamedValueSet` of `MetadataColumnSpec`).
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def metadata(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Additional metadata fields included in this element's table.

        (`NamedValueSet` of `FieldSpec`).
        """
        # The loop variable is deliberately not called ``column_spec`` so it
        # does not shadow the module of that name imported by this file.
        return NamedValueSet([spec.to_sql_spec() for spec in self.metadata_columns]).freeze()

    @property
    def viewOf(self) -> str | None:
        """Name of another table this element's records are drawn from.

        (`str` or `None`).
        """
        return self.implied_union_target.name if self.implied_union_target is not None else None

    @property
    def alwaysJoin(self) -> bool:
        """Indicate if the element should always be included.

        If `True`, always include this element in any query or data ID in
        which its ``required`` dimensions appear, because it defines a
        relationship between those dimensions that must always be satisfied.
        """
        return False

    @property
    def has_own_table(self) -> bool:
        """Whether this element should have its own table in the database."""
        return self.implied_union_target is None

    @property
    def implied_union_target(self) -> DimensionElement | None:
        """If not `None`, another element whose implied values for this element
        form the set of allowable values.

        For example, in the default dimension universe, the allowed values for
        ``band`` is the union of all ``band`` values in the ``physical_filter``
        table, so the `implied_union_target` for ``band`` is
        ``physical_filter``.
        """
        return None

    @property
    def defines_relationships(self) -> bool:
        """Whether this element's records define one or more relationships that
        must be satisfied in rows over dimensions that include it.
        """
        return bool(self.implied)

    @property
    def is_cached(self) -> bool:
        """Whether this element's records should be aggressively cached,
        because they are small in number and rarely inserted.
        """
        return False

    @property
    @abstractmethod
    def populated_by(self) -> Dimension | None:
        """The dimension that this element's records are always inserted,
        exported, and imported alongside.

        Notes
        -----
        When this is `None` (as it will be, at least at first, for any data
        repositories created before this attribute was added), records for
        this element will often need to be exported manually when datasets
        associated with some other related dimension are exported, in order for
        the post-import data repository to function as expected.
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def schema(self) -> DimensionRecordSchema:
        """A description of the columns in this element's records and (at least
        conceptual) table.
        """
        # Imported here rather than at module scope; ``_schema`` is only
        # imported under TYPE_CHECKING at the top of the file.
        from ._schema import DimensionRecordSchema

        return DimensionRecordSchema(self)

    @property
    @abstractmethod
    def documentation(self) -> str:
        """Extended description of this dimension element."""
        raise NotImplementedError()

    @classmethod
    def _validate(cls, data: Any, info: pydantic.ValidationInfo) -> DimensionElement:
        """Pydantic validator (deserializer) for `DimensionElement`.

        This satisfies the `pydantic.WithInfoPlainValidatorFunction` signature.
        """
        universe = pydantic_utils.get_universe_from_context(info.context)
        return universe[data]

    def _serialize(self) -> str:
        """Pydantic serializer for `DimensionElement`.

        This satisfies the `pydantic.PlainSerializerFunction` signature.
        """
        return self.name

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source_type: Any, handler: pydantic.GetCoreSchemaHandler
    ) -> core_schema.CoreSchema:
        # This is the Pydantic hook for overriding serialization, validation,
        # and JSON schema generation.
        str_schema = core_schema.str_schema()
        from_str_schema = core_schema.chain_schema(
            [str_schema, core_schema.with_info_plain_validator_function(cls._validate)]
        )
        return core_schema.json_or_python_schema(
            # When deserializing from JSON, expect it to be a `str`
            json_schema=from_str_schema,
            # When deserializing from Python, first see if it's already a
            # DimensionElement and then try conversion from `str`.
            python_schema=core_schema.union_schema(
                [core_schema.is_instance_schema(DimensionElement), from_str_schema]
            ),
            # When serializing convert it to a `str`.
            serialization=core_schema.plain_serializer_function_ser_schema(
                cls._serialize, return_schema=str_schema
            ),
        )
class Dimension(DimensionElement):
    """A dimension.

    A named data-organization concept that can be used as a key in a data
    ID.
    """

    @property
    @abstractmethod
    def unique_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        """Descriptions of unique identifiers for this dimension.

        All fields that can individually be used to identify records of this
        element, given the primary keys of all required dependencies
        (`NamedValueAbstractSet` of `KeyColumnSpec`).
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def primary_key(self) -> KeyColumnSpec:
        """The primary key field for this dimension (`KeyColumnSpec`).

        Note that the database primary keys for dimension tables are in general
        compound; this field is the only field in the database primary key that
        is not also a foreign key (to a required dependency dimension table).
        """
        # The first entry of `unique_keys` is the primary key.
        first_key, *_ = self.unique_keys
        return first_key

    @property
    @cached_getter
    def alternate_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        # Docstring inherited.
        # Everything after the first entry of `unique_keys` is an alternate
        # key.
        _, *alternate_keys = self.unique_keys
        return NamedValueSet(alternate_keys).freeze()

    @property
    @cached_getter
    def uniqueKeys(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Return the unique fields.

        All fields that can individually be used to identify records of this
        element, given the primary keys of all required dependencies
        (`NamedValueAbstractSet` of `FieldSpec`).
        """
        # Only the first unique key is flagged as the primary key.  The result
        # is cached and shared between callers, so it is frozen like the other
        # cached sets on this class.
        return NamedValueSet(
            [spec.to_sql_spec(primaryKey=(n == 0)) for n, spec in enumerate(self.unique_keys)]
        ).freeze()

    @property
    @cached_getter
    def primaryKey(self) -> ddl.FieldSpec:
        """Return primary key field for this dimension (`FieldSpec`).

        Note that the database primary keys for dimension tables are in general
        compound; this field is the only field in the database primary key that
        is not also a foreign key (to a required dependency dimension table).
        """
        primaryKey, *_ = self.uniqueKeys
        return primaryKey

    @property
    @cached_getter
    def alternateKeys(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Return alternate keys.

        Additional unique key fields for this dimension that are not the
        primary key (`NamedValueAbstractSet` of `FieldSpec`).

        If this dimension has required dependencies, the keys of those
        dimensions are also included in the unique constraints defined for
        these alternate keys.
        """
        _, *alternateKeys = self.uniqueKeys
        return NamedValueSet(alternateKeys).freeze()

    @property
    def populated_by(self) -> Dimension:
        # Docstring inherited.
        return self

    def to_arrow(self, dimensions: DimensionGroup, spec: KeyColumnSpec | None = None) -> arrow_utils.ToArrow:
        """Return an object that converts the primary key value for this
        dimension to column in an Arrow table.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Full set of dimensions over which the rows of the table are unique
            or close to unique.  This is used to determine whether to use
            Arrow's dictionary encoding to compress duplicate values.
        spec : `KeyColumnSpec`, optional
            Column specification for this dimension.  If not provided, a copy
            of `primary_key` with the field name replaced with the dimension
            name will be used, which is appropriate for when this dimension
            appears in data ID or the dimension record tables of other
            dimension elements.

        Returns
        -------
        converter : `arrow_utils.ToArrow`
            Converter for this dimension's primary key.
        """
        if spec is None:
            spec = self.primary_key.model_copy(update={"name": self.name})
        if dimensions != self.minimal_group and spec.type != "int":
            # Values are large and will be duplicated in rows that are unique
            # over these dimensions, so dictionary encoding may help a lot.
            return spec.to_arrow().dictionary_encoded()
        else:
            return spec.to_arrow()
class DimensionCombination(DimensionElement):
    """A dimension element describing a combination of dimensions.

    This `DimensionElement` subclass supplies additional metadata and/or
    relationship endpoint information for a combination of dimensions.
    """