Coverage for python/lsst/daf/butler/column_spec.py: 78%
106 statements
« prev ^ index » next — coverage.py v7.5.1, created at 2024-05-16 02:58 -0700
1# This file is part of butler4.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
__all__ = (
    "ColumnSpec",
    "IntColumnSpec",
    "StringColumnSpec",
    "HashColumnSpec",
    "FloatColumnSpec",
    "BoolColumnSpec",
    "UUIDColumnSpec",
    "RegionColumnSpec",
    "TimespanColumnSpec",
    # DateTimeColumnSpec is part of the public ColumnSpec union below, so it
    # must be exported here as well.
    "DateTimeColumnSpec",
    "ColumnType",
    "COLLECTION_NAME_MAX_LENGTH",
)
44import textwrap
45import uuid
46from abc import ABC, abstractmethod
47from typing import Annotated, Any, ClassVar, Literal, TypeAlias, Union, final
49import astropy.time
50import pyarrow as pa
51import pydantic
52from lsst.sphgeom import Region
54from . import arrow_utils, ddl
55from ._timespan import Timespan
56from .name_shrinker import NameShrinker
58ColumnType: TypeAlias = Literal[
59 "int", "string", "hash", "float", "datetime", "bool", "uuid", "timespan", "region"
60]
# Maximum number of characters permitted in a collection name.
COLLECTION_NAME_MAX_LENGTH = 64
# TODO: DM-42541 would be a good opportunity to move this constant to a
# better home; this file is the least-bad home I can think of for now. Note
# that actually changing the value is a (minor) schema change.
class _BaseColumnSpec(pydantic.BaseModel, ABC):
    """Base class for descriptions of table columns."""

    name: str = pydantic.Field(description="""Name of the column.""")

    doc: str = pydantic.Field(default="", description="Documentation for the column.")

    type: ColumnType

    nullable: bool = pydantic.Field(
        default=True,
        description="Whether the column may be ``NULL``.",
    )

    def to_sql_spec(self, name_shrinker: NameShrinker | None = None, **kwargs: Any) -> ddl.FieldSpec:
        """Convert this specification to a SQL-specific one.

        Parameters
        ----------
        name_shrinker : `NameShrinker`, optional
            Object used to shrink the field name so that it fits within
            database-specific limits.
        **kwargs
            Forwarded to `ddl.FieldSpec`.

        Returns
        -------
        sql_spec : `ddl.FieldSpec`
            A SQL-specific version of this specification.
        """
        field_name = self.name if name_shrinker is None else name_shrinker.shrink(self.name)
        return ddl.FieldSpec(name=field_name, dtype=ddl.VALID_CONFIG_COLUMN_TYPES[self.type], **kwargs)

    @abstractmethod
    def to_arrow(self) -> arrow_utils.ToArrow:
        """Return an object that converts values of this column to a column in
        an Arrow table.

        Returns
        -------
        converter : `arrow_utils.ToArrow`
            A converter object with schema information in Arrow form.
        """
        raise NotImplementedError()

    def display(self, level: int = 0, tab: str = " ") -> list[str]:
        """Return a human-reader-focused string description of this column as
        a list of lines.

        Parameters
        ----------
        level : `int`
            Number of indentation tabs for the first line.
        tab : `str`
            Characters to duplicate ``level`` times to form the actual indent.

        Returns
        -------
        lines : `list` [ `str` ]
            Display lines.
        """
        header = f"{tab * level}{self.name}: {self.type}"
        if not self.doc:
            return [header]
        # Wrap the documentation text underneath, indented one level deeper.
        doc_indent = tab * (level + 1)
        wrapped = textwrap.wrap(self.doc, initial_indent=doc_indent, subsequent_indent=doc_indent)
        return [header, *wrapped]

    def __str__(self) -> str:
        return "\n".join(self.display())
@final
class IntColumnSpec(_BaseColumnSpec):
    """Specification for a column that holds integer values."""

    pytype: ClassVar[type] = int

    type: Literal["int"] = "int"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        arrow_type = pa.uint64()
        return arrow_utils.ToArrow.for_primitive(self.name, arrow_type, nullable=self.nullable)
@final
class StringColumnSpec(_BaseColumnSpec):
    """Specification for a column that holds string values."""

    pytype: ClassVar[type] = str

    type: Literal["string"] = "string"

    length: int
    """Maximum length of strings."""

    def to_sql_spec(self, name_shrinker: NameShrinker | None = None, **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited.  SQL needs the maximum string length up front.
        return super().to_sql_spec(length=self.length, name_shrinker=name_shrinker, **kwargs)

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.  Arrow strings are variable-length, so the
        # maximum length does not appear in the Arrow schema.
        arrow_type = pa.string()
        return arrow_utils.ToArrow.for_primitive(self.name, arrow_type, nullable=self.nullable)
@final
class HashColumnSpec(_BaseColumnSpec):
    """Specification for a column that holds a hash digest."""

    pytype: ClassVar[type] = bytes

    type: Literal["hash"] = "hash"

    nbytes: int
    """Number of bytes for the hash."""

    def to_sql_spec(self, name_shrinker: NameShrinker | None = None, **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited.  SQL needs the digest size up front.
        return super().to_sql_spec(nbytes=self.nbytes, name_shrinker=name_shrinker, **kwargs)

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # A sized Arrow binary column is fixed-size rather than
        # maximum-size (as in SQL), so use the variable-size type instead.
        arrow_type = pa.binary()
        return arrow_utils.ToArrow.for_primitive(self.name, arrow_type, nullable=self.nullable)
@final
class FloatColumnSpec(_BaseColumnSpec):
    """Specification for a column that holds floating-point values."""

    pytype: ClassVar[type] = float

    type: Literal["float"] = "float"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        assert self.nullable is not None, "nullable=None should be resolved by validators"
        arrow_type = pa.float64()
        return arrow_utils.ToArrow.for_primitive(self.name, arrow_type, nullable=self.nullable)
@final
class BoolColumnSpec(_BaseColumnSpec):
    """Specification for a column that holds boolean values."""

    pytype: ClassVar[type] = bool

    type: Literal["bool"] = "bool"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        arrow_type = pa.bool_()
        return arrow_utils.ToArrow.for_primitive(self.name, arrow_type, nullable=self.nullable)
@final
class UUIDColumnSpec(_BaseColumnSpec):
    """Specification for a column that holds UUID values."""

    pytype: ClassVar[type] = uuid.UUID

    type: Literal["uuid"] = "uuid"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        assert self.nullable is not None, "nullable=None should be resolved by validators"
        return arrow_utils.ToArrow.for_uuid(self.name, nullable=self.nullable)
@final
class RegionColumnSpec(_BaseColumnSpec):
    """Specification for a column that holds a spherical region."""

    name: str = "region"

    pytype: ClassVar[type] = Region

    type: Literal["region"] = "region"

    nbytes: int = 2048
    """Number of bytes for the encoded region."""

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        assert self.nullable is not None, "nullable=None should be resolved by validators"
        return arrow_utils.ToArrow.for_region(self.name, nullable=self.nullable)
@final
class TimespanColumnSpec(_BaseColumnSpec):
    """Specification for a column that holds a timespan."""

    name: str = "timespan"

    pytype: ClassVar[type] = Timespan

    type: Literal["timespan"] = "timespan"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        return arrow_utils.ToArrow.for_timespan(self.name, nullable=self.nullable)
@final
class DateTimeColumnSpec(_BaseColumnSpec):
    """Specification for a column that holds times, stored as integer TAI
    nanoseconds since 1970-01-01 and represented in Python via
    `astropy.time.Time`.
    """

    pytype: ClassVar[type] = astropy.time.Time

    type: Literal["datetime"] = "datetime"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        assert self.nullable is not None, "nullable=None should be resolved by validators"
        return arrow_utils.ToArrow.for_datetime(self.name, nullable=self.nullable)
# Discriminated union of all concrete column-specification types: pydantic
# selects the concrete class during validation by matching the ``type`` field
# against each class's ``Literal`` value.
ColumnSpec = Annotated[
    Union[
        IntColumnSpec,
        StringColumnSpec,
        HashColumnSpec,
        FloatColumnSpec,
        BoolColumnSpec,
        UUIDColumnSpec,
        RegionColumnSpec,
        TimespanColumnSpec,
        DateTimeColumnSpec,
    ],
    pydantic.Field(discriminator="type"),
]