Coverage for python / lsst / daf / butler / arrow_utils.py: 68%
219 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:37 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:37 +0000
1# This file is part of butler4.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
# Public API of this module, in alphabetical order.
__all__ = (
    "DateTimeArrowScalar",
    "DateTimeArrowType",
    "RegionArrowScalar",
    "RegionArrowType",
    "TimespanArrowScalar",
    "TimespanArrowType",
    "ToArrow",
    "UUIDArrowScalar",
    "UUIDArrowType",
)
42import uuid
43from abc import ABC, abstractmethod
44from typing import Any, ClassVar, final
46import astropy.time
47import pyarrow as pa
49from lsst.sphgeom import Region
51from ._timespan import Timespan
52from .time_utils import TimeConverter
class ToArrow(ABC):
    """An abstract interface for converting objects to an Arrow field of the
    appropriate type.
    """

    @staticmethod
    def for_primitive(name: str, data_type: pa.DataType, nullable: bool) -> ToArrow:
        """Return a converter for a primitive type already supported by Arrow.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        data_type : `pyarrow.DataType`
            Arrow data type object.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowPrimitive(name, data_type, nullable)

    @staticmethod
    def for_uuid(name: str, nullable: bool = True) -> ToArrow:
        """Return a converter for `uuid.UUID` values.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowUUID(name, nullable)

    @staticmethod
    def for_region(name: str, nullable: bool = True) -> ToArrow:
        """Return a converter for `lsst.sphgeom.Region` values.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowRegion(name, nullable)

    @staticmethod
    def for_timespan(name: str, nullable: bool = True) -> ToArrow:
        """Return a converter for `lsst.daf.butler.Timespan` values.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowTimespan(name, nullable)

    @staticmethod
    def for_datetime(name: str, nullable: bool = True) -> ToArrow:
        """Return a converter for `astropy.time.Time`, stored as TAI
        nanoseconds since 1970-01-01.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowDateTime(name, nullable)

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the field."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def nullable(self) -> bool:
        """Whether the field permits null or `None` values."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def data_type(self) -> pa.DataType:
        """Arrow data type for the field this converter produces."""
        raise NotImplementedError()

    @property
    def field(self) -> pa.Field:
        """Arrow field this converter produces."""
        return pa.field(self.name, self.data_type, nullable=self.nullable)

    def dictionary_encoded(self) -> ToArrow:
        """Return a new converter with the same name and type, but using
        dictionary encoding (to 32-bit integers) to compress duplicate values.

        Returns
        -------
        to_arrow : `ToArrow`
            Dictionary-encoding converter that wraps ``self``.
        """
        return _ToArrowDictionary(self)

    @abstractmethod
    def append(self, value: Any, column: list[Any]) -> None:
        """Append an object's arrow representation to a `list`.

        Parameters
        ----------
        value : `object`
            Original value to be converted to a row in an Arrow column.
        column : `list`
            List of values to append to.  The type of value to append is
            implementation-defined; the only requirement is that `finish`
            be able to handle this `list` later.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self, column: list[Any]) -> pa.Array:
        """Convert a list of values constructed via `append` into an Arrow
        array.

        Parameters
        ----------
        column : `list`
            List of column values populated by `append`.
        """
        raise NotImplementedError()
class _ToArrowPrimitive(ToArrow):
    """`ToArrow` implementation for primitive types that Arrow supports
    directly.

    Construct via the `ToArrow.for_primitive` factory method rather than
    directly.
    """

    def __init__(self, name: str, data_type: pa.DataType, nullable: bool):
        self._field_name = name
        self._arrow_type = data_type
        self._allow_null = nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._field_name

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return self._arrow_type

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._allow_null

    def append(self, value: Any, column: list[Any]) -> None:
        # Docstring inherited.
        # Primitive values need no conversion; store them as-is.
        column.append(value)

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        return pa.array(column, self._arrow_type)
class _ToArrowDictionary(ToArrow):
    """`ToArrow` implementation that dictionary-encodes another converter's
    output.

    Construct via the `ToArrow.dictionary_encoded` factory method rather
    than directly.
    """

    def __init__(self, to_arrow_value: ToArrow):
        self._value_converter = to_arrow_value

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._value_converter.name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._value_converter.nullable

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        # int32 indices are hard-coded because that is what
        # pa.Array.dictionary_encode() produces.
        return pa.dictionary(pa.int32(), self._value_converter.data_type)

    def append(self, value: Any, column: list[Any]) -> None:
        # Docstring inherited.
        # Delegate to the wrapped converter; deduplication happens all at
        # once in finish().
        self._value_converter.append(value, column)

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        return self._value_converter.finish(column).dictionary_encode()
class _ToArrowUUID(ToArrow):
    """`ToArrow` implementation for `uuid.UUID` fields.

    Construct via the `ToArrow.for_uuid` factory method rather than
    directly.
    """

    # A UUID is stored as its raw 16-byte representation.
    storage_type: ClassVar[pa.DataType] = pa.binary(16)

    def __init__(self, name: str, nullable: bool):
        self._field_name = name
        self._allow_null = nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._field_name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._allow_null

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return UUIDArrowType()

    def append(self, value: uuid.UUID | None, column: list[bytes | None]) -> None:
        # Docstring inherited.
        if value is None:
            column.append(None)
        else:
            column.append(value.bytes)

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        storage = pa.array(column, self.storage_type)
        return pa.ExtensionArray.from_storage(UUIDArrowType(), storage)
class _ToArrowRegion(ToArrow):
    """`ToArrow` implementation for `lsst.sphgeom.Region` fields.

    Construct via the `ToArrow.for_region` factory method rather than
    directly.
    """

    # Regions are stored via their variable-length binary encoding.
    storage_type: ClassVar[pa.DataType] = pa.binary()

    def __init__(self, name: str, nullable: bool):
        self._field_name = name
        self._allow_null = nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._field_name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._allow_null

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return RegionArrowType()

    def append(self, value: Region | None, column: list[bytes | None]) -> None:
        # Docstring inherited.
        if value is None:
            column.append(None)
        else:
            column.append(value.encode())

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        storage = pa.array(column, self.storage_type)
        return pa.ExtensionArray.from_storage(RegionArrowType(), storage)
class _ToArrowTimespan(ToArrow):
    """`ToArrow` implementation for `lsst.daf.butler.Timespan` fields.

    Construct via the `ToArrow.for_timespan` factory method rather than
    directly.
    """

    # A timespan is stored as a pair of int64 nanosecond bounds; the bounds
    # themselves are non-nullable even when the timespan field is nullable.
    storage_type: ClassVar[pa.DataType] = pa.struct(
        [
            pa.field("begin_nsec", pa.int64(), nullable=False),
            pa.field("end_nsec", pa.int64(), nullable=False),
        ]
    )

    def __init__(self, name: str, nullable: bool):
        self._field_name = name
        self._allow_null = nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._field_name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._allow_null

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return TimespanArrowType()

    def append(self, value: Timespan | None, column: list[dict[str, int] | None]) -> None:
        # Docstring inherited.
        if value is None:
            column.append(None)
        else:
            begin, end = value.nsec
            column.append({"begin_nsec": begin, "end_nsec": end})

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        storage = pa.array(column, self.storage_type)
        return pa.ExtensionArray.from_storage(TimespanArrowType(), storage)
class _ToArrowDateTime(ToArrow):
    """`ToArrow` implementation for `astropy.time.Time` fields.

    Construct via the `ToArrow.for_datetime` factory method rather than
    directly.
    """

    # Times are stored as int64 nanosecond counts (see TimeConverter).
    storage_type: ClassVar[pa.DataType] = pa.int64()

    def __init__(self, name: str, nullable: bool):
        self._field_name = name
        self._allow_null = nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._field_name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._allow_null

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return DateTimeArrowType()

    def append(self, value: astropy.time.Time | None, column: list[int | None]) -> None:
        # Docstring inherited.
        if value is None:
            column.append(None)
        else:
            column.append(TimeConverter().astropy_to_nsec(value))

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        storage = pa.array(column, self.storage_type)
        return pa.ExtensionArray.from_storage(DateTimeArrowType(), storage)
@final
class UUIDArrowType(pa.ExtensionType):
    """An Arrow extension type for `uuid.UUID`, stored as 16 bytes."""

    def __init__(self) -> None:
        super().__init__(_ToArrowUUID.storage_type, "uuid.UUID")

    def __arrow_ext_serialize__(self) -> bytes:
        # The type carries no parameters, so there is nothing to serialize.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type: pa.DataType, serialized: bytes) -> UUIDArrowType:
        # No parameters to restore; every instance is equivalent.
        return cls()

    def __arrow_ext_scalar_class__(self) -> type[UUIDArrowScalar]:
        # Scalars extracted from arrays of this type are UUIDArrowScalar.
        return UUIDArrowScalar
@final
class UUIDArrowScalar(pa.ExtensionScalar):
    """An Arrow scalar type for `uuid.UUID`.

    Use the standard `as_py` method to obtain an actual `uuid.UUID`
    instance.
    """

    def as_py(self, **_unused: Any) -> uuid.UUID:
        # Reconstruct the UUID from its 16-byte storage representation.
        return uuid.UUID(bytes=self.value.as_py())
@final
class RegionArrowType(pa.ExtensionType):
    """An Arrow extension type for `lsst.sphgeom.Region`."""

    def __init__(self) -> None:
        super().__init__(_ToArrowRegion.storage_type, "lsst.sphgeom.Region")

    def __arrow_ext_serialize__(self) -> bytes:
        # The type carries no parameters, so there is nothing to serialize.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type: pa.DataType, serialized: bytes) -> RegionArrowType:
        # No parameters to restore; every instance is equivalent.
        return cls()

    def __arrow_ext_scalar_class__(self) -> type[RegionArrowScalar]:
        # Scalars extracted from arrays of this type are RegionArrowScalar.
        return RegionArrowScalar
@final
class RegionArrowScalar(pa.ExtensionScalar):
    """An Arrow scalar type for `lsst.sphgeom.Region`.

    Use the standard `as_py` method to obtain an actual region instance.
    """

    def as_py(self, **_unused: Any) -> Region:
        # Decode the stored binary representation back into a Region.
        return Region.decode(self.value.as_py())
@final
class TimespanArrowType(pa.ExtensionType):
    """An Arrow extension type for `lsst.daf.butler.Timespan`."""

    def __init__(self) -> None:
        super().__init__(_ToArrowTimespan.storage_type, "lsst.daf.butler.Timespan")

    def __arrow_ext_serialize__(self) -> bytes:
        # The type carries no parameters, so there is nothing to serialize.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type: pa.DataType, serialized: bytes) -> TimespanArrowType:
        # No parameters to restore; every instance is equivalent.
        return cls()

    def __arrow_ext_scalar_class__(self) -> type[TimespanArrowScalar]:
        # Scalars extracted from arrays of this type are TimespanArrowScalar.
        return TimespanArrowScalar
@final
class TimespanArrowScalar(pa.ExtensionScalar):
    """An Arrow scalar type for `lsst.daf.butler.Timespan`.

    Use the standard `as_py` method to obtain an actual timespan instance.
    """

    def as_py(self, **_unused: Any) -> Timespan | None:
        # Null storage maps to None rather than an empty timespan.
        if self.value is None:
            return None
        begin = self.value["begin_nsec"].as_py()
        end = self.value["end_nsec"].as_py()
        # Construct directly from nanosecond bounds via the private _nsec
        # argument, bypassing astropy conversion.
        return Timespan(None, None, _nsec=(begin, end))
@final
class DateTimeArrowType(pa.ExtensionType):
    """An Arrow extension type for `astropy.time.Time`, stored as TAI
    nanoseconds since 1970-01-01.
    """

    def __init__(self) -> None:
        # Bug fix: this previously passed _ToArrowTimespan.storage_type (a
        # struct of two int64s), but _ToArrowDateTime.finish builds pa.int64()
        # storage arrays and pa.ExtensionArray.from_storage requires the
        # storage array's type to match the extension type's storage type.
        super().__init__(_ToArrowDateTime.storage_type, "astropy.time.Time")

    def __arrow_ext_serialize__(self) -> bytes:
        # The type carries no parameters, so there is nothing to serialize.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type: pa.DataType, serialized: bytes) -> DateTimeArrowType:
        # No parameters to restore; every instance is equivalent.
        return cls()

    def __arrow_ext_scalar_class__(self) -> type[DateTimeArrowScalar]:
        # Scalars extracted from arrays of this type are DateTimeArrowScalar.
        return DateTimeArrowScalar
@final
class DateTimeArrowScalar(pa.ExtensionScalar):
    """An Arrow scalar type for `astropy.time.Time`, stored as TAI
    nanoseconds since 1970-01-01.

    Use the standard `as_py` method to obtain an actual `astropy.time.Time`
    instance.
    """

    def as_py(self, **_unused: Any) -> astropy.time.Time:
        # Convert the stored nanosecond count back into an astropy Time.
        return TimeConverter().nsec_to_astropy(self.value.as_py())
# Register the extension types globally so pyarrow can reconstruct them (and
# hand back their scalar classes) when deserializing data.
# NOTE(review): UUIDArrowType was previously never registered even though it
# defines __arrow_ext_deserialize__ like its siblings; registering it here so
# UUID columns round-trip — confirm it is not registered elsewhere.
pa.register_extension_type(UUIDArrowType())
pa.register_extension_type(RegionArrowType())
pa.register_extension_type(TimespanArrowType())
pa.register_extension_type(DateTimeArrowType())