Coverage for python / lsst / daf / butler / arrow_utils.py: 68%

219 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-24 08:17 +0000

1# This file is part of butler4. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

# Public API of this module: the `ToArrow` converter interface plus the Arrow
# extension types and their scalar classes.  The private ``_ToArrow*``
# implementations are deliberately not exported.
__all__ = (
    "DateTimeArrowScalar",
    "DateTimeArrowType",
    "RegionArrowScalar",
    "RegionArrowType",
    "TimespanArrowScalar",
    "TimespanArrowType",
    "ToArrow",
    "UUIDArrowScalar",
    "UUIDArrowType",
)

41 

42import uuid 

43from abc import ABC, abstractmethod 

44from typing import Any, ClassVar, final 

45 

46import astropy.time 

47import pyarrow as pa 

48 

49from lsst.sphgeom import Region 

50 

51from ._timespan import Timespan 

52from .time_utils import TimeConverter 

53 

54 

class ToArrow(ABC):
    """Abstract base class for converters that turn Python objects into the
    values of a single Arrow field.

    Instances are normally obtained from the ``for_*`` static methods.  A
    column is built by calling `append` once per row on a plain `list` and
    then handing that list to `finish`.
    """

    @staticmethod
    def for_primitive(name: str, data_type: pa.DataType, nullable: bool) -> ToArrow:
        """Construct a converter for a type Arrow already supports natively.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        data_type : `pyarrow.DataType`
            Arrow data type object.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowPrimitive(name=name, data_type=data_type, nullable=nullable)

    @staticmethod
    def for_uuid(name: str, nullable: bool = True) -> ToArrow:
        """Construct a converter for `uuid.UUID` values.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowUUID(name=name, nullable=nullable)

    @staticmethod
    def for_region(name: str, nullable: bool = True) -> ToArrow:
        """Construct a converter for `lsst.sphgeom.Region` values.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowRegion(name=name, nullable=nullable)

    @staticmethod
    def for_timespan(name: str, nullable: bool = True) -> ToArrow:
        """Construct a converter for `lsst.daf.butler.Timespan` values.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowTimespan(name=name, nullable=nullable)

    @staticmethod
    def for_datetime(name: str, nullable: bool = True) -> ToArrow:
        """Construct a converter for `astropy.time.Time` values, which are
        stored as TAI nanoseconds since 1970-01-01.

        Parameters
        ----------
        name : `str`
            Name of the schema field.
        nullable : `bool`
            Whether the field should permit null or `None` values.

        Returns
        -------
        to_arrow : `ToArrow`
            Converter instance.
        """
        return _ToArrowDateTime(name=name, nullable=nullable)

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the field."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def nullable(self) -> bool:
        """Whether the field permits null or `None` values."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def data_type(self) -> pa.DataType:
        """Arrow data type for the field this converter produces."""
        raise NotImplementedError()

    @property
    def field(self) -> pa.Field:
        """Arrow field this converter produces, assembled from `name`,
        `data_type` and `nullable`.
        """
        return pa.field(self.name, self.data_type, nullable=self.nullable)

    def dictionary_encoded(self) -> ToArrow:
        """Return a new converter with the same name and value type that
        uses dictionary encoding (to 32-bit integers) to compress duplicate
        values.
        """
        return _ToArrowDictionary(to_arrow_value=self)

    @abstractmethod
    def append(self, value: Any, column: list[Any]) -> None:
        """Append the Arrow representation of one object to a `list`.

        Parameters
        ----------
        value : `object`
            Original value to be converted to a row in an Arrow column.
        column : `list`
            List of values to append to.  What exactly gets appended is up
            to the implementation; the only requirement is that `finish`
            can later handle this `list`.
        """
        raise NotImplementedError()

    @abstractmethod
    def finish(self, column: list[Any]) -> pa.Array:
        """Turn a list of values built up by `append` into an Arrow array.

        Parameters
        ----------
        column : `list`
            List of column values populated by `append`.
        """
        raise NotImplementedError()

208 

209 

class _ToArrowPrimitive(ToArrow):
    """`ToArrow` implementation for primitive types that Arrow supports
    directly.

    Should be constructed via the `ToArrow.for_primitive` factory method.
    """

    def __init__(self, name: str, data_type: pa.DataType, nullable: bool):
        self._name, self._data_type, self._nullable = name, data_type, nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._nullable

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return self._data_type

    def append(self, value: Any, column: list[Any]) -> None:
        # Docstring inherited.
        # Values are stored unchanged; conversion is left to pa.array.
        column.append(value)

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        return pa.array(column, type=self._data_type)

243 

244 

class _ToArrowDictionary(ToArrow):
    """`ToArrow` implementation for Arrow dictionary fields.

    Wraps another converter and delegates everything to it except the final
    encoding step.  Should be constructed via the
    `ToArrow.dictionary_encoded` factory method.
    """

    def __init__(self, to_arrow_value: ToArrow):
        self._to_arrow_value = to_arrow_value

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._to_arrow_value.name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._to_arrow_value.nullable

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        # The index type is hard-coded to int32 because that is what the
        # pa.Array.dictionary_encode() method (used in `finish`) does.
        index_type = pa.int32()
        value_type = self._to_arrow_value.data_type
        return pa.dictionary(index_type, value_type)

    def append(self, value: Any, column: list[Any]) -> None:
        # Docstring inherited.
        self._to_arrow_value.append(value, column)

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        plain_array = self._to_arrow_value.finish(column)
        return plain_array.dictionary_encode()

278 

279 

class _ToArrowUUID(ToArrow):
    """`ToArrow` implementation for `uuid.UUID` fields.

    Should be constructed via the `ToArrow.for_uuid` factory method.
    """

    # Each UUID is stored as a fixed-size 16-byte binary value.
    storage_type: ClassVar[pa.DataType] = pa.binary(16)

    def __init__(self, name: str, nullable: bool):
        self._name, self._nullable = name, nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._nullable

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return UUIDArrowType()

    def append(self, value: uuid.UUID | None, column: list[bytes | None]) -> None:
        # Docstring inherited.
        if value is None:
            column.append(None)
        else:
            column.append(value.bytes)

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        # Build the raw storage array first, then wrap it in the extension
        # type.
        raw = pa.array(column, self.storage_type)
        return pa.ExtensionArray.from_storage(UUIDArrowType(), raw)

315 

316 

class _ToArrowRegion(ToArrow):
    """`ToArrow` implementation for `lsst.sphgeom.Region` fields.

    Should be constructed via the `ToArrow.for_region` factory method.
    """

    # Regions are stored as variable-length binary, using the bytes returned
    # by the region's ``encode`` method.
    storage_type: ClassVar[pa.DataType] = pa.binary()

    def __init__(self, name: str, nullable: bool):
        self._name, self._nullable = name, nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._nullable

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return RegionArrowType()

    def append(self, value: Region | None, column: list[bytes | None]) -> None:
        # Docstring inherited.
        if value is None:
            column.append(None)
        else:
            column.append(value.encode())

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        # Build the raw storage array first, then wrap it in the extension
        # type.
        raw = pa.array(column, self.storage_type)
        return pa.ExtensionArray.from_storage(RegionArrowType(), raw)

352 

353 

class _ToArrowTimespan(ToArrow):
    """`ToArrow` implementation for `lsst.daf.butler.Timespan` fields.

    Should be constructed via the `ToArrow.for_timespan` factory method.
    """

    # A timespan is stored as a struct of two non-nullable 64-bit integers
    # taken from the two elements of `Timespan.nsec`.
    storage_type: ClassVar[pa.DataType] = pa.struct(
        [
            pa.field("begin_nsec", pa.int64(), nullable=False),
            pa.field("end_nsec", pa.int64(), nullable=False),
        ]
    )

    def __init__(self, name: str, nullable: bool):
        self._name, self._nullable = name, nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._nullable

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return TimespanArrowType()

    def append(self, value: Timespan | None, column: list[dict[str, int] | None]) -> None:
        # Docstring inherited.
        if value is None:
            column.append(None)
        else:
            column.append({"begin_nsec": value.nsec[0], "end_nsec": value.nsec[1]})

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        # Build the raw struct array first, then wrap it in the extension
        # type.
        raw = pa.array(column, self.storage_type)
        return pa.ExtensionArray.from_storage(TimespanArrowType(), raw)

394 

395 

class _ToArrowDateTime(ToArrow):
    """`ToArrow` implementation for `astropy.time.Time` fields.

    Should be constructed via the `ToArrow.for_datetime` factory method.
    """

    # Times are stored as 64-bit integers, as produced by
    # `TimeConverter.astropy_to_nsec`.
    storage_type: ClassVar[pa.DataType] = pa.int64()

    def __init__(self, name: str, nullable: bool):
        self._name, self._nullable = name, nullable

    @property
    def name(self) -> str:
        # Docstring inherited.
        return self._name

    @property
    def nullable(self) -> bool:
        # Docstring inherited.
        return self._nullable

    @property
    def data_type(self) -> pa.DataType:
        # Docstring inherited.
        return DateTimeArrowType()

    def append(self, value: astropy.time.Time | None, column: list[int | None]) -> None:
        # Docstring inherited.
        if value is None:
            column.append(None)
        else:
            column.append(TimeConverter().astropy_to_nsec(value))

    def finish(self, column: list[Any]) -> pa.Array:
        # Docstring inherited.
        # Build the raw integer array first, then wrap it in the extension
        # type.
        raw = pa.array(column, self.storage_type)
        return pa.ExtensionArray.from_storage(DateTimeArrowType(), raw)

431 

432 

@final
class UUIDArrowType(pa.ExtensionType):
    """Arrow extension type that represents `uuid.UUID`, stored as 16
    bytes.
    """

    def __init__(self) -> None:
        # The storage layout is defined on the converter class.
        storage = _ToArrowUUID.storage_type
        super().__init__(storage, "uuid.UUID")

    def __arrow_ext_serialize__(self) -> bytes:
        # The type takes no parameters, so there is no state to serialize.
        return bytes()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type: pa.DataType, serialized: bytes) -> UUIDArrowType:
        # Both arguments are ignored; every instance is identical.
        return cls()

    def __arrow_ext_scalar_class__(self) -> type[UUIDArrowScalar]:
        return UUIDArrowScalar

449 

450 

@final
class UUIDArrowScalar(pa.ExtensionScalar):
    """An Arrow scalar type for `uuid.UUID`.

    Use the standard `as_py` method to convert to an actual `uuid.UUID`
    instance.
    """

    def as_py(self, **_unused: Any) -> uuid.UUID | None:
        """Convert this scalar to a Python object.

        Parameters
        ----------
        **_unused
            Keyword arguments passed by the caller; accepted and ignored.

        Returns
        -------
        value : `uuid.UUID` or `None`
            The UUID rebuilt from the 16 stored bytes, or `None` if this
            scalar is null.
        """
        storage = self.value
        # A null scalar has no storage value; UUID columns are nullable by
        # default, so this must yield None rather than fail on None.as_py().
        if storage is None:
            return None
        return uuid.UUID(bytes=storage.as_py())

461 

462 

@final
class RegionArrowType(pa.ExtensionType):
    """Arrow extension type that represents `lsst.sphgeom.Region`."""

    def __init__(self) -> None:
        # The storage layout is defined on the converter class.
        storage = _ToArrowRegion.storage_type
        super().__init__(storage, "lsst.sphgeom.Region")

    def __arrow_ext_serialize__(self) -> bytes:
        # The type takes no parameters, so there is no state to serialize.
        return bytes()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type: pa.DataType, serialized: bytes) -> RegionArrowType:
        # Both arguments are ignored; every instance is identical.
        return cls()

    def __arrow_ext_scalar_class__(self) -> type[RegionArrowScalar]:
        return RegionArrowScalar

479 

480 

@final
class RegionArrowScalar(pa.ExtensionScalar):
    """An Arrow scalar type for `lsst.sphgeom.Region`.

    Use the standard `as_py` method to convert to an actual region.
    """

    def as_py(self, **_unused: Any) -> Region | None:
        """Convert this scalar to a Python object.

        Parameters
        ----------
        **_unused
            Keyword arguments passed by the caller; accepted and ignored.

        Returns
        -------
        value : `lsst.sphgeom.Region` or `None`
            The region decoded from the stored bytes, or `None` if this
            scalar is null.
        """
        storage = self.value
        # A null scalar has no storage value; region columns are nullable by
        # default, so this must yield None rather than fail on None.as_py().
        if storage is None:
            return None
        return Region.decode(storage.as_py())

490 

491 

@final
class TimespanArrowType(pa.ExtensionType):
    """Arrow extension type that represents `lsst.daf.butler.Timespan`."""

    def __init__(self) -> None:
        # The storage layout is defined on the converter class.
        storage = _ToArrowTimespan.storage_type
        super().__init__(storage, "lsst.daf.butler.Timespan")

    def __arrow_ext_serialize__(self) -> bytes:
        # The type takes no parameters, so there is no state to serialize.
        return bytes()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type: pa.DataType, serialized: bytes) -> TimespanArrowType:
        # Both arguments are ignored; every instance is identical.
        return cls()

    def __arrow_ext_scalar_class__(self) -> type[TimespanArrowScalar]:
        return TimespanArrowScalar

508 

509 

@final
class TimespanArrowScalar(pa.ExtensionScalar):
    """An Arrow scalar type for `lsst.daf.butler.Timespan`.

    Use the standard `as_py` method to convert to an actual timespan.
    """

    def as_py(self, **_unused: Any) -> Timespan | None:
        storage = self.value
        # Null scalars have no storage value and convert to None.
        if storage is None:
            return None
        # Rebuild the timespan from the two stored integer bounds.
        bounds = (storage["begin_nsec"].as_py(), storage["end_nsec"].as_py())
        return Timespan(None, None, _nsec=bounds)

524 

525 

@final
class DateTimeArrowType(pa.ExtensionType):
    """An Arrow extension type for `astropy.time.Time`, stored as TAI
    nanoseconds since 1970-01-01.
    """

    def __init__(self) -> None:
        # The storage type must be the int64 layout that
        # `_ToArrowDateTime.finish` builds and that
        # `DateTimeArrowScalar.as_py` reads back, not the struct layout used
        # for timespans; a mismatch makes the extension array unusable.
        super().__init__(_ToArrowDateTime.storage_type, "astropy.time.Time")

    def __arrow_ext_serialize__(self) -> bytes:
        # The type takes no parameters, so there is no state to serialize.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type: pa.DataType, serialized: bytes) -> DateTimeArrowType:
        # Both arguments are ignored; every instance is identical.
        return cls()

    def __arrow_ext_scalar_class__(self) -> type[DateTimeArrowScalar]:
        return DateTimeArrowScalar

544 

545 

@final
class DateTimeArrowScalar(pa.ExtensionScalar):
    """An Arrow scalar type for `astropy.time.Time`, stored as TAI
    nanoseconds since 1970-01-01.

    Use the standard `as_py` method to convert to an actual `astropy.time.Time`
    instance.
    """

    def as_py(self, **_unused: Any) -> astropy.time.Time | None:
        """Convert this scalar to a Python object.

        Parameters
        ----------
        **_unused
            Keyword arguments passed by the caller; accepted and ignored.

        Returns
        -------
        value : `astropy.time.Time` or `None`
            The time rebuilt from the stored integer via
            `TimeConverter.nsec_to_astropy`, or `None` if this scalar is
            null.
        """
        storage = self.value
        # A null scalar has no storage value; datetime columns are nullable
        # by default, so this must yield None rather than fail on
        # None.as_py().
        if storage is None:
            return None
        return TimeConverter().nsec_to_astropy(storage.as_py())

557 

558 

# Register every extension type defined in this module with pyarrow at
# import time, so that pyarrow can map the extension names stored in
# serialized data back to these classes.
pa.register_extension_type(RegionArrowType())
pa.register_extension_type(TimespanArrowType())
pa.register_extension_type(DateTimeArrowType())
# UUIDArrowType was the only extension type in this module left unregistered.
# NOTE(review): pyarrow raises if the same extension name is registered
# twice - confirm nothing else registers "uuid.UUID".
pa.register_extension_type(UUIDArrowType())