Coverage for python/lsst/daf/butler/formatters/parquet.py: 13%

415 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-04-13 02:34 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "ParquetFormatter", 

26 "arrow_to_pandas", 

27 "arrow_to_astropy", 

28 "arrow_to_numpy", 

29 "arrow_to_numpy_dict", 

30 "pandas_to_arrow", 

31 "pandas_to_astropy", 

32 "astropy_to_arrow", 

33 "numpy_to_arrow", 

34 "numpy_to_astropy", 

35 "numpy_dict_to_arrow", 

36 "arrow_schema_to_pandas_index", 

37 "DataFrameSchema", 

38 "ArrowAstropySchema", 

39 "ArrowNumpySchema", 

40) 

41 

42import collections.abc 

43import itertools 

44import json 

45import re 

46from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Sequence, cast 

47 

48import pyarrow as pa 

49import pyarrow.parquet as pq 

50from lsst.daf.butler import Formatter 

51from lsst.utils.introspection import get_full_type_name 

52from lsst.utils.iteration import ensure_iterable 

53 

54if TYPE_CHECKING: 

55 import astropy.table as atable 

56 import numpy as np 

57 import pandas as pd 

58 

59 

class ParquetFormatter(Formatter):
    """Interface for reading and writing Arrow Table objects to and from
    Parquet files.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        schema = pq.read_schema(self.fileDescriptor.location.path)

        # Parquet files written without any key/value metadata have
        # schema.metadata == None (not an empty dict); normalize once so
        # every membership test below is safe.  Previously the rowcount
        # and pandas checks dereferenced schema.metadata directly and
        # would raise TypeError on such files.
        metadata = schema.metadata if schema.metadata is not None else {}

        if component in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return schema
        elif component == "rowcount":
            # Get the rowcount from the metadata if possible, otherwise count.
            if b"lsst::arrow::rowcount" in metadata:
                return int(metadata[b"lsst::arrow::rowcount"])

            # Fall back to reading a single column and counting its rows.
            temp_table = pq.read_table(
                self.fileDescriptor.location.path,
                columns=[schema.names[0]],
                use_threads=False,
                use_pandas_metadata=False,
            )

            return len(temp_table[schema.names[0]])

        par_columns = None
        if self.fileDescriptor.parameters:
            par_columns = self.fileDescriptor.parameters.pop("columns", None)
            if par_columns:
                has_pandas_multi_index = False
                if b"pandas" in metadata:
                    md = json.loads(metadata[b"pandas"])
                    if len(md["column_indexes"]) > 1:
                        has_pandas_multi_index = True

                if not has_pandas_multi_index:
                    # Ensure uniqueness, keeping order.
                    par_columns = list(dict.fromkeys(ensure_iterable(par_columns)))
                    file_columns = [name for name in schema.names if not name.startswith("__")]

                    for par_column in par_columns:
                        if par_column not in file_columns:
                            raise ValueError(
                                f"Column {par_column} specified in parameters not available in parquet file."
                            )
                else:
                    par_columns = _standardize_multi_index_columns(schema, par_columns)

            if len(self.fileDescriptor.parameters):
                raise ValueError(
                    f"Unsupported parameters {self.fileDescriptor.parameters} in ArrowTable read."
                )

        arrow_table = pq.read_table(
            self.fileDescriptor.location.path,
            columns=par_columns,
            use_threads=False,
            use_pandas_metadata=(b"pandas" in metadata),
        )

        return arrow_table

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        import numpy as np
        from astropy.table import Table as astropyTable

        arrow_table = None
        if isinstance(inMemoryDataset, pa.Table):
            # This will be the most likely match.
            arrow_table = inMemoryDataset
        elif isinstance(inMemoryDataset, astropyTable):
            arrow_table = astropy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, np.ndarray):
            arrow_table = numpy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, dict):
            try:
                arrow_table = numpy_dict_to_arrow(inMemoryDataset)
            except (TypeError, AttributeError) as e:
                raise ValueError(
                    "Input dict for inMemoryDataset does not appear to be a dict of numpy arrays."
                ) from e
        elif hasattr(inMemoryDataset, "to_parquet"):
            # This may be a pandas DataFrame; import lazily in case pandas
            # is not available in this environment.
            try:
                import pandas as pd
            except ImportError:
                pd = None

            if pd is not None and isinstance(inMemoryDataset, pd.DataFrame):
                arrow_table = pandas_to_arrow(inMemoryDataset)

        if arrow_table is None:
            raise ValueError(
                f"Unsupported type {get_full_type_name(inMemoryDataset)} of "
                "inMemoryDataset for ParquetFormatter."
            )

        location = self.makeUpdatedLocation(self.fileDescriptor.location)

        pq.write_table(arrow_table, location.path)

167 

def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:
    """Convert a pyarrow table to a pandas DataFrame.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has ``pandas`` metadata
        in the schema it will be used in the construction of the
        ``DataFrame``.

    Returns
    -------
    dataframe : `pandas.DataFrame`
        Converted pandas dataframe.
    """
    # Single-threaded conversion for determinism; keep nullable integers
    # as Python objects instead of promoting to float.
    dataframe = arrow_table.to_pandas(integer_object_nulls=True, use_threads=False)
    return dataframe

184 

185 

def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
    """Convert a pyarrow table to an `astropy.Table`.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has astropy unit
        metadata in the schema it will be used in the construction
        of the ``astropy.Table``.

    Returns
    -------
    table : `astropy.Table`
        Converted astropy table.
    """
    from astropy.table import Table

    # Build the table from plain numpy columns first, then layer any
    # serialized astropy metadata (units, descriptions, ...) on top.
    table = Table(arrow_to_numpy_dict(arrow_table))

    schema_metadata = arrow_table.schema.metadata
    _apply_astropy_metadata(table, schema_metadata if schema_metadata is not None else {})

    return table

210 

211 

def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray:
    """Convert a pyarrow table to a structured numpy array.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table.

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and the same column names
        as the input arrow table.
    """
    import numpy as np

    columns = arrow_to_numpy_dict(arrow_table)

    # Multidimensional columns record their trailing shape in the dtype;
    # 1-d (or scalar) columns use the plain element dtype.
    dtype = [
        (name, col.dtype) if col.ndim <= 1 else (name, (col.dtype, col.shape[1:]))
        for name, col in columns.items()
    ]

    return np.rec.fromarrays(columns.values(), dtype=dtype)

240 

241 

def arrow_to_numpy_dict(arrow_table: pa.Table) -> dict[str, np.ndarray]:
    """Convert a pyarrow table to a dict of numpy arrays.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    import numpy as np

    schema = arrow_table.schema
    # schema.metadata is None (not {}) when the table carries no key/value
    # metadata; normalize so the shape lookup below is safe.
    metadata = schema.metadata if schema.metadata is not None else {}

    numpy_dict = {}

    for name in schema.names:
        t = schema.field(name).type

        if arrow_table[name].null_count == 0:
            # Regular non-masked column
            col = arrow_table[name].to_numpy()
        else:
            # For a masked column, we need to ask arrow to fill the null
            # values with an appropriately typed value before conversion.
            # Then we apply the mask to get a masked array of the correct type.

            if t in (pa.string(), pa.binary()):
                dummy = ""
            else:
                # Zero of the column's equivalent pandas/numpy scalar type.
                dummy = t.to_pandas_dtype()(0)

            col = np.ma.masked_array(
                data=arrow_table[name].fill_null(dummy).to_numpy(),
                mask=arrow_table[name].is_null().to_numpy(),
            )

        if t in (pa.string(), pa.binary()):
            # Coerce object/str columns to fixed-width numpy strings; the
            # width comes from lsst::arrow::len metadata when present.
            col = col.astype(_arrow_string_to_numpy_dtype(schema, name, col))
        elif isinstance(t, pa.FixedSizeListType):
            # Fixed-size-list columns encode multidimensional fields.
            if len(col) > 0:
                col = np.stack(col)
            else:
                # this is an empty column, and needs to be coerced to type.
                col = col.astype(t.value_type.to_pandas_dtype())

            # Restore the original trailing shape recorded in
            # lsst::arrow::shape metadata (falls back to the flat list size).
            shape = _multidim_shape_from_metadata(metadata, t.list_size, name)
            col = col.reshape((len(arrow_table), *shape))

        numpy_dict[name] = col

    return numpy_dict

298 

299 

def _numpy_dict_to_numpy(numpy_dict: dict[str, np.ndarray]) -> np.ndarray:
    """Convert a dict of numpy arrays to a structured numpy array.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and columns names from the dict keys.
    """
    # Round-trip through arrow so all conversion rules live in one place.
    arrow_table = numpy_dict_to_arrow(numpy_dict)
    return arrow_to_numpy(arrow_table)

314 

315 

def _numpy_to_numpy_dict(np_array: np.ndarray) -> dict[str, np.ndarray]:
    """Convert a structured numpy array to a dict of numpy arrays.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    # Round-trip through arrow so all conversion rules live in one place.
    arrow_table = numpy_to_arrow(np_array)
    return arrow_to_numpy_dict(arrow_table)

330 

331 

def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
    """Convert a numpy array table to an arrow table.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    dtype = np_array.dtype
    nrows = len(np_array)

    # Record the row count plus per-field string lengths and
    # multidimensional shapes so the round trip is lossless.
    metadata = {b"lsst::arrow::rowcount": str(nrows)}
    for field_name in dtype.names:
        _append_numpy_string_metadata(metadata, field_name, dtype[field_name])
        _append_numpy_multidim_metadata(metadata, field_name, dtype[field_name])

    schema = pa.schema(_numpy_dtype_to_arrow_types(dtype), metadata=metadata)

    arrays = _numpy_style_arrays_to_arrow_arrays(
        dtype,
        nrows,
        np_array,
        schema,
    )

    return pa.Table.from_arrays(arrays, schema=schema)

366 

367 

def numpy_dict_to_arrow(numpy_dict: dict[str, np.ndarray]) -> pa.Table:
    """Convert a dict of numpy arrays to an arrow table.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.

    Raises
    ------
    ValueError if columns in numpy_dict have unequal numbers of rows.
    """
    dtype, rowcount = _numpy_dict_to_dtype(numpy_dict)

    # Record the row count plus per-field string lengths and
    # multidimensional shapes so the round trip is lossless.
    metadata = {b"lsst::arrow::rowcount": str(rowcount)}
    if dtype.names is not None:
        for field_name in dtype.names:
            _append_numpy_string_metadata(metadata, field_name, dtype[field_name])
            _append_numpy_multidim_metadata(metadata, field_name, dtype[field_name])

    schema = pa.schema(_numpy_dtype_to_arrow_types(dtype), metadata=metadata)

    arrays = _numpy_style_arrays_to_arrow_arrays(
        dtype,
        rowcount,
        numpy_dict,
        schema,
    )

    return pa.Table.from_arrays(arrays, schema=schema)

408 

409 

def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
    """Convert an astropy table to an arrow table.

    Parameters
    ----------
    astropy_table : `astropy.Table`
        Input astropy table.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    from astropy.table import meta

    dtype = astropy_table.dtype
    nrows = len(astropy_table)

    # Record the row count plus per-field string lengths and
    # multidimensional shapes so the round trip is lossless.
    metadata = {b"lsst::arrow::rowcount": str(nrows)}
    for field_name in dtype.names:
        _append_numpy_string_metadata(metadata, field_name, dtype[field_name])
        _append_numpy_multidim_metadata(metadata, field_name, dtype[field_name])

    # Serialize astropy table/column metadata (units, descriptions, ...)
    # as YAML so it survives the trip through the arrow schema.
    metadata[b"table_meta_yaml"] = "\n".join(meta.get_yaml_from_table(astropy_table))

    schema = pa.schema(_numpy_dtype_to_arrow_types(dtype), metadata=metadata)

    arrays = _numpy_style_arrays_to_arrow_arrays(
        dtype,
        nrows,
        astropy_table,
        schema,
    )

    return pa.Table.from_arrays(arrays, schema=schema)

450 

451 

def _astropy_to_numpy_dict(astropy_table: atable.Table) -> dict[str, np.ndarray]:
    """Convert an astropy table to a dict of numpy arrays.

    Parameters
    ----------
    astropy_table : `astropy.Table`
        Input astropy table.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    # Round-trip through arrow so all conversion rules live in one place.
    return arrow_to_numpy_dict(astropy_to_arrow(astropy_table))

466 

467 

def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Table:
    """Convert a pandas dataframe to an arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    arrow_table = pa.Table.from_pandas(dataframe)

    # Update the metadata
    md = arrow_table.schema.metadata

    md[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

    # We loop through the arrow table columns because the datatypes have
    # been checked and converted from pandas objects.
    for name in arrow_table.column_names:
        if not name.startswith("__") and arrow_table[name].type == pa.string():
            # Use default= so an empty column — or a non-empty column in
            # which every row is null — falls back to default_length
            # instead of max() raising ValueError on an empty sequence.
            strlen = max(
                (len(row.as_py()) for row in arrow_table[name] if row.is_valid),
                default=default_length,
            )
            md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

    arrow_table = arrow_table.replace_schema_metadata(md)

    return arrow_table

505 

506 

def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
    """Convert a pandas dataframe to an astropy table, preserving indexes.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        Converted astropy table.
    """
    import pandas as pd
    from astropy.table import Table

    # Multi-index columns have no astropy representation.
    if isinstance(dataframe.columns, pd.MultiIndex):
        raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

    # index=True keeps the dataframe index as a table column.
    astropy_table = Table.from_pandas(dataframe, index=True)
    return astropy_table

527 

528 

def _pandas_to_numpy_dict(dataframe: pd.DataFrame) -> dict[str, np.ndarray]:
    """Convert a pandas dataframe to a dict of numpy arrays.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    # Round-trip through arrow so all conversion rules live in one place.
    arrow_table = pandas_to_arrow(dataframe)
    return arrow_to_numpy_dict(arrow_table)

543 

544 

def numpy_to_astropy(np_array: np.ndarray) -> atable.Table:
    """Convert a numpy table to an astropy table.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        Converted astropy table.
    """
    from astropy.table import Table

    # copy=False: the table shares the input array's buffer.
    astropy_table = Table(data=np_array, copy=False)
    return astropy_table

561 

562 

def arrow_schema_to_pandas_index(schema: pa.Schema) -> pd.Index | pd.MultiIndex:
    """Convert an arrow schema to a pandas index/multiindex.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    index : `pandas.Index` or `pandas.MultiIndex`
        Converted pandas index.
    """
    import pandas as pd

    # schema.metadata is None (not {}) when the schema carries no
    # key/value metadata; guard so the membership test cannot raise
    # TypeError (matches the guard used elsewhere in this module).
    metadata = schema.metadata if schema.metadata is not None else {}

    if b"pandas" in metadata:
        md = json.loads(metadata[b"pandas"])
        indexes = md["column_indexes"]
        len_indexes = len(indexes)
    else:
        len_indexes = 0

    if len_indexes <= 1:
        # Single-level index; drop pandas-internal "__*" columns.
        return pd.Index([name for name in schema.names if not name.startswith("__")])
    else:
        # Reconstruct the multi-index tuples from the flattened names.
        raw_columns = _split_multi_index_column_names(len_indexes, schema.names)
        return pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])

590 

591 

def arrow_schema_to_column_list(schema: pa.Schema) -> list[str]:
    """Convert an arrow schema to a list of string column names.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    column_list : `list` [`str`]
        Converted list of column names.
    """
    return list(schema.names)

606 

607 

class DataFrameSchema:
    """Wrapper class for a schema for a pandas DataFrame.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Dataframe to turn into a schema.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        # A zero-row slice preserves column names, dtypes, and index
        # structure without holding any data.
        self._schema = dataframe.loc[[False] * len(dataframe)]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> DataFrameSchema:
        """Convert an arrow schema into a `DataFrameSchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            The pyarrow schema to convert.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        # An empty row list suffices; the schema supplies the columns.
        # (The original passed ``[] * len(schema.names)``, which is
        # always just ``[]``.)
        empty_table = pa.Table.from_pylist([], schema=schema)

        return cls(empty_table.to_pandas())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        arrow_table = pa.Table.from_pandas(self._schema)

        return arrow_table.schema

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        return ArrowNumpySchema.from_arrow(self.to_arrow_schema())

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an ArrowAstropySchema.

        Returns
        -------
        arrow_astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    @property
    def schema(self) -> pd.DataFrame:
        # Corrected annotation: this is the zero-row dataframe, not a
        # numpy dtype.
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataFrameSchema):
            return NotImplemented

        return self._schema.equals(other._schema)

682 

683 

class ArrowAstropySchema:
    """Wrapper class for a schema for an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Input astropy table.
    """

    def __init__(self, astropy_table: atable.Table) -> None:
        # A zero-row slice keeps column names, dtypes, units, and other
        # column metadata without holding any data.
        self._schema = astropy_table[:0]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowAstropySchema:
        """Convert an arrow schema into a ArrowAstropySchema.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Input pyarrow schema.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        import numpy as np
        from astropy.table import Table

        # Build an empty table with the right structure, then restore any
        # serialized astropy metadata carried by the schema.
        empty = np.zeros(0, dtype=_schema_to_dtype_list(schema))
        astropy_table = Table(data=empty)

        _apply_astropy_metadata(
            astropy_table, schema.metadata if schema.metadata is not None else {}
        )

        return cls(astropy_table)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        return astropy_to_arrow(self._schema).schema

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a DataFrameSchema.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        return DataFrameSchema.from_arrow(astropy_to_arrow(self._schema).schema)

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        return ArrowNumpySchema.from_arrow(astropy_to_arrow(self._schema).schema)

    @property
    def schema(self) -> atable.Table:
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowAstropySchema):
            return NotImplemented

        # Matching dtypes implies matching column names.
        if self._schema.dtype != other._schema.dtype:
            return False

        # Columns must also agree on unit, description, and format.
        return all(
            self._schema[name].unit == other._schema[name].unit
            and self._schema[name].description == other._schema[name].description
            and self._schema[name].format == other._schema[name].format
            for name in self._schema.columns
        )

780 

class ArrowNumpySchema:
    """Wrapper class for a schema for a numpy ndarray.

    Parameters
    ----------
    numpy_dtype : `numpy.dtype`
        Numpy dtype to convert.
    """

    def __init__(self, numpy_dtype: np.dtype) -> None:
        self._dtype = numpy_dtype

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowNumpySchema:
        """Convert an arrow schema into an `ArrowNumpySchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Pyarrow schema to convert.

        Returns
        -------
        numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        import numpy as np

        return cls(np.dtype(_schema_to_dtype_list(schema)))

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        import numpy as np

        # Round-trip an empty array through arrow to reuse its converters.
        empty = np.zeros(0, dtype=self._dtype)
        return ArrowAstropySchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a `DataFrameSchema`.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return DataFrameSchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to a `pyarrow.Schema`.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        import numpy as np

        return numpy_to_arrow(np.zeros(0, dtype=self._dtype)).schema

    @property
    def schema(self) -> np.dtype:
        return self._dtype

    def __repr__(self) -> str:
        return repr(self._dtype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowNumpySchema):
            return NotImplemented

        return bool(self._dtype == other._dtype)

865 

866def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequence[str]]: 

867 """Split a string that represents a multi-index column. 

868 

869 PyArrow maps Pandas' multi-index column names (which are tuples in Python) 

870 to flat strings on disk. This routine exists to reconstruct the original 

871 tuple. 

872 

873 Parameters 

874 ---------- 

875 n : `int` 

876 Number of levels in the `pandas.MultiIndex` that is being 

877 reconstructed. 

878 names : `~collections.abc.Iterable` [`str`] 

879 Strings to be split. 

880 

881 Returns 

882 ------- 

883 column_names : `list` [`tuple` [`str`]] 

884 A list of multi-index column name tuples. 

885 """ 

886 column_names: List[Sequence[str]] = [] 

887 

888 pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n))) 

889 for name in names: 

890 m = re.search(pattern, name) 

891 if m is not None: 

892 column_names.append(m.groups()) 

893 

894 return column_names 

895 

896 

def _standardize_multi_index_columns(
    schema: pa.Schema,
    columns: Any,
    stringify: bool = True,
) -> list[str | Sequence[Any]]:
    """Transform a dictionary/iterable index from a multi-index column list
    into a string directly understandable by PyArrow.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Pyarrow schema.
    columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]
        Columns to standardize.
    stringify : `bool`, optional
        Should the column names be stringified?

    Returns
    -------
    names : `list` [`str`]
        Stringified representation of a multi-index column name.
    """
    pd_index = arrow_schema_to_pandas_index(schema)
    index_level_names = tuple(pd_index.names)

    standardized: list[str | Sequence[Any]] = []

    if isinstance(columns, list):
        # List form: every entry must already be a full multi-index tuple.
        for item in columns:
            if not isinstance(item, tuple):
                raise ValueError(
                    "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                    f"Instead got a {get_full_type_name(item)}."
                )
            standardized.append(str(item) if stringify else item)
        return standardized

    # Dict form: each key selects values for one index level; missing
    # levels default to every value at that level.
    if not isinstance(columns, collections.abc.Mapping):
        raise ValueError(
            "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
            f"Instead got a {get_full_type_name(columns)}."
        )
    if not set(index_level_names).issuperset(columns.keys()):
        raise ValueError(
            f"Cannot use dict with keys {set(columns.keys())} to select columns from {index_level_names}."
        )

    factors = [
        ensure_iterable(columns.get(level, pd_index.levels[i]))
        for i, level in enumerate(index_level_names)
    ]
    for combo in itertools.product(*factors):
        # Validate each level's value against the known index levels.
        for i, value in enumerate(combo):
            if value not in pd_index.levels[i]:
                raise ValueError(f"Unrecognized value {value!r} for index {index_level_names[i]!r}.")
        standardized.append(str(combo) if stringify else combo)

    return standardized

959 

960 

def _apply_astropy_metadata(astropy_table: atable.Table, metadata: dict) -> None:
    """Apply any astropy metadata from the schema metadata.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Table to apply metadata.
    metadata : `dict` [`bytes`]
        Metadata dict.
    """
    from astropy.table import meta

    raw_yaml = metadata.get(b"table_meta_yaml", None)
    if not raw_yaml:
        # Nothing serialized with this table; leave it untouched.
        return

    header = meta.get_header_from_yaml(raw_yaml.decode("UTF8").split("\n"))

    # Set description, format, unit, meta from the column
    # metadata that was serialized with the table.
    columns_by_name = {entry["name"]: entry for entry in header["datatype"]}
    for column in astropy_table.columns.values():
        info = columns_by_name[column.name]
        for attr in ("description", "format", "unit", "meta"):
            if attr in info:
                setattr(column, attr, info[attr])

    if "meta" in header:
        astropy_table.meta.update(header["meta"])

989 

def _arrow_string_to_numpy_dtype(
    schema: pa.Schema, name: str, numpy_column: np.ndarray | None = None, default_length: int = 10
) -> str:
    """Get the numpy dtype string associated with an arrow column.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Arrow table schema.
    name : `str`
        Column name.
    numpy_column : `numpy.ndarray`, optional
        Column to determine numpy string dtype.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    dtype_str : `str`
        Numpy dtype string.
    """
    metadata = schema.metadata if schema.metadata is not None else {}
    key = f"lsst::arrow::len::{name}".encode("UTF-8")

    # Prefer the explicit length recorded in metadata; otherwise infer it
    # from the longest value in the provided column; otherwise fall back
    # to the default.
    if key in metadata:
        strlen = int(metadata[key])
    elif numpy_column is not None and len(numpy_column) > 0:
        strlen = max(len(row) for row in numpy_column)
    else:
        strlen = default_length

    # Unicode for arrow strings, raw bytes otherwise.
    if schema.field(name).type == pa.string():
        return f"U{strlen}"
    return f"|S{strlen}"

1026 

1027 

1028def _append_numpy_string_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None: 

1029 """Append numpy string length keys to arrow metadata. 

1030 

1031 All column types are handled, but the metadata is only modified for 

1032 string and byte columns. 

1033 

1034 Parameters 

1035 ---------- 

1036 metadata : `dict` [`bytes`, `str`] 

1037 Metadata dictionary; modified in place. 

1038 name : `str` 

1039 Column name. 

1040 dtype : `np.dtype` 

1041 Numpy dtype. 

1042 """ 

1043 import numpy as np 

1044 

1045 if dtype.type is np.str_: 

1046 metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(dtype.itemsize // 4) 

1047 elif dtype.type is np.bytes_: 

1048 metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(dtype.itemsize) 

1049 

1050 

1051def _append_numpy_multidim_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None: 

1052 """Append numpy multi-dimensional shapes to arrow metadata. 

1053 

1054 All column types are handled, but the metadata is only modified for 

1055 multi-dimensional columns. 

1056 

1057 Parameters 

1058 ---------- 

1059 metadata : `dict` [`bytes`, `str`] 

1060 Metadata dictionary; modified in place. 

1061 name : `str` 

1062 Column name. 

1063 dtype : `np.dtype` 

1064 Numpy dtype. 

1065 """ 

1066 if len(dtype.shape) > 1: 

1067 metadata[f"lsst::arrow::shape::{name}".encode("UTF-8")] = str(dtype.shape) 

1068 

1069 

1070def _multidim_shape_from_metadata(metadata: dict[bytes, bytes], list_size: int, name: str) -> tuple[int, ...]: 

1071 """Retrieve the shape from the metadata, if available. 

1072 

1073 Parameters 

1074 ---------- 

1075 metadata : `dict` [`bytes`, `bytes`] 

1076 Metadata dictionary. 

1077 list_size : `int` 

1078 Size of the list datatype. 

1079 name : `str` 

1080 Column name. 

1081 

1082 Returns 

1083 ------- 

1084 shape : `tuple` [`int`] 

1085 Shape associated with the column. 

1086 

1087 Raises 

1088 ------ 

1089 RuntimeError 

1090 Raised if metadata is found but has incorrect format. 

1091 """ 

1092 md_name = f"lsst::arrow::shape::{name}" 

1093 if (encoded := md_name.encode("UTF-8")) in metadata: 

1094 groups = re.search(r"\((.*)\)", metadata[encoded].decode("UTF-8")) 

1095 if groups is None: 

1096 raise RuntimeError("Illegal value found in metadata.") 

1097 shape = tuple(int(x) for x in groups[1].split(",") if x != "") 

1098 else: 

1099 shape = (list_size,) 

1100 

1101 return shape 

1102 

1103 

def _schema_to_dtype_list(schema: pa.Schema) -> list[tuple[str, tuple[Any] | str]]:
    """Convert a pyarrow schema to a numpy dtype.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    dtype_list: `list` [`tuple`]
        A list with name, type pairs.
    """
    metadata = schema.metadata if schema.metadata is not None else {}

    dtype_list: list[Any] = []
    for name in schema.names:
        field_type = schema.field(name).type
        if isinstance(field_type, pa.FixedSizeListType):
            # Fixed-size list columns become multi-dimensional numpy
            # fields; the shape comes from the metadata when present.
            shape = _multidim_shape_from_metadata(metadata, field_type.list_size, name)
            dtype_list.append((name, (field_type.value_type.to_pandas_dtype(), shape)))
        elif field_type in (pa.string(), pa.binary()):
            # String/bytes columns need an explicit fixed-width dtype.
            dtype_list.append((name, _arrow_string_to_numpy_dtype(schema, name)))
        else:
            dtype_list.append((name, field_type.to_pandas_dtype()))

    return dtype_list

1131 

1132 

def _numpy_dtype_to_arrow_types(dtype: np.dtype) -> list[Any]:
    """Convert a numpy dtype to a list of arrow types.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype to convert.

    Returns
    -------
    type_list : `list` [`object`]
        Converted list of arrow types.
    """
    from math import prod

    import numpy as np

    if dtype.names is None:
        # Not a structured dtype: no named columns to convert.
        return []

    type_list: list[Any] = []
    for name in dtype.names:
        field_dtype = dtype[name]
        arrow_type: Any
        if len(field_dtype.shape) > 0:
            # Multi-dimensional field: flatten it into a fixed-size
            # arrow list of the base scalar type.
            base = cast(tuple[np.dtype, tuple[int, ...]], field_dtype.subdtype)[0]
            arrow_type = pa.list_(pa.from_numpy_dtype(base.type), prod(field_dtype.shape))
        else:
            arrow_type = pa.from_numpy_dtype(field_dtype.type)
        type_list.append((name, arrow_type))

    return type_list

1167 

1168 

1169def _numpy_dict_to_dtype(numpy_dict: dict[str, np.ndarray]) -> tuple[np.dtype, int]: 

1170 """Extract equivalent table dtype from dict of numpy arrays. 

1171 

1172 Parameters 

1173 ---------- 

1174 numpy_dict : `dict` [`str`, `numpy.ndarray`] 

1175 Dict with keys as the column names, values as the arrays. 

1176 

1177 Returns 

1178 ------- 

1179 dtype : `numpy.dtype` 

1180 dtype of equivalent table. 

1181 rowcount : `int` 

1182 Number of rows in the table. 

1183 

1184 Raises 

1185 ------ 

1186 ValueError if columns in numpy_dict have unequal numbers of rows. 

1187 """ 

1188 import numpy as np 

1189 

1190 dtype_list = [] 

1191 rowcount = 0 

1192 for name, col in numpy_dict.items(): 

1193 if rowcount == 0: 

1194 rowcount = len(col) 

1195 if len(col) != rowcount: 

1196 raise ValueError(f"Column {name} has a different number of rows.") 

1197 if len(col.shape) == 1: 

1198 dtype_list.append((name, col.dtype)) 

1199 else: 

1200 dtype_list.append((name, (col.dtype, col.shape[1:]))) 

1201 dtype = np.dtype(dtype_list) 

1202 

1203 return (dtype, rowcount) 

1204 

1205 

def _numpy_style_arrays_to_arrow_arrays(
    dtype: np.dtype,
    rowcount: int,
    np_style_arrays: dict[str, np.ndarray] | np.ndarray | atable.Table,
    schema: pa.Schema,
) -> list[pa.Array]:
    """Convert numpy-style arrays to arrow arrays.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype of input table/arrays.
    rowcount : `int`
        Number of rows in input table/arrays.
    np_style_arrays : `dict` [`str`, `np.ndarray`] or `np.ndarray`
        or `astropy.table.Table`
        Arrays to convert to arrow.
    schema : `pyarrow.Schema`
        Schema of arrow table.

    Returns
    -------
    arrow_arrays : `list` [`pyarrow.Array`]
        List of converted pyarrow arrays.
    """
    import numpy as np

    arrow_arrays: list[pa.Array] = []
    if dtype.names is None:
        # Not a structured dtype: nothing to convert.
        return arrow_arrays

    for name in dtype.names:
        dt = dtype[name]
        val: Any
        if len(dt.shape) > 0:
            # Multi-dimensional column: flatten and split into one
            # sub-array per row so arrow can store fixed-size lists.
            if rowcount > 0:
                val = np.split(np_style_arrays[name].ravel(), rowcount)
            else:
                val = []
        else:
            val = np_style_arrays[name]

        try:
            arrow_arrays.append(pa.array(val, type=schema.field(name).type))
        except pa.ArrowNotImplementedError as err:
            # Check if the column is big-endian; arrow requires
            # little-endian buffers.  Take the dtype from the source
            # column rather than ``val``: in the multi-dimensional
            # branch ``val`` is a plain list (from np.split) and has no
            # ``dtype`` attribute, which previously raised
            # AttributeError here instead of performing the conversion.
            col_dtype = np_style_arrays[name].dtype
            if (np.little_endian and col_dtype.byteorder == ">") or (
                not np.little_endian and col_dtype.byteorder == "="
            ):
                # We need to convert the array to little-endian:
                # byteswap the bytes, then reinterpret them with a
                # little-endian dtype of the same width.
                little = col_dtype.newbyteorder("<")
                if isinstance(val, list):
                    val2: Any = [row.byteswap().view(little) for row in val]
                else:
                    val2 = val.byteswap().view(little)
                arrow_arrays.append(pa.array(val2, type=schema.field(name).type))
            else:
                # This failed for some other reason so raise the exception.
                raise err

    return arrow_arrays