Coverage for python/lsst/daf/butler/formatters/parquet.py: 13%

390 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-01-25 02:36 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

# Explicit public API of this module for ``from ... import *``:
# the formatter class, the arrow <-> pandas/astropy/numpy conversion
# helpers, and the schema wrapper classes.
__all__ = (
    "ParquetFormatter",
    "arrow_to_pandas",
    "arrow_to_astropy",
    "arrow_to_numpy",
    "arrow_to_numpy_dict",
    "pandas_to_arrow",
    "pandas_to_astropy",
    "astropy_to_arrow",
    "numpy_to_arrow",
    "numpy_to_astropy",
    "numpy_dict_to_arrow",
    "arrow_schema_to_pandas_index",
    "DataFrameSchema",
    "ArrowAstropySchema",
    "ArrowNumpySchema",
)

41 

42import collections.abc 

43import itertools 

44import json 

45import re 

46from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Sequence, Union, cast 

47 

48import pyarrow as pa 

49import pyarrow.parquet as pq 

50from lsst.daf.butler import Formatter 

51from lsst.utils.introspection import get_full_type_name 

52from lsst.utils.iteration import ensure_iterable 

53 

54if TYPE_CHECKING: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true

55 import astropy.table as atable 

56 import numpy as np 

57 import pandas as pd 

58 

59 

class ParquetFormatter(Formatter):
    """Interface for reading and writing Arrow Table objects to and from
    Parquet files.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        schema = pq.read_schema(self.fileDescriptor.location.path)

        # Parquet files written by third parties may carry no schema-level
        # metadata at all; normalize to an empty dict so the membership
        # tests below never evaluate ``key in None`` (previously a
        # TypeError for the "rowcount" component and the pandas
        # multi-index check).
        metadata = schema.metadata if schema.metadata is not None else {}

        if component in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return schema
        elif component == "rowcount":
            # Get the rowcount from the metadata if possible, otherwise count.
            if b"lsst::arrow::rowcount" in metadata:
                return int(metadata[b"lsst::arrow::rowcount"])

            # Fall back to reading a single column and counting its rows.
            temp_table = pq.read_table(
                self.fileDescriptor.location.path,
                columns=[schema.names[0]],
                use_threads=False,
                use_pandas_metadata=False,
            )

            return len(temp_table[schema.names[0]])

        par_columns = None
        if self.fileDescriptor.parameters:
            par_columns = self.fileDescriptor.parameters.pop("columns", None)
            if par_columns:
                has_pandas_multi_index = False
                if b"pandas" in metadata:
                    md = json.loads(metadata[b"pandas"])
                    if len(md["column_indexes"]) > 1:
                        has_pandas_multi_index = True

                if not has_pandas_multi_index:
                    # Ensure uniqueness, keeping order.
                    par_columns = list(dict.fromkeys(ensure_iterable(par_columns)))
                    # Columns starting with "__" are internal (e.g. index
                    # columns) and are not user-selectable.
                    file_columns = [name for name in schema.names if not name.startswith("__")]

                    for par_column in par_columns:
                        if par_column not in file_columns:
                            raise ValueError(
                                f"Column {par_column} specified in parameters not available in parquet file."
                            )
                else:
                    par_columns = _standardize_multi_index_columns(schema, par_columns)

            if len(self.fileDescriptor.parameters):
                raise ValueError(
                    f"Unsupported parameters {self.fileDescriptor.parameters} in ArrowTable read."
                )

        arrow_table = pq.read_table(
            self.fileDescriptor.location.path,
            columns=par_columns,
            use_threads=False,
            use_pandas_metadata=(b"pandas" in metadata),
        )

        return arrow_table

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        import numpy as np
        from astropy.table import Table as astropyTable

        arrow_table = None
        if isinstance(inMemoryDataset, pa.Table):
            # This will be the most likely match.
            arrow_table = inMemoryDataset
        elif isinstance(inMemoryDataset, astropyTable):
            arrow_table = astropy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, np.ndarray):
            arrow_table = numpy_to_arrow(inMemoryDataset)
        else:
            if hasattr(inMemoryDataset, "to_parquet"):
                # This may be a pandas DataFrame; import lazily so pandas
                # remains an optional dependency for writing.
                try:
                    import pandas as pd
                except ImportError:
                    pd = None

                if pd is not None and isinstance(inMemoryDataset, pd.DataFrame):
                    arrow_table = pandas_to_arrow(inMemoryDataset)

        if arrow_table is None:
            raise ValueError(
                f"Unsupported type {get_full_type_name(inMemoryDataset)} of "
                "inMemoryDataset for ParquetFormatter."
            )

        location = self.makeUpdatedLocation(self.fileDescriptor.location)

        pq.write_table(arrow_table, location.path)

159 

160 

def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:
    """Convert a pyarrow table to a pandas DataFrame.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has ``pandas`` metadata
        in the schema it will be used in the construction of the
        ``DataFrame``.

    Returns
    -------
    dataframe : `pandas.DataFrame`
        Converted pandas dataframe.
    """
    # Single-threaded conversion keeps results deterministic.
    dataframe = arrow_table.to_pandas(use_threads=False)
    return dataframe

177 

178 

def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
    """Convert a pyarrow table to an `astropy.Table`.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has astropy unit
        metadata in the schema it will be used in the construction
        of the ``astropy.Table``.

    Returns
    -------
    table : `astropy.Table`
        Converted astropy table.
    """
    from astropy.table import Table

    # Build the table from plain numpy columns, then layer on any
    # serialized astropy metadata (units, descriptions, formats).
    table = Table(arrow_to_numpy_dict(arrow_table))

    md = arrow_table.schema.metadata
    _apply_astropy_metadata(table, md if md is not None else {})

    return table

203 

204 

def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray:
    """Convert a pyarrow table to a structured numpy array.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table.

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and the same column names
        as the input arrow table.
    """
    import numpy as np

    columns = arrow_to_numpy_dict(arrow_table)

    # Build the structured dtype; multidimensional columns keep their
    # trailing (per-row) shape.
    dtype = []
    for name, col in columns.items():
        shape = col.shape
        if len(shape) <= 1:
            dtype.append((name, col.dtype))
        else:
            dtype.append((name, (col.dtype, shape[1:])))

    return np.rec.fromarrays(columns.values(), dtype=dtype)

233 

234 

def arrow_to_numpy_dict(arrow_table: pa.Table) -> dict[str, np.ndarray]:
    """Convert a pyarrow table to a dict of numpy arrays.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    import numpy as np

    schema = arrow_table.schema
    metadata = {} if schema.metadata is None else schema.metadata
    nrows = len(arrow_table)

    numpy_dict = {}
    for name in schema.names:
        arr = arrow_table[name].to_numpy()
        field_type = schema.field(name).type

        if field_type in (pa.string(), pa.binary()):
            # Convert object arrays to fixed-width string/bytes dtype.
            arr = arr.astype(_arrow_string_to_numpy_dtype(schema, name, arr))
        elif isinstance(field_type, pa.FixedSizeListType):
            if len(arr) > 0:
                arr = np.stack(arr)
            else:
                # This is an empty column, and needs to be coerced to type.
                arr = arr.astype(field_type.value_type.to_pandas_dtype())

            # Restore the original multidimensional shape, if recorded.
            shape = _multidim_shape_from_metadata(metadata, field_type.list_size, name)
            arr = arr.reshape((nrows, *shape))

        numpy_dict[name] = arr

    return numpy_dict

274 

275 

def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
    """Convert a numpy array table to an arrow table.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    dtype = np_array.dtype
    type_list = _numpy_dtype_to_arrow_types(dtype)

    # Record rowcount plus per-field string lengths and per-field
    # multidimensional shapes in the schema metadata.
    metadata = {b"lsst::arrow::rowcount": str(len(np_array))}
    for name in dtype.names:
        _append_numpy_string_metadata(metadata, name, dtype[name])
        _append_numpy_multidim_metadata(metadata, name, dtype[name])

    schema = pa.schema(type_list, metadata=metadata)

    arrays = _numpy_style_arrays_to_arrow_arrays(
        dtype,
        len(np_array),
        np_array,
        schema,
    )

    return pa.Table.from_arrays(arrays, schema=schema)

310 

311 

def numpy_dict_to_arrow(numpy_dict: dict[str, np.ndarray]) -> pa.Table:
    """Convert a dict of numpy arrays to an arrow table.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.

    Raises
    ------
    ValueError if columns in numpy_dict have unequal numbers of rows.
    """
    dtype, rowcount = _numpy_dict_to_dtype(numpy_dict)
    type_list = _numpy_dtype_to_arrow_types(dtype)

    # Record rowcount plus per-field string/shape metadata.
    metadata = {b"lsst::arrow::rowcount": str(rowcount)}
    for name in dtype.names or ():
        _append_numpy_string_metadata(metadata, name, dtype[name])
        _append_numpy_multidim_metadata(metadata, name, dtype[name])

    schema = pa.schema(type_list, metadata=metadata)

    arrays = _numpy_style_arrays_to_arrow_arrays(
        dtype,
        rowcount,
        numpy_dict,
        schema,
    )

    return pa.Table.from_arrays(arrays, schema=schema)

352 

353 

def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
    """Convert an astropy table to an arrow table.

    Parameters
    ----------
    astropy_table : `astropy.Table`
        Input astropy table.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    from astropy.table import meta

    dtype = astropy_table.dtype
    type_list = _numpy_dtype_to_arrow_types(dtype)

    # Record rowcount plus per-field string/shape metadata.
    md = {b"lsst::arrow::rowcount": str(len(astropy_table))}
    for name in dtype.names:
        _append_numpy_string_metadata(md, name, dtype[name])
        _append_numpy_multidim_metadata(md, name, dtype[name])

    # Serialize the astropy column metadata (units, descriptions, ...)
    # as YAML so it can be restored on read.
    md[b"table_meta_yaml"] = "\n".join(meta.get_yaml_from_table(astropy_table))

    schema = pa.schema(type_list, metadata=md)

    arrays = _numpy_style_arrays_to_arrow_arrays(
        dtype,
        len(astropy_table),
        astropy_table,
        schema,
    )

    return pa.Table.from_arrays(arrays, schema=schema)

394 

395 

def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Table:
    """Convert a pandas dataframe to an arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    arrow_table = pa.Table.from_pandas(dataframe)

    # Update the metadata
    md = arrow_table.schema.metadata

    md[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

    # We loop through the arrow table columns because the datatypes have
    # been checked and converted from pandas objects.
    for name in arrow_table.column_names:
        if not name.startswith("__"):
            if arrow_table[name].type == pa.string():
                # Use ``default=`` so that an empty column — or a column
                # whose rows are all null (no row is_valid) — falls back
                # to default_length instead of raising ValueError from
                # max() on an empty sequence.
                strlen = max(
                    (len(row.as_py()) for row in arrow_table[name] if row.is_valid),
                    default=default_length,
                )
                md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

    arrow_table = arrow_table.replace_schema_metadata(md)

    return arrow_table

433 

434 

def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
    """Convert a pandas dataframe to an astropy table, preserving indexes.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        Converted astropy table.
    """
    import pandas as pd
    from astropy.table import Table

    # Multi-index columns have no astropy equivalent.
    if isinstance(dataframe.columns, pd.MultiIndex):
        raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

    astropy_table = Table.from_pandas(dataframe, index=True)
    return astropy_table

455 

456 

def numpy_to_astropy(np_array: np.ndarray) -> atable.Table:
    """Convert a numpy table to an astropy table.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        Converted astropy table.
    """
    from astropy.table import Table

    # copy=False wraps the existing array rather than duplicating it.
    astropy_table = Table(data=np_array, copy=False)
    return astropy_table

473 

474 

def arrow_schema_to_pandas_index(schema: pa.Schema) -> pd.Index | pd.MultiIndex:
    """Convert an arrow schema to a pandas index/multiindex.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    index : `pandas.Index` or `pandas.MultiIndex`
        Converted pandas index.
    """
    import pandas as pd

    # Guard against schemas with no metadata at all (previously
    # ``b"pandas" in None`` raised TypeError).
    metadata = schema.metadata if schema.metadata is not None else {}

    # ``column_indexes`` describes the levels of the (multi-)index of the
    # originating dataframe, when pandas metadata is present.
    indexes: list = []
    if b"pandas" in metadata:
        md = json.loads(metadata[b"pandas"])
        indexes = md["column_indexes"]

    if len(indexes) <= 1:
        # Flat index; skip internal "__"-prefixed columns.
        return pd.Index(name for name in schema.names if not name.startswith("__"))

    # Multi-index: reconstruct the original column-name tuples from their
    # flattened string form.
    raw_columns = _split_multi_index_column_names(len(indexes), schema.names)
    return pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])

502 

503 

def arrow_schema_to_column_list(schema: pa.Schema) -> list[str]:
    """Convert an arrow schema to a list of string column names.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    column_list : `list` [`str`]
        Converted list of column names.
    """
    return list(schema.names)

518 

519 

class DataFrameSchema:
    """Wrapper class for a schema for a pandas DataFrame.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Dataframe to turn into a schema.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        # A zero-row slice keeps all column names and dtypes with no data.
        self._schema = dataframe.loc[[False] * len(dataframe)]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> DataFrameSchema:
        """Convert an arrow schema into a `DataFrameSchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            The pyarrow schema to convert.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        # An empty row list suffices; the previous ``[] * len(schema.names)``
        # always evaluated to ``[]`` anyway.
        empty_table = pa.Table.from_pylist([], schema=schema)

        return cls(empty_table.to_pandas())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        arrow_table = pa.Table.from_pandas(self._schema)

        return arrow_table.schema

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        return ArrowNumpySchema.from_arrow(self.to_arrow_schema())

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an ArrowAstropySchema.

        Returns
        -------
        arrow_astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    @property
    def schema(self) -> pd.DataFrame:
        # Fixed return annotation: this is the empty DataFrame carrying
        # the schema, not a numpy dtype.
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataFrameSchema):
            return NotImplemented

        return self._schema.equals(other._schema)

594 

595 

class ArrowAstropySchema:
    """Wrapper class for a schema for an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Input astropy table.
    """

    def __init__(self, astropy_table: atable.Table) -> None:
        # A zero-row slice keeps column names, dtypes, units and other
        # column metadata without holding any data.
        self._schema = astropy_table[:0]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowAstropySchema:
        """Convert an arrow schema into a ArrowAstropySchema.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Input pyarrow schema.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        import numpy as np
        from astropy.table import Table

        empty = np.zeros(0, dtype=_schema_to_dtype_list(schema))
        astropy_table = Table(data=empty)

        # Restore units/descriptions/formats serialized in the metadata.
        _apply_astropy_metadata(
            astropy_table, schema.metadata if schema.metadata is not None else {}
        )

        return cls(astropy_table)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        return astropy_to_arrow(self._schema).schema

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a DataFrameSchema.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        return DataFrameSchema.from_arrow(astropy_to_arrow(self._schema).schema)

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        return ArrowNumpySchema.from_arrow(astropy_to_arrow(self._schema).schema)

    @property
    def schema(self) -> atable.Table:
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowAstropySchema):
            return NotImplemented

        # Matching dtypes implies matching column names as well.
        if self._schema.dtype != other._schema.dtype:
            return False

        # Column-level metadata must also agree.
        for name in self._schema.columns:
            ours = self._schema[name]
            theirs = other._schema[name]
            if not ours.unit == theirs.unit:
                return False
            if not ours.description == theirs.description:
                return False
            if not ours.format == theirs.format:
                return False

        return True

691 

692 

class ArrowNumpySchema:
    """Wrapper class for a schema for a numpy ndarray.

    Parameters
    ----------
    numpy_dtype : `numpy.dtype`
        Numpy dtype to convert.
    """

    def __init__(self, numpy_dtype: np.dtype) -> None:
        self._dtype = numpy_dtype

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowNumpySchema:
        """Convert an arrow schema into an `ArrowNumpySchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Pyarrow schema to convert.

        Returns
        -------
        numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        import numpy as np

        return cls(np.dtype(_schema_to_dtype_list(schema)))

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a `DataFrameSchema`.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        return DataFrameSchema.from_arrow(self.to_arrow_schema())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to a `pyarrow.Schema`.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        import numpy as np

        # Round-trip a zero-row array through the numpy converter to get
        # the equivalent arrow schema.
        return numpy_to_arrow(np.zeros(0, dtype=self._dtype)).schema

    @property
    def schema(self) -> np.dtype:
        return self._dtype

    def __repr__(self) -> str:
        return repr(self._dtype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowNumpySchema):
            return NotImplemented

        return bool(self._dtype == other._dtype)

776 

777 

778def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequence[str]]: 

779 """Split a string that represents a multi-index column. 

780 

781 PyArrow maps Pandas' multi-index column names (which are tuples in Python) 

782 to flat strings on disk. This routine exists to reconstruct the original 

783 tuple. 

784 

785 Parameters 

786 ---------- 

787 n : `int` 

788 Number of levels in the `pandas.MultiIndex` that is being 

789 reconstructed. 

790 names : `~collections.abc.Iterable` [`str`] 

791 Strings to be split. 

792 

793 Returns 

794 ------- 

795 column_names : `list` [`tuple` [`str`]] 

796 A list of multi-index column name tuples. 

797 """ 

798 column_names: List[Sequence[str]] = [] 

799 

800 pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n))) 

801 for name in names: 

802 m = re.search(pattern, name) 

803 if m is not None: 

804 column_names.append(m.groups()) 

805 

806 return column_names 

807 

808 

def _standardize_multi_index_columns(
    schema: pa.Schema, columns: Union[List[tuple], dict[str, Union[str, List[str]]]]
) -> List[str]:
    """Transform a dictionary/iterable index from a multi-index column list
    into a string directly understandable by PyArrow.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Pyarrow schema.
    columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]
        Columns to standardize.

    Returns
    -------
    names : `list` [`str`]
        Stringified representation of a multi-index column name.

    Raises
    ------
    ValueError
        Raised if ``columns`` is neither a list of tuples nor a mapping,
        if a dict key is not an index level name, or if a requested value
        is not present in the corresponding index level.
    """
    # Reconstruct the pandas MultiIndex so we can validate the request
    # against its level names and level values.
    pd_index = arrow_schema_to_pandas_index(schema)
    index_level_names = tuple(pd_index.names)

    names = []

    if isinstance(columns, list):
        # List form: each entry must already be a full column-name tuple;
        # str(tuple) matches the flattened on-disk column name.
        for requested in columns:
            if not isinstance(requested, tuple):
                raise ValueError(
                    "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                    f"Instead got a {get_full_type_name(requested)}."
                )
            names.append(str(requested))
    else:
        if not isinstance(columns, collections.abc.Mapping):
            raise ValueError(
                "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                f"Instead got a {get_full_type_name(columns)}."
            )
        if not set(index_level_names).issuperset(columns.keys()):
            raise ValueError(
                f"Cannot use dict with keys {set(columns.keys())} "
                f"to select columns from {index_level_names}."
            )
        # Dict form: for each index level, use the requested value(s) if
        # given, otherwise every value of that level; the cartesian
        # product enumerates all matching column tuples.
        factors = [
            ensure_iterable(columns.get(level, pd_index.levels[i]))
            for i, level in enumerate(index_level_names)
        ]
        for requested in itertools.product(*factors):
            for i, value in enumerate(requested):
                if value not in pd_index.levels[i]:
                    raise ValueError(f"Unrecognized value {value!r} for index {index_level_names[i]!r}.")
            names.append(str(requested))

    return names

862 

863 

def _apply_astropy_metadata(astropy_table: atable.Table, metadata: dict) -> None:
    """Apply any astropy metadata from the schema metadata.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Table to apply metadata.
    metadata : `dict` [`bytes`]
        Metadata dict.
    """
    from astropy.table import meta

    meta_yaml = metadata.get(b"table_meta_yaml", None)
    if not meta_yaml:
        # No serialized astropy metadata; nothing to do.
        return

    yaml_lines = meta_yaml.decode("UTF8").split("\n")
    meta_hdr = meta.get_header_from_yaml(yaml_lines)

    # Set description, format, unit, meta from the column
    # metadata that was serialized with the table.
    header_cols = {entry["name"]: entry for entry in meta_hdr["datatype"]}
    for col in astropy_table.columns.values():
        info = header_cols[col.name]
        for attr in ("description", "format", "unit", "meta"):
            if attr in info:
                setattr(col, attr, info[attr])

    # Table-level metadata, if present.
    if "meta" in meta_hdr:
        astropy_table.meta.update(meta_hdr["meta"])

891 

892 

def _arrow_string_to_numpy_dtype(
    schema: pa.Schema, name: str, numpy_column: np.ndarray | None = None, default_length: int = 10
) -> str:
    """Get the numpy dtype string associated with an arrow column.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Arrow table schema.
    name : `str`
        Column name.
    numpy_column : `numpy.ndarray`, optional
        Column to determine numpy string dtype.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    dtype_str : `str`
        Numpy dtype string.
    """
    # Special-case for string and binary columns
    md_name = f"lsst::arrow::len::{name}"
    strlen = default_length
    metadata = schema.metadata if schema.metadata is not None else {}
    if (encoded := md_name.encode("UTF-8")) in metadata:
        # String/bytes length from header.  Read from the None-guarded
        # ``metadata`` local (previously indexed ``schema.metadata``
        # directly, bypassing the guard just established above).
        strlen = int(metadata[encoded])
    elif numpy_column is not None:
        if len(numpy_column) > 0:
            # Infer the width from the widest value actually present.
            strlen = max(len(row) for row in numpy_column)

    # Unicode ("U") for arrow strings, raw bytes ("|S") for binary.
    dtype = f"U{strlen}" if schema.field(name).type == pa.string() else f"|S{strlen}"

    return dtype

929 

930 

931def _append_numpy_string_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None: 

932 """Append numpy string length keys to arrow metadata. 

933 

934 All column types are handled, but the metadata is only modified for 

935 string and byte columns. 

936 

937 Parameters 

938 ---------- 

939 metadata : `dict` [`bytes`, `str`] 

940 Metadata dictionary; modified in place. 

941 name : `str` 

942 Column name. 

943 dtype : `np.dtype` 

944 Numpy dtype. 

945 """ 

946 import numpy as np 

947 

948 if dtype.type is np.str_: 

949 metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(dtype.itemsize // 4) 

950 elif dtype.type is np.bytes_: 

951 metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(dtype.itemsize) 

952 

953 

954def _append_numpy_multidim_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None: 

955 """Append numpy multi-dimensional shapes to arrow metadata. 

956 

957 All column types are handled, but the metadata is only modified for 

958 multi-dimensional columns. 

959 

960 Parameters 

961 ---------- 

962 metadata : `dict` [`bytes`, `str`] 

963 Metadata dictionary; modified in place. 

964 name : `str` 

965 Column name. 

966 dtype : `np.dtype` 

967 Numpy dtype. 

968 """ 

969 if len(dtype.shape) > 1: 

970 metadata[f"lsst::arrow::shape::{name}".encode("UTF-8")] = str(dtype.shape) 

971 

972 

973def _multidim_shape_from_metadata(metadata: dict[bytes, bytes], list_size: int, name: str) -> tuple[int, ...]: 

974 """Retrieve the shape from the metadata, if available. 

975 

976 Parameters 

977 ---------- 

978 metadata : `dict` [`bytes`, `bytes`] 

979 Metadata dictionary. 

980 list_size : `int` 

981 Size of the list datatype. 

982 name : `str` 

983 Column name. 

984 

985 Returns 

986 ------- 

987 shape : `tuple` [`int`] 

988 Shape associated with the column. 

989 

990 Raises 

991 ------ 

992 RuntimeError 

993 Raised if metadata is found but has incorrect format. 

994 """ 

995 md_name = f"lsst::arrow::shape::{name}" 

996 if (encoded := md_name.encode("UTF-8")) in metadata: 

997 groups = re.search(r"\((.*)\)", metadata[encoded].decode("UTF-8")) 

998 if groups is None: 

999 raise RuntimeError("Illegal value found in metadata.") 

1000 shape = tuple(int(x) for x in groups[1].split(",") if x != "") 

1001 else: 

1002 shape = (list_size,) 

1003 

1004 return shape 

1005 

1006 

def _schema_to_dtype_list(schema: pa.Schema) -> list[tuple[str, tuple[Any] | str]]:
    """Convert a pyarrow schema to a numpy dtype.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    dtype_list: `list` [`tuple`]
        A list with name, type pairs.
    """
    metadata = {} if schema.metadata is None else schema.metadata

    dtype: list[Any] = []
    for name in schema.names:
        field_type = schema.field(name).type
        if isinstance(field_type, pa.FixedSizeListType):
            # Fixed-size lists become numpy subarray fields.
            shape = _multidim_shape_from_metadata(metadata, field_type.list_size, name)
            entry: Any = (field_type.value_type.to_pandas_dtype(), shape)
        elif field_type in (pa.string(), pa.binary()):
            # Strings/bytes need an explicit fixed-width numpy dtype.
            entry = _arrow_string_to_numpy_dtype(schema, name)
        else:
            entry = field_type.to_pandas_dtype()
        dtype.append((name, entry))

    return dtype

1034 

1035 

def _numpy_dtype_to_arrow_types(dtype: np.dtype) -> list[Any]:
    """Convert a numpy dtype to a list of arrow types.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype to convert.

    Returns
    -------
    type_list : `list` [`object`]
        Converted list of arrow types.
    """
    from math import prod

    import numpy as np

    if dtype.names is None:
        # Not a structured dtype; nothing to convert.
        return []

    type_list: list[Any] = []
    for name in dtype.names:
        dt = dtype[name]
        arrow_type: Any
        if dt.shape:
            # Subarray fields map to fixed-size arrow lists of the
            # flattened element count.
            element = cast(tuple[np.dtype, tuple[int, ...]], dt.subdtype)[0]
            arrow_type = pa.list_(pa.from_numpy_dtype(element.type), prod(dt.shape))
        else:
            arrow_type = pa.from_numpy_dtype(dt.type)
        type_list.append((name, arrow_type))

    return type_list

1070 

1071 

1072def _numpy_dict_to_dtype(numpy_dict: dict[str, np.ndarray]) -> tuple[np.dtype, int]: 

1073 """Extract equivalent table dtype from dict of numpy arrays. 

1074 

1075 Parameters 

1076 ---------- 

1077 numpy_dict : `dict` [`str`, `numpy.ndarray`] 

1078 Dict with keys as the column names, values as the arrays. 

1079 

1080 Returns 

1081 ------- 

1082 dtype : `numpy.dtype` 

1083 dtype of equivalent table. 

1084 rowcount : `int` 

1085 Number of rows in the table. 

1086 

1087 Raises 

1088 ------ 

1089 ValueError if columns in numpy_dict have unequal numbers of rows. 

1090 """ 

1091 import numpy as np 

1092 

1093 dtype_list = [] 

1094 rowcount = 0 

1095 for name, col in numpy_dict.items(): 

1096 if rowcount == 0: 

1097 rowcount = len(col) 

1098 if len(col) != rowcount: 

1099 raise ValueError(f"Column {name} has a different number of rows.") 

1100 if len(col.shape) == 1: 

1101 dtype_list.append((name, col.dtype)) 

1102 else: 

1103 dtype_list.append((name, (col.dtype, col.shape[1:]))) 

1104 dtype = np.dtype(dtype_list) 

1105 

1106 return (dtype, rowcount) 

1107 

1108 

def _numpy_style_arrays_to_arrow_arrays(
    dtype: np.dtype,
    rowcount: int,
    np_style_arrays: dict[str, np.ndarray] | np.ndarray | atable.Table,
    schema: pa.Schema,
) -> list[pa.Array]:
    """Convert numpy-style arrays to arrow arrays.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype of input table/arrays.
    rowcount : `int`
        Number of rows in input table/arrays.
    np_style_arrays : `dict` [`str`, `np.ndarray`] or `np.ndarray`
        or `astropy.table.Table`
        Arrays to convert to arrow.
    schema : `pyarrow.Schema`
        Schema of arrow table.

    Returns
    -------
    arrow_arrays : `list` [`pyarrow.Array`]
        List of converted pyarrow arrays.
    """
    import numpy as np

    if dtype.names is None:
        # Not a structured dtype: no columns to convert.
        return []

    arrow_arrays: list[pa.Array] = []
    for name in dtype.names:
        dt = dtype[name]
        values: Any
        if len(dt.shape) > 0:
            # Multidimensional column: flatten each row into one
            # fixed-size-list value.
            values = np.split(np_style_arrays[name].ravel(), rowcount) if rowcount > 0 else []
        else:
            values = np_style_arrays[name]
        arrow_arrays.append(pa.array(values, type=schema.field(name).type))

    return arrow_arrays