Coverage for python/lsst/daf/butler/formatters/parquet.py: 13%

388 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-01-06 01:42 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

# Public API of this module: the Parquet formatter plus the standalone
# conversion helpers and the schema wrapper classes.
__all__ = (
    "ParquetFormatter",
    "arrow_to_pandas",
    "arrow_to_astropy",
    "arrow_to_numpy",
    "arrow_to_numpy_dict",
    "pandas_to_arrow",
    "pandas_to_astropy",
    "astropy_to_arrow",
    "numpy_to_arrow",
    "numpy_to_astropy",
    "numpy_dict_to_arrow",
    "arrow_schema_to_pandas_index",
    "DataFrameSchema",
    "ArrowAstropySchema",
    "ArrowNumpySchema",
)

41 

42import collections.abc 

43import itertools 

44import json 

45import re 

46from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Sequence, Union, cast 

47 

48import pyarrow as pa 

49import pyarrow.parquet as pq 

50from lsst.daf.butler import Formatter 

51from lsst.utils.introspection import get_full_type_name 

52from lsst.utils.iteration import ensure_iterable 

53 

54if TYPE_CHECKING: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true

55 import astropy.table as atable 

56 import numpy as np 

57 import pandas as pd 

58 

59 

class ParquetFormatter(Formatter):
    """Interface for reading and writing Arrow Table objects to and from
    Parquet files.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        schema = pq.read_schema(self.fileDescriptor.location.path)

        # pyarrow returns None (not an empty dict) when the schema carries
        # no metadata; normalize once so the membership tests below cannot
        # raise TypeError.
        metadata = schema.metadata if schema.metadata is not None else {}

        if component in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return schema
        elif component == "rowcount":
            # Get the rowcount from the metadata if possible, otherwise count.
            if b"lsst::arrow::rowcount" in metadata:
                return int(metadata[b"lsst::arrow::rowcount"])

            # Fall back to reading a single (the first) column and
            # counting its entries.
            temp_table = pq.read_table(
                self.fileDescriptor.location.path,
                columns=[schema.names[0]],
                use_threads=False,
                use_pandas_metadata=False,
            )

            return len(temp_table[schema.names[0]])

        par_columns = None
        if self.fileDescriptor.parameters:
            par_columns = self.fileDescriptor.parameters.pop("columns", None)
            if par_columns:
                has_pandas_multi_index = False
                if b"pandas" in metadata:
                    md = json.loads(metadata[b"pandas"])
                    if len(md["column_indexes"]) > 1:
                        has_pandas_multi_index = True

                if not has_pandas_multi_index:
                    # Ensure uniqueness, keeping order.
                    par_columns = list(dict.fromkeys(ensure_iterable(par_columns)))
                    # Double-underscore columns are internal (e.g. pandas
                    # index columns) and not user-selectable.
                    file_columns = [name for name in schema.names if not name.startswith("__")]

                    for par_column in par_columns:
                        if par_column not in file_columns:
                            raise ValueError(
                                f"Column {par_column} specified in parameters not available in parquet file."
                            )
                else:
                    par_columns = _standardize_multi_index_columns(schema, par_columns)

            if len(self.fileDescriptor.parameters):
                raise ValueError(
                    f"Unsupported parameters {self.fileDescriptor.parameters} in ArrowTable read."
                )

        arrow_table = pq.read_table(
            self.fileDescriptor.location.path,
            columns=par_columns,
            use_threads=False,
            use_pandas_metadata=(b"pandas" in metadata),
        )

        return arrow_table

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        import numpy as np
        from astropy.table import Table as astropyTable

        arrow_table = None
        if isinstance(inMemoryDataset, pa.Table):
            # This will be the most likely match.
            arrow_table = inMemoryDataset
        elif isinstance(inMemoryDataset, astropyTable):
            arrow_table = astropy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, np.ndarray):
            arrow_table = numpy_to_arrow(inMemoryDataset)
        else:
            if hasattr(inMemoryDataset, "to_parquet"):
                # This may be a pandas DataFrame; import lazily so pandas
                # is only required when such an object is actually passed.
                try:
                    import pandas as pd
                except ImportError:
                    pd = None

                if pd is not None and isinstance(inMemoryDataset, pd.DataFrame):
                    arrow_table = pandas_to_arrow(inMemoryDataset)

        if arrow_table is None:
            raise ValueError(
                f"Unsupported type {get_full_type_name(inMemoryDataset)} of "
                "inMemoryDataset for ParquetFormatter."
            )

        location = self.makeUpdatedLocation(self.fileDescriptor.location)

        pq.write_table(arrow_table, location.path)

159 

160 

def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:
    """Convert a pyarrow table to a pandas DataFrame.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has ``pandas`` metadata
        in the schema it will be used in the construction of the
        ``DataFrame``.

    Returns
    -------
    dataframe : `pandas.DataFrame`
        Converted pandas dataframe.
    """
    # Single-threaded conversion keeps results deterministic.
    dataframe = arrow_table.to_pandas(use_threads=False)
    return dataframe

177 

178 

def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
    """Convert a pyarrow table to an `astropy.Table`.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has astropy unit
        metadata in the schema it will be used in the construction
        of the ``astropy.Table``.

    Returns
    -------
    table : `astropy.Table`
        Converted astropy table.
    """
    from astropy.table import Table

    # Build columns from the numpy representation, then re-apply any
    # serialized astropy column metadata (units, descriptions, formats).
    table = Table(arrow_to_numpy_dict(arrow_table))

    md = arrow_table.schema.metadata
    _apply_astropy_metadata(table, md if md is not None else {})

    return table

203 

204 

def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray:
    """Convert a pyarrow table to a structured numpy array.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table.

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and the same column names
        as the input arrow table.
    """
    import numpy as np

    columns = arrow_to_numpy_dict(arrow_table)

    # Scalar columns map directly; columns with extra dimensions become
    # subarray fields carrying their per-row shape.
    dtype = []
    for name, col in columns.items():
        shape = col.shape
        if len(shape) <= 1:
            dtype.append((name, col.dtype))
        else:
            dtype.append((name, (col.dtype, shape[1:])))

    return np.rec.fromarrays(columns.values(), dtype=dtype)

233 

234 

def arrow_to_numpy_dict(arrow_table: pa.Table) -> dict[str, np.ndarray]:
    """Convert a pyarrow table to a dict of numpy arrays.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    import numpy as np

    schema = arrow_table.schema
    metadata = {} if schema.metadata is None else schema.metadata
    nrows = len(arrow_table)

    numpy_dict = {}

    for name in schema.names:
        arr = arrow_table[name].to_numpy()

        field_type = schema.field(name).type
        if field_type in (pa.string(), pa.binary()):
            # Fixed-width numpy string/bytes dtype instead of object dtype.
            arr = arr.astype(_arrow_string_to_numpy_dtype(schema, name, arr))
        elif isinstance(field_type, pa.FixedSizeListType):
            if len(arr) == 0:
                # Empty column; it still needs to be coerced to the
                # element type.
                arr = arr.astype(field_type.value_type.to_pandas_dtype())
            else:
                arr = np.stack(arr)

            # Restore the original multi-dimensional shape when recorded.
            shape = _multidim_shape_from_metadata(metadata, field_type.list_size, name)
            arr = arr.reshape((nrows, *shape))

        numpy_dict[name] = arr

    return numpy_dict

274 

275 

def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
    """Convert a numpy array table to an arrow table.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    dtype = np_array.dtype
    nrows = len(np_array)

    # Record the row count plus per-column string lengths and
    # multi-dimensional shapes so readers can rebuild the numpy dtype.
    md = {b"lsst::arrow::rowcount": str(nrows)}
    for name in dtype.names:
        _append_numpy_string_metadata(md, name, dtype[name])
        _append_numpy_multidim_metadata(md, name, dtype[name])

    schema = pa.schema(_numpy_dtype_to_arrow_types(dtype), metadata=md)
    arrays = _numpy_style_arrays_to_arrow_arrays(dtype, nrows, np_array, schema)

    return pa.Table.from_arrays(arrays, schema=schema)

310 

311 

def numpy_dict_to_arrow(numpy_dict: dict[str, np.ndarray]) -> pa.Table:
    """Convert a dict of numpy arrays to an arrow table.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.

    Raises
    ------
    ValueError if columns in numpy_dict have unequal numbers of rows.
    """
    dtype, rowcount = _numpy_dict_to_dtype(numpy_dict)

    # Record the row count plus per-column string lengths and
    # multi-dimensional shapes so readers can rebuild the numpy dtype.
    md = {b"lsst::arrow::rowcount": str(rowcount)}
    if dtype.names is not None:
        for name in dtype.names:
            _append_numpy_string_metadata(md, name, dtype[name])
            _append_numpy_multidim_metadata(md, name, dtype[name])

    schema = pa.schema(_numpy_dtype_to_arrow_types(dtype), metadata=md)
    arrays = _numpy_style_arrays_to_arrow_arrays(dtype, rowcount, numpy_dict, schema)

    return pa.Table.from_arrays(arrays, schema=schema)

352 

353 

def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
    """Convert an astropy table to an arrow table.

    Parameters
    ----------
    astropy_table : `astropy.Table`
        Input astropy table.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    from astropy.table import meta

    dtype = astropy_table.dtype
    nrows = len(astropy_table)

    # Record the row count plus per-column string lengths and
    # multi-dimensional shapes so readers can rebuild the numpy dtype.
    md = {b"lsst::arrow::rowcount": str(nrows)}
    for name in dtype.names:
        _append_numpy_string_metadata(md, name, dtype[name])
        _append_numpy_multidim_metadata(md, name, dtype[name])

    # Serialize column descriptions/units/formats and table meta as yaml
    # so the astropy table can be reconstructed losslessly.
    md[b"table_meta_yaml"] = "\n".join(meta.get_yaml_from_table(astropy_table))

    schema = pa.schema(_numpy_dtype_to_arrow_types(dtype), metadata=md)
    arrays = _numpy_style_arrays_to_arrow_arrays(dtype, nrows, astropy_table, schema)

    return pa.Table.from_arrays(arrays, schema=schema)

394 

395 

def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Table:
    """Convert a pandas dataframe to an arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    arrow_table = pa.Table.from_pandas(dataframe)

    # Update the metadata
    md = arrow_table.schema.metadata

    md[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

    # We loop through the arrow table columns because the datatypes have
    # been checked and converted from pandas objects.
    for name in arrow_table.column_names:
        if not name.startswith("__") and arrow_table[name].type == pa.string():
            # Record the maximum string length so readers can build a
            # fixed-width numpy dtype. ``default`` guards against both an
            # empty column and a non-empty column that is entirely null,
            # either of which would otherwise leave ``max()`` with an
            # empty sequence and raise ValueError.
            strlen = max(
                (len(row.as_py()) for row in arrow_table[name] if row.is_valid),
                default=default_length,
            )
            md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

    return arrow_table.replace_schema_metadata(md)

433 

434 

def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
    """Convert a pandas dataframe to an astropy table, preserving indexes.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        Converted astropy table.

    Raises
    ------
    ValueError
        Raised if the dataframe has multi-index columns.
    """
    import pandas as pd
    from astropy.table import Table

    if isinstance(dataframe.columns, pd.MultiIndex):
        raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

    # index=True carries the dataframe index over as a regular column.
    return Table.from_pandas(dataframe, index=True)

455 

456 

def numpy_to_astropy(np_array: np.ndarray) -> atable.Table:
    """Convert a numpy table to an astropy table.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        Converted astropy table.
    """
    from astropy.table import Table

    # copy=False shares the input array's buffer instead of duplicating it.
    astropy_table = Table(data=np_array, copy=False)
    return astropy_table

473 

474 

def arrow_schema_to_pandas_index(schema: pa.Schema) -> pd.Index | pd.MultiIndex:
    """Convert an arrow schema to a pandas index/multiindex.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    index : `pandas.Index` or `pandas.MultiIndex`
        Converted pandas index.
    """
    import pandas as pd

    # schema.metadata is None (not an empty dict) when the schema carries
    # no metadata; guard so the membership test cannot raise TypeError.
    metadata = schema.metadata if schema.metadata is not None else {}

    if b"pandas" in metadata:
        md = json.loads(metadata[b"pandas"])
        indexes = md["column_indexes"]
    else:
        # Defined on both branches so the multi-index path below never
        # sees an unbound name.
        indexes = []

    if len(indexes) <= 1:
        # Simple (or absent) column index: expose the non-internal names.
        return pd.Index(name for name in schema.names if not name.startswith("__"))
    else:
        raw_columns = _split_multi_index_column_names(len(indexes), schema.names)
        return pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])

502 

503 

def arrow_schema_to_column_list(schema: pa.Schema) -> list[str]:
    """Convert an arrow schema to a list of string column names.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    column_list : `list` [`str`]
        Converted list of column names.
    """
    return list(schema.names)

518 

519 

class DataFrameSchema:
    """Wrapper class for a schema for a pandas DataFrame.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Dataframe to turn into a schema.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        # Keep a zero-row slice: it preserves columns, dtypes, and index
        # type without holding any data.
        self._schema = dataframe.loc[[False] * len(dataframe)]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> DataFrameSchema:
        """Convert an arrow schema into a `DataFrameSchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            The pyarrow schema to convert.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        # An empty row list yields a zero-row table that still carries the
        # full schema. (The previous ``[] * len(schema.names)`` was a no-op
        # multiplication: an empty list times anything is an empty list.)
        empty_table = pa.Table.from_pylist([], schema=schema)

        return cls(empty_table.to_pandas())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        return pa.Table.from_pandas(self._schema).schema

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        return ArrowNumpySchema.from_arrow(self.to_arrow_schema())

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        arrow_astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    @property
    def schema(self) -> pd.DataFrame:
        # The stored schema is an empty DataFrame, not a numpy dtype; the
        # previous ``np.dtype`` return annotation was incorrect.
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataFrameSchema):
            return NotImplemented

        # ``DataFrame.equals`` compares columns, dtypes, and index.
        return self._schema.equals(other._schema)

594 

595 

class ArrowAstropySchema:
    """Wrapper class for a schema for an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Input astropy table.
    """

    def __init__(self, astropy_table: atable.Table) -> None:
        # A zero-row slice keeps columns, dtypes, and column attributes
        # without retaining any data.
        self._schema = astropy_table[:0]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowAstropySchema:
        """Convert an arrow schema into a ArrowAstropySchema.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Input pyarrow schema.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        import numpy as np
        from astropy.table import Table

        # Build an empty table with the equivalent numpy dtype, then
        # re-apply any serialized astropy column metadata.
        empty = np.zeros(0, dtype=_schema_to_dtype_list(schema))
        astropy_table = Table(data=empty)

        md = schema.metadata
        _apply_astropy_metadata(astropy_table, md if md is not None else {})

        return cls(astropy_table)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        return astropy_to_arrow(self._schema).schema

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a DataFrameSchema.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        return DataFrameSchema.from_arrow(astropy_to_arrow(self._schema).schema)

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        return ArrowNumpySchema.from_arrow(astropy_to_arrow(self._schema).schema)

    @property
    def schema(self) -> atable.Table:
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowAstropySchema):
            return NotImplemented

        # Matching dtypes imply identical column names and types.
        if self._schema.dtype != other._schema.dtype:
            return False

        # Column attributes (unit/description/format) must also agree.
        return all(
            self._schema[name].unit == other._schema[name].unit
            and self._schema[name].description == other._schema[name].description
            and self._schema[name].format == other._schema[name].format
            for name in self._schema.columns
        )

692 

class ArrowNumpySchema:
    """Wrapper class for a schema for a numpy ndarray.

    Parameters
    ----------
    numpy_dtype : `numpy.dtype`
        Numpy dtype to convert.
    """

    def __init__(self, numpy_dtype: np.dtype) -> None:
        self._dtype = numpy_dtype

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowNumpySchema:
        """Convert an arrow schema into an `ArrowNumpySchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Pyarrow schema to convert.

        Returns
        -------
        numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        import numpy as np

        return cls(np.dtype(_schema_to_dtype_list(schema)))

    def _empty_arrow_schema(self) -> pa.Schema:
        # Round-trip a zero-row structured array through arrow to obtain
        # the equivalent pyarrow schema.
        import numpy as np

        return numpy_to_arrow(np.zeros(0, dtype=self._dtype)).schema

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        return ArrowAstropySchema.from_arrow(self._empty_arrow_schema())

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a `DataFrameSchema`.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        return DataFrameSchema.from_arrow(self._empty_arrow_schema())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to a `pyarrow.Schema`.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        return self._empty_arrow_schema()

    @property
    def schema(self) -> np.dtype:
        return self._dtype

    def __repr__(self) -> str:
        return repr(self._dtype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowNumpySchema):
            return NotImplemented

        return bool(self._dtype == other._dtype)

776 

777 

778def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequence[str]]: 

779 """Split a string that represents a multi-index column. 

780 

781 PyArrow maps Pandas' multi-index column names (which are tuples in Python) 

782 to flat strings on disk. This routine exists to reconstruct the original 

783 tuple. 

784 

785 Parameters 

786 ---------- 

787 n : `int` 

788 Number of levels in the `pandas.MultiIndex` that is being 

789 reconstructed. 

790 names : `~collections.abc.Iterable` [`str`] 

791 Strings to be split. 

792 

793 Returns 

794 ------- 

795 column_names : `list` [`tuple` [`str`]] 

796 A list of multi-index column name tuples. 

797 """ 

798 column_names: List[Sequence[str]] = [] 

799 

800 pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n))) 

801 for name in names: 

802 m = re.search(pattern, name) 

803 if m is not None: 

804 column_names.append(m.groups()) 

805 

806 return column_names 

807 

808 

def _standardize_multi_index_columns(
    schema: pa.Schema, columns: Union[List[tuple], dict[str, Union[str, List[str]]]]
) -> List[str]:
    """Transform a dictionary/iterable index from a multi-index column list
    into a string directly understandable by PyArrow.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Pyarrow schema.
    columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]
        Columns to standardize.

    Returns
    -------
    names : `list` [`str`]
        Stringified representation of a multi-index column name.
    """
    pd_index = arrow_schema_to_pandas_index(schema)
    index_level_names = tuple(pd_index.names)

    names: List[str] = []

    if isinstance(columns, list):
        # A list must contain fully-specified tuples, one per column.
        for requested in columns:
            if not isinstance(requested, tuple):
                raise ValueError(
                    "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                    f"Instead got a {get_full_type_name(requested)}."
                )
            names.append(str(requested))
        return names

    if not isinstance(columns, collections.abc.Mapping):
        raise ValueError(
            "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
            f"Instead got a {get_full_type_name(columns)}."
        )
    if not set(index_level_names).issuperset(columns.keys()):
        raise ValueError(
            f"Cannot use dict with keys {set(columns.keys())} "
            f"to select columns from {index_level_names}."
        )

    # Any level missing from the dict selects every value at that level.
    factors = [
        ensure_iterable(columns.get(level, pd_index.levels[i]))
        for i, level in enumerate(index_level_names)
    ]
    for requested in itertools.product(*factors):
        for i, value in enumerate(requested):
            if value not in pd_index.levels[i]:
                raise ValueError(f"Unrecognized value {value!r} for index {index_level_names[i]!r}.")
        names.append(str(requested))

    return names

862 

863 

def _apply_astropy_metadata(astropy_table: atable.Table, metadata: dict) -> None:
    """Apply any astropy metadata from the schema metadata.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Table to apply metadata.
    metadata : `dict` [`bytes`]
        Metadata dict.
    """
    from astropy.table import meta

    meta_yaml = metadata.get(b"table_meta_yaml", None)
    if not meta_yaml:
        # Nothing serialized with the table; leave it untouched.
        return

    meta_hdr = meta.get_header_from_yaml(meta_yaml.decode("UTF8").split("\n"))

    # Set description, format, unit, meta from the column
    # metadata that was serialized with the table.
    header_cols = {x["name"]: x for x in meta_hdr["datatype"]}
    for col in astropy_table.columns.values():
        col_info = header_cols[col.name]
        for attr in ("description", "format", "unit", "meta"):
            if attr in col_info:
                setattr(col, attr, col_info[attr])

888 

889 

def _arrow_string_to_numpy_dtype(
    schema: pa.Schema, name: str, numpy_column: np.ndarray | None = None, default_length: int = 10
) -> str:
    """Get the numpy dtype string associated with an arrow column.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Arrow table schema.
    name : `str`
        Column name.
    numpy_column : `numpy.ndarray`, optional
        Column to determine numpy string dtype.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    dtype_str : `str`
        Numpy dtype string.
    """
    # Special-case for string and binary columns.
    strlen = default_length
    metadata = {} if schema.metadata is None else schema.metadata
    encoded = f"lsst::arrow::len::{name}".encode("UTF-8")
    if encoded in metadata:
        # String/bytes length recorded in the header takes precedence.
        strlen = int(schema.metadata[encoded])
    elif numpy_column is not None and len(numpy_column) > 0:
        # Otherwise infer the length from the longest entry.
        strlen = max(len(row) for row in numpy_column)

    # Unicode ("U") for arrow strings, raw bytes ("|S") for binary.
    prefix = "U" if schema.field(name).type == pa.string() else "|S"
    return f"{prefix}{strlen}"

926 

927 

928def _append_numpy_string_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None: 

929 """Append numpy string length keys to arrow metadata. 

930 

931 All column types are handled, but the metadata is only modified for 

932 string and byte columns. 

933 

934 Parameters 

935 ---------- 

936 metadata : `dict` [`bytes`, `str`] 

937 Metadata dictionary; modified in place. 

938 name : `str` 

939 Column name. 

940 dtype : `np.dtype` 

941 Numpy dtype. 

942 """ 

943 import numpy as np 

944 

945 if dtype.type is np.str_: 

946 metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(dtype.itemsize // 4) 

947 elif dtype.type is np.bytes_: 

948 metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(dtype.itemsize) 

949 

950 

951def _append_numpy_multidim_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None: 

952 """Append numpy multi-dimensional shapes to arrow metadata. 

953 

954 All column types are handled, but the metadata is only modified for 

955 multi-dimensional columns. 

956 

957 Parameters 

958 ---------- 

959 metadata : `dict` [`bytes`, `str`] 

960 Metadata dictionary; modified in place. 

961 name : `str` 

962 Column name. 

963 dtype : `np.dtype` 

964 Numpy dtype. 

965 """ 

966 if len(dtype.shape) > 1: 

967 metadata[f"lsst::arrow::shape::{name}".encode("UTF-8")] = str(dtype.shape) 

968 

969 

970def _multidim_shape_from_metadata(metadata: dict[bytes, bytes], list_size: int, name: str) -> tuple[int, ...]: 

971 """Retrieve the shape from the metadata, if available. 

972 

973 Parameters 

974 ---------- 

975 metadata : `dict` [`bytes`, `bytes`] 

976 Metadata dictionary. 

977 list_size : `int` 

978 Size of the list datatype. 

979 name : `str` 

980 Column name. 

981 

982 Returns 

983 ------- 

984 shape : `tuple` [`int`] 

985 Shape associated with the column. 

986 

987 Raises 

988 ------ 

989 RuntimeError 

990 Raised if metadata is found but has incorrect format. 

991 """ 

992 md_name = f"lsst::arrow::shape::{name}" 

993 if (encoded := md_name.encode("UTF-8")) in metadata: 

994 groups = re.search(r"\((.*)\)", metadata[encoded].decode("UTF-8")) 

995 if groups is None: 

996 raise RuntimeError("Illegal value found in metadata.") 

997 shape = tuple(int(x) for x in groups[1].split(",") if x != "") 

998 else: 

999 shape = (list_size,) 

1000 

1001 return shape 

1002 

1003 

def _schema_to_dtype_list(schema: pa.Schema) -> list[tuple[str, tuple[Any] | str]]:
    """Convert a pyarrow schema to a numpy dtype.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    dtype_list: `list` [`tuple`]
        A list with name, type pairs.
    """
    metadata = {} if schema.metadata is None else schema.metadata

    dtype: list[Any] = []
    for name in schema.names:
        arrow_type = schema.field(name).type
        if isinstance(arrow_type, pa.FixedSizeListType):
            # Fixed-size lists become numpy subarray columns.
            shape = _multidim_shape_from_metadata(metadata, arrow_type.list_size, name)
            dtype.append((name, (arrow_type.value_type.to_pandas_dtype(), shape)))
        elif arrow_type in (pa.string(), pa.binary()):
            # String/bytes columns need a fixed-width numpy dtype.
            dtype.append((name, _arrow_string_to_numpy_dtype(schema, name)))
        else:
            dtype.append((name, arrow_type.to_pandas_dtype()))

    return dtype

1031 

1032 

def _numpy_dtype_to_arrow_types(dtype: np.dtype) -> list[Any]:
    """Convert a numpy dtype to a list of arrow types.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype to convert.

    Returns
    -------
    type_list : `list` [`object`]
        Converted list of arrow types.
    """
    from math import prod

    import numpy as np

    # A dtype without fields (not a structured array) has no columns.
    if dtype.names is None:
        return []

    type_list: list[Any] = []
    for name in dtype.names:
        dt = dtype[name]
        arrow_type: Any
        if dt.shape:
            # Subarray columns become fixed-size arrow lists holding the
            # flattened element count.
            element = cast(tuple[np.dtype, tuple[int, ...]], dt.subdtype)[0]
            arrow_type = pa.list_(pa.from_numpy_dtype(element.type), prod(dt.shape))
        else:
            arrow_type = pa.from_numpy_dtype(dt.type)
        type_list.append((name, arrow_type))

    return type_list

1067 

1068 

1069def _numpy_dict_to_dtype(numpy_dict: dict[str, np.ndarray]) -> tuple[np.dtype, int]: 

1070 """Extract equivalent table dtype from dict of numpy arrays. 

1071 

1072 Parameters 

1073 ---------- 

1074 numpy_dict : `dict` [`str`, `numpy.ndarray`] 

1075 Dict with keys as the column names, values as the arrays. 

1076 

1077 Returns 

1078 ------- 

1079 dtype : `numpy.dtype` 

1080 dtype of equivalent table. 

1081 rowcount : `int` 

1082 Number of rows in the table. 

1083 

1084 Raises 

1085 ------ 

1086 ValueError if columns in numpy_dict have unequal numbers of rows. 

1087 """ 

1088 import numpy as np 

1089 

1090 dtype_list = [] 

1091 rowcount = 0 

1092 for name, col in numpy_dict.items(): 

1093 if rowcount == 0: 

1094 rowcount = len(col) 

1095 if len(col) != rowcount: 

1096 raise ValueError(f"Column {name} has a different number of rows.") 

1097 if len(col.shape) == 1: 

1098 dtype_list.append((name, col.dtype)) 

1099 else: 

1100 dtype_list.append((name, (col.dtype, col.shape[1:]))) 

1101 dtype = np.dtype(dtype_list) 

1102 

1103 return (dtype, rowcount) 

1104 

1105 

def _numpy_style_arrays_to_arrow_arrays(
    dtype: np.dtype,
    rowcount: int,
    np_style_arrays: dict[str, np.ndarray] | np.ndarray | atable.Table,
    schema: pa.Schema,
) -> list[pa.Array]:
    """Convert numpy-style arrays to arrow arrays.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype of input table/arrays.
    rowcount : `int`
        Number of rows in input table/arrays.
    np_style_arrays : `dict` [`str`, `np.ndarray`] or `np.ndarray`
        or `astropy.table.Table`
        Arrays to convert to arrow.
    schema : `pyarrow.Schema`
        Schema of arrow table.

    Returns
    -------
    arrow_arrays : `list` [`pyarrow.Array`]
        List of converted pyarrow arrays.
    """
    import numpy as np

    # A dtype without fields (not a structured array) has no columns.
    if dtype.names is None:
        return []

    arrow_arrays: list[pa.Array] = []
    for name in dtype.names:
        values: Any
        if len(dtype[name].shape) > 0:
            # Multi-dim columns are flattened into one list per row.
            values = np.split(np_style_arrays[name].ravel(), rowcount) if rowcount > 0 else []
        else:
            values = np_style_arrays[name]
        arrow_arrays.append(pa.array(values, type=schema.field(name).type))

    return arrow_arrays