Coverage for python/lsst/daf/butler/formatters/parquet.py: 14%

317 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-11-09 02:51 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "ParquetFormatter", 

26 "arrow_to_pandas", 

27 "arrow_to_astropy", 

28 "arrow_to_numpy", 

29 "arrow_to_numpy_dict", 

30 "pandas_to_arrow", 

31 "pandas_to_astropy", 

32 "astropy_to_arrow", 

33 "numpy_to_arrow", 

34 "numpy_to_astropy", 

35 "numpy_dict_to_arrow", 

36 "arrow_schema_to_pandas_index", 

37 "DataFrameSchema", 

38 "ArrowAstropySchema", 

39 "ArrowNumpySchema", 

40) 

41 

42import collections.abc 

43import itertools 

44import json 

45import re 

46from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union 

47 

48import pyarrow as pa 

49import pyarrow.parquet as pq 

50from lsst.daf.butler import Formatter 

51from lsst.utils.introspection import get_full_type_name 

52from lsst.utils.iteration import ensure_iterable 

53 

54if TYPE_CHECKING: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true

55 import astropy.table as atable 

56 import numpy as np 

57 import pandas as pd 

58 

59 

class ParquetFormatter(Formatter):
    """Interface for reading and writing Arrow Table objects to and from
    Parquet files.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        # Read only the schema first; several component queries and the
        # column-selection validation can be answered without loading data.
        schema = pq.read_schema(self.fileDescriptor.location.path)

        if component in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return schema
        elif component == "rowcount":
            # Get the rowcount from the metadata if possible, otherwise count.
            # NOTE(review): assumes schema.metadata is not None here; a file
            # written without any schema metadata would raise TypeError —
            # confirm all supported writers attach metadata.
            if b"lsst::arrow::rowcount" in schema.metadata:
                return int(schema.metadata[b"lsst::arrow::rowcount"])

            # Fall back to reading a single column and counting its rows.
            temp_table = pq.read_table(
                self.fileDescriptor.location.path,
                columns=[schema.names[0]],
                use_threads=False,
                use_pandas_metadata=False,
            )

            return len(temp_table[schema.names[0]])

        par_columns = None
        if self.fileDescriptor.parameters:
            par_columns = self.fileDescriptor.parameters.pop("columns", None)
            if par_columns:
                # A dataframe with a pandas multi-index stores flattened
                # column names on disk; selections must be translated.
                has_pandas_multi_index = False
                if b"pandas" in schema.metadata:
                    md = json.loads(schema.metadata[b"pandas"])
                    if len(md["column_indexes"]) > 1:
                        has_pandas_multi_index = True

                if not has_pandas_multi_index:
                    # Ensure uniqueness, keeping order.
                    par_columns = list(dict.fromkeys(ensure_iterable(par_columns)))
                    # Names starting with "__" are internal (e.g. serialized
                    # pandas index columns) and are not user-selectable.
                    file_columns = [name for name in schema.names if not name.startswith("__")]

                    for par_column in par_columns:
                        if par_column not in file_columns:
                            raise ValueError(
                                f"Column {par_column} specified in parameters not available in parquet file."
                            )
                else:
                    par_columns = _standardize_multi_index_columns(schema, par_columns)

            # "columns" was popped above; anything left over is unsupported.
            if len(self.fileDescriptor.parameters):
                raise ValueError(
                    f"Unsupported parameters {self.fileDescriptor.parameters} in ArrowTable read."
                )

        metadata = schema.metadata if schema.metadata is not None else {}
        arrow_table = pq.read_table(
            self.fileDescriptor.location.path,
            columns=par_columns,
            use_threads=False,
            use_pandas_metadata=(b"pandas" in metadata),
        )

        return arrow_table

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        import numpy as np
        from astropy.table import Table as astropyTable

        # Dispatch on the concrete in-memory type to build an arrow table.
        arrow_table = None
        if isinstance(inMemoryDataset, pa.Table):
            # This will be the most likely match.
            arrow_table = inMemoryDataset
        elif isinstance(inMemoryDataset, astropyTable):
            arrow_table = astropy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, np.ndarray):
            arrow_table = numpy_to_arrow(inMemoryDataset)
        else:
            if hasattr(inMemoryDataset, "to_parquet"):
                # This may be a pandas DataFrame; import lazily so pandas
                # remains an optional dependency for this formatter.
                try:
                    import pandas as pd
                except ImportError:
                    pd = None

                if pd is not None and isinstance(inMemoryDataset, pd.DataFrame):
                    arrow_table = pandas_to_arrow(inMemoryDataset)

        if arrow_table is None:
            raise ValueError(
                f"Unsupported type {get_full_type_name(inMemoryDataset)} of "
                "inMemoryDataset for ParquetFormatter."
            )

        location = self.makeUpdatedLocation(self.fileDescriptor.location)

        pq.write_table(arrow_table, location.path)

159 

160 

def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:
    """Convert a pyarrow table to a pandas DataFrame.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has ``pandas`` metadata
        in the schema it will be used in the construction of the
        ``DataFrame``.

    Returns
    -------
    dataframe : `pandas.DataFrame`
    """
    # Single-threaded conversion keeps results deterministic.
    dataframe = arrow_table.to_pandas(use_threads=False)
    return dataframe

176 

177 

def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
    """Convert a pyarrow table to an `astropy.Table`.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has astropy unit
        metadata in the schema it will be used in the construction
        of the ``astropy.Table``.

    Returns
    -------
    table : `astropy.Table`
    """
    from astropy.table import Table

    # Build the table from plain numpy columns, then layer the serialized
    # astropy metadata (units, descriptions, formats) back on top.
    table = Table(arrow_to_numpy_dict(arrow_table))

    md = arrow_table.schema.metadata
    _apply_astropy_metadata(table, md if md is not None else {})

    return table

201 

202 

def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray:
    """Convert a pyarrow table to a structured numpy array.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and the same column names
        as the input arrow table.
    """
    import numpy as np

    numpy_dict = arrow_to_numpy_dict(arrow_table)

    # Assemble the structured dtype from each column's own dtype.
    dtype = [(name, col.dtype) for name, col in numpy_dict.items()]

    return np.rec.fromarrays(numpy_dict.values(), dtype=dtype)

227 

228 

def arrow_to_numpy_dict(arrow_table: pa.Table) -> Dict[str, np.ndarray]:
    """Convert a pyarrow table to a dict of numpy arrays.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    schema = arrow_table.schema

    def _column(name: str) -> np.ndarray:
        # Arrow yields object arrays for string/binary columns; cast those
        # to fixed-width numpy unicode/bytes dtypes.
        values = arrow_table[name].to_numpy()
        if schema.field(name).type in (pa.string(), pa.binary()):
            values = values.astype(_arrow_string_to_numpy_dtype(schema, name, values))
        return values

    return {name: _column(name) for name in schema.names}

254 

255 

def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
    """Convert a numpy array table to an arrow table.

    Parameters
    ----------
    np_array : `numpy.ndarray`

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    names = np_array.dtype.names

    # Record the rowcount plus per-column string widths so readers can
    # reconstruct fixed-width numpy dtypes.
    md = {b"lsst::arrow::rowcount": str(len(np_array))}
    for name in names:
        _append_numpy_string_metadata(md, name, np_array.dtype[name])

    fields = [(name, pa.from_numpy_dtype(np_array.dtype[name].type)) for name in names]
    schema = pa.schema(fields, metadata=md)

    columns = [pa.array(np_array[name]) for name in names]
    return pa.Table.from_arrays(columns, schema=schema)

281 

282 

def numpy_dict_to_arrow(numpy_dict: Dict[str, np.ndarray]) -> pa.Table:
    """Convert a dict of numpy arrays to an arrow table.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.

    Returns
    -------
    arrow_table : `pyarrow.Table`

    Raises
    ------
    ValueError
        Raised if ``numpy_dict`` is empty; no rowcount can be derived.
    """
    # Guard explicitly: the old code raised an obscure IndexError on an
    # empty dict when probing the first column for the rowcount.
    if not numpy_dict:
        raise ValueError("numpy_dict must contain at least one column.")

    type_list = [(name, pa.from_numpy_dtype(col.dtype.type)) for name, col in numpy_dict.items()]

    md = {}
    # All columns share the rowcount; take it from the first column without
    # materializing the whole key list.
    md[b"lsst::arrow::rowcount"] = str(len(next(iter(numpy_dict.values()))))

    # Record fixed string widths for string/bytes columns.
    for name, col in numpy_dict.items():
        _append_numpy_string_metadata(md, name, col.dtype)

    schema = pa.schema(type_list, metadata=md)

    arrays = [pa.array(col) for col in numpy_dict.values()]
    return pa.Table.from_arrays(arrays, schema=schema)

309 

310 

def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
    """Convert an astropy table to an arrow table.

    Parameters
    ----------
    astropy_table : `astropy.Table`

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    from astropy.table import meta

    names = astropy_table.dtype.names

    type_list = [(name, pa.from_numpy_dtype(astropy_table.dtype[name].type)) for name in names]

    md = {b"lsst::arrow::rowcount": str(len(astropy_table))}
    for name, col in astropy_table.columns.items():
        _append_numpy_string_metadata(md, name, col.dtype)

    # Serialize the astropy metadata (units, descriptions, formats) as a
    # single newline-joined yaml document in the schema metadata.
    md[b"table_meta_yaml"] = "\n".join(meta.get_yaml_from_table(astropy_table))

    schema = pa.schema(type_list, metadata=md)

    columns = [pa.array(col) for col in astropy_table.itercols()]
    return pa.Table.from_arrays(columns, schema=schema)

344 

345 

def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Table:
    """Convert a pandas dataframe to an arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    arrow_table = pa.Table.from_pandas(dataframe)

    # Update the metadata with the rowcount and per-column string widths.
    md = arrow_table.schema.metadata
    md[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

    # We loop through the arrow table columns because the datatypes have
    # been checked and converted from pandas objects. Internal "__"-prefixed
    # columns (serialized pandas indexes) are skipped.
    for name in arrow_table.column_names:
        if name.startswith("__"):
            continue
        column = arrow_table[name]
        if column.type != pa.string():
            continue
        if len(column) > 0:
            strlen = max(len(row.as_py()) for row in column if row.is_valid)
        else:
            strlen = default_length
        md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

    return arrow_table.replace_schema_metadata(md)

381 

382 

def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
    """Convert a pandas dataframe to an astropy table, preserving indexes.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`

    Returns
    -------
    astropy_table : `astropy.table.Table`
    """
    import pandas as pd
    from astropy.table import Table

    # Multi-index columns have no astropy equivalent; reject them early.
    if isinstance(dataframe.columns, pd.MultiIndex):
        raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

    # index=True keeps the dataframe index as a regular table column.
    return Table.from_pandas(dataframe, index=True)

401 

402 

def numpy_to_astropy(np_array: np.ndarray) -> atable.Table:
    """Convert a numpy table to an astropy table.

    Parameters
    ----------
    np_array : `numpy.ndarray`

    Returns
    -------
    astropy_table : `astropy.table.Table`
    """
    from astropy.table import Table

    # copy=False wraps the existing array rather than duplicating it.
    astropy_table = Table(data=np_array, copy=False)
    return astropy_table

417 

418 

def arrow_schema_to_pandas_index(schema: pa.Schema) -> pd.Index | pd.MultiIndex:
    """Convert an arrow schema to a pandas index/multiindex.

    Parameters
    ----------
    schema : `pyarrow.Schema`

    Returns
    -------
    index : `pandas.Index` or `pandas.MultiIndex`
    """
    import pandas as pd

    # The number of pandas column-index levels decides whether this is a
    # flat Index or a MultiIndex.
    if b"pandas" in schema.metadata:
        pandas_md = json.loads(schema.metadata[b"pandas"])
        indexes = pandas_md["column_indexes"]
        n_levels = len(indexes)
    else:
        n_levels = 0

    if n_levels <= 1:
        # "__"-prefixed names are serialized pandas index columns, not data.
        return pd.Index(name for name in schema.names if not name.startswith("__"))

    # Reconstruct the original column-name tuples from the flattened
    # on-disk strings.
    raw_columns = _split_multi_index_column_names(n_levels, schema.names)
    return pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])

444 

445 

def arrow_schema_to_column_list(schema: pa.Schema) -> list[str]:
    """Convert an arrow schema to a list of string column names.

    Parameters
    ----------
    schema : `pyarrow.Schema`

    Returns
    -------
    column_list : `list` [`str`]
    """
    # schema.names is already a list of strings; copy it directly instead
    # of the identity comprehension `[name for name in schema.names]`.
    return list(schema.names)

458 

459 

class DataFrameSchema:
    """Wrapper class for a schema for a pandas DataFrame.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Dataframe to turn into a schema.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        # Keep a zero-row slice: it preserves column names, dtypes and
        # index structure without holding any data.
        self._schema = dataframe.loc[[False] * len(dataframe)]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> DataFrameSchema:
        """Convert an arrow schema into a `DataFrameSchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            The pyarrow schema to convert.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        # An empty table with the given schema carries all type
        # information.  (The old `[] * len(schema.names)` was dead
        # arithmetic: multiplying an empty list is always [].)
        empty_table = pa.Table.from_pylist([], schema=schema)

        return cls(empty_table.to_pandas())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        arrow_table = pa.Table.from_pandas(self._schema)

        return arrow_table.schema

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
        """
        return ArrowNumpySchema.from_arrow(self.to_arrow_schema())

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        arrow_astropy_schema : `ArrowAstropySchema`
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    @property
    def schema(self) -> pd.DataFrame:
        # The schema is represented as an empty (zero-row) DataFrame; the
        # previous `-> np.dtype` annotation was incorrect.
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataFrameSchema):
            return NotImplemented

        # DataFrame.equals compares columns, dtypes and index structure.
        return self._schema.equals(other._schema)

530 

531 

class ArrowAstropySchema:
    """Wrapper class for a schema for an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
    """

    def __init__(self, astropy_table: atable.Table) -> None:
        # A zero-row slice keeps column names, dtypes, units and other
        # column metadata without holding any data.
        self._schema = astropy_table[:0]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowAstropySchema:
        """Convert an arrow schema into a ArrowAstropySchema.

        Parameters
        ----------
        schema : `pyarrow.Schema`

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
        """
        import numpy as np
        from astropy.table import Table

        # Build a numpy dtype per column; string/binary columns need an
        # explicit fixed width recovered from the schema metadata.
        dtype = []
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                dtype.append(schema.field(name).type.to_pandas_dtype())
                continue

            dtype.append(_arrow_string_to_numpy_dtype(schema, name))

        # A zero-length structured array is enough to carry the full dtype.
        data = np.zeros(0, dtype=list(zip(schema.names, dtype)))

        astropy_table = Table(data=data)

        metadata = schema.metadata if schema.metadata is not None else {}

        # Restore units/descriptions/formats serialized as yaml metadata.
        _apply_astropy_metadata(astropy_table, metadata)

        return cls(astropy_table)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        return astropy_to_arrow(self._schema).schema

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a DataFrameSchema.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        return DataFrameSchema.from_arrow(astropy_to_arrow(self._schema).schema)

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
        """
        return ArrowNumpySchema.from_arrow(astropy_to_arrow(self._schema).schema)

    @property
    def schema(self) -> atable.Table:
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowAstropySchema):
            return NotImplemented

        # If this comparison passes then the two tables have the
        # same column names.
        if self._schema.dtype != other._schema.dtype:
            return False

        # Column metadata must also agree for the schemas to be equal.
        for name in self._schema.columns:
            if not self._schema[name].unit == other._schema[name].unit:
                return False
            if not self._schema[name].description == other._schema[name].description:
                return False
            if not self._schema[name].format == other._schema[name].format:
                return False

        return True

628 

629 

class ArrowNumpySchema:
    """Wrapper class for a schema for a numpy ndarray.

    Parameters
    ----------
    numpy_dtype : `numpy.dtype`
        Numpy dtype to convert.
    """

    def __init__(self, numpy_dtype: np.dtype) -> None:
        self._dtype = numpy_dtype

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowNumpySchema:
        """Convert an arrow schema into an `ArrowNumpySchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Pyarrow schema to convert.

        Returns
        -------
        numpy_schema : `ArrowNumpySchema`
        """
        import numpy as np

        fields = []
        for name in schema.names:
            arrow_type = schema.field(name).type
            if arrow_type in (pa.string(), pa.binary()):
                # String/binary columns need an explicit fixed width taken
                # from the schema metadata.
                fields.append((name, _arrow_string_to_numpy_dtype(schema, name)))
            else:
                fields.append((name, arrow_type.to_pandas_dtype()))

        return cls(np.dtype(fields))

    def _empty_arrow_schema(self) -> pa.Schema:
        # A zero-length array is enough to carry the full dtype across to
        # the arrow side; shared by all of the conversion methods below.
        import numpy as np

        return numpy_to_arrow(np.zeros(0, dtype=self._dtype)).schema

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
        """
        return ArrowAstropySchema.from_arrow(self._empty_arrow_schema())

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a `DataFrameSchema`.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        return DataFrameSchema.from_arrow(self._empty_arrow_schema())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to a `pyarrow.Schema`.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        return self._empty_arrow_schema()

    @property
    def schema(self) -> np.dtype:
        return self._dtype

    def __repr__(self) -> str:
        return repr(self._dtype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowNumpySchema):
            return NotImplemented

        return self._dtype == other._dtype

715 

716 

def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequence[str]]:
    """Split a string that represents a multi-index column.

    PyArrow maps Pandas' multi-index column names (which are tuples in
    Python) to flat strings on disk. This routine exists to reconstruct the
    original tuple.

    Parameters
    ----------
    n : `int`
        Number of levels in the `pandas.MultiIndex` that is being
        reconstructed.
    names : `~collections.abc.Iterable` [`str`]
        Strings to be split.

    Returns
    -------
    column_names : `list` [`tuple` [`str`]]
        A list of multi-index column name tuples.
    """
    # On-disk names look like "('lvl0', 'lvl1', ...)" with n quoted parts;
    # names that do not match (e.g. serialized index columns) are dropped.
    pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n)))

    matches = (pattern.search(name) for name in names)
    return [m.groups() for m in matches if m is not None]

746 

747 

def _standardize_multi_index_columns(
    schema: pa.Schema, columns: Union[List[tuple], Dict[str, Union[str, List[str]]]]
) -> List[str]:
    """Transform a dictionary/iterable index from a multi-index column list
    into a string directly understandable by PyArrow.

    Parameters
    ----------
    schema : `pyarrow.Schema`
    columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]

    Returns
    -------
    names : `list` [`str`]
        Stringified representation of a multi-index column name.

    Raises
    ------
    ValueError
        Raised if ``columns`` is neither a list of tuples nor a mapping,
        or if a requested key/value does not exist in the index.
    """
    # Reconstruct the pandas multi-index so requested columns can be
    # validated against the actual index levels.
    pd_index = arrow_schema_to_pandas_index(schema)
    index_level_names = tuple(pd_index.names)

    names = []

    if isinstance(columns, list):
        # A list must contain fully-specified tuples, one per column;
        # str(tuple) matches the flattened on-disk column name.
        for requested in columns:
            if not isinstance(requested, tuple):
                raise ValueError(
                    "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                    f"Instead got a {get_full_type_name(requested)}."
                )
            names.append(str(requested))
    else:
        if not isinstance(columns, collections.abc.Mapping):
            raise ValueError(
                "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                f"Instead got a {get_full_type_name(columns)}."
            )
        if not set(index_level_names).issuperset(columns.keys()):
            raise ValueError(
                f"Cannot use dict with keys {set(columns.keys())} "
                f"to select columns from {index_level_names}."
            )
        # A dict may constrain only some levels; unconstrained levels
        # expand to all of their values, and the cartesian product of the
        # per-level selections gives the requested columns.
        factors = [
            ensure_iterable(columns.get(level, pd_index.levels[i]))
            for i, level in enumerate(index_level_names)
        ]
        for requested in itertools.product(*factors):
            for i, value in enumerate(requested):
                if value not in pd_index.levels[i]:
                    raise ValueError(f"Unrecognized value {value!r} for index {index_level_names[i]!r}.")
            names.append(str(requested))

    return names

799 

800 

def _apply_astropy_metadata(astropy_table: atable.Table, metadata: Dict) -> None:
    """Apply any astropy metadata from the schema metadata.

    The table is modified in place; nothing is done when no
    ``table_meta_yaml`` key is present.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Table to apply metadata.
    metadata : `dict` [`bytes`]
        Metadata dict.
    """
    from astropy.table import meta

    meta_yaml = metadata.get(b"table_meta_yaml", None)
    if meta_yaml:
        # The yaml was written by astropy_to_arrow as newline-joined bytes;
        # get_header_from_yaml expects a list of lines.
        meta_yaml = meta_yaml.decode("UTF8").split("\n")
        meta_hdr = meta.get_header_from_yaml(meta_yaml)

        # Set description, format, unit, meta from the column
        # metadata that was serialized with the table.
        header_cols = {x["name"]: x for x in meta_hdr["datatype"]}
        for col in astropy_table.columns.values():
            for attr in ("description", "format", "unit", "meta"):
                if attr in header_cols[col.name]:
                    setattr(col, attr, header_cols[col.name][attr])

825 

826 

def _arrow_string_to_numpy_dtype(
    schema: pa.Schema, name: str, numpy_column: np.ndarray | None = None, default_length: int = 10
) -> str:
    """Get the numpy dtype string associated with an arrow column.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Arrow table schema.
    name : `str`
        Column name.
    numpy_column : `numpy.ndarray`, optional
        Column to determine numpy string dtype.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    dtype_str : `str`
        Numpy dtype string.
    """
    # Special-case for string and binary columns.
    metadata = schema.metadata if schema.metadata is not None else {}
    md_key = f"lsst::arrow::len::{name}".encode("UTF-8")

    strlen = default_length
    if md_key in metadata:
        # An explicit length recorded at write time takes precedence.
        strlen = int(schema.metadata[md_key])
    elif numpy_column is not None and len(numpy_column) > 0:
        # Otherwise infer the width from the widest entry in the column.
        strlen = max(len(row) for row in numpy_column)

    # Unicode for arrow strings, fixed bytes for arrow binary.
    if schema.field(name).type == pa.string():
        return f"U{strlen}"
    return f"|S{strlen}"

863 

864 

def _append_numpy_string_metadata(metadata: Dict[bytes, str], name: str, dtype: np.dtype) -> None:
    """Append numpy string length keys to arrow metadata.

    Any column dtype may be passed in, but the metadata is only modified
    for string and byte columns.

    Parameters
    ----------
    metadata : `dict` [`bytes`, `str`]
        Metadata dictionary; modified in place.
    name : `str`
        Column name.
    dtype : `np.dtype`
        Numpy dtype.
    """
    import numpy as np

    key = f"lsst::arrow::len::{name}".encode("UTF-8")

    if dtype.type is np.str_:
        # Numpy unicode stores 4 bytes per character.
        metadata[key] = str(dtype.itemsize // 4)
    elif dtype.type is np.bytes_:
        metadata[key] = str(dtype.itemsize)