Coverage for python/lsst/daf/butler/formatters/parquet.py: 14%

319 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-10-28 09:59 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "ParquetFormatter", 

26 "arrow_to_pandas", 

27 "arrow_to_astropy", 

28 "arrow_to_numpy", 

29 "arrow_to_numpy_dict", 

30 "pandas_to_arrow", 

31 "pandas_to_astropy", 

32 "astropy_to_arrow", 

33 "numpy_to_arrow", 

34 "numpy_to_astropy", 

35 "numpy_dict_to_arrow", 

36 "arrow_schema_to_pandas_index", 

37 "DataFrameSchema", 

38 "ArrowAstropySchema", 

39 "ArrowNumpySchema", 

40) 

41 

42import collections.abc 

43import itertools 

44import json 

45import re 

46from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union 

47 

48import pyarrow as pa 

49import pyarrow.parquet as pq 

50from lsst.daf.butler import Formatter 

51from lsst.utils.introspection import get_full_type_name 

52from lsst.utils.iteration import ensure_iterable 

53 

54if TYPE_CHECKING: 

55 import astropy.table as atable 

56 import numpy as np 

57 import pandas as pd 

58 

59 

class ParquetFormatter(Formatter):
    """Interface for reading and writing Arrow Table objects to and from
    Parquet files.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        schema = pq.read_schema(self.fileDescriptor.location.path)

        # Parquet files written by third-party tools may carry no schema
        # metadata at all; normalize to an empty dict once so every
        # membership test below is safe.  (The original code only guarded
        # the final read_table call, so the rowcount/pandas checks could
        # raise TypeError on a metadata-less file.)
        metadata = schema.metadata if schema.metadata is not None else {}

        if component in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return schema
        elif component == "rowcount":
            # Get the rowcount from the metadata if possible, otherwise count.
            if b"lsst::arrow::rowcount" in metadata:
                return int(metadata[b"lsst::arrow::rowcount"])

            # Fall back to reading a single column and counting its rows.
            temp_table = pq.read_table(
                self.fileDescriptor.location.path,
                columns=[schema.names[0]],
                use_threads=False,
                use_pandas_metadata=False,
            )

            return len(temp_table[schema.names[0]])

        par_columns = None
        if self.fileDescriptor.parameters:
            par_columns = self.fileDescriptor.parameters.pop("columns", None)
            if par_columns:
                has_pandas_multi_index = False
                if b"pandas" in metadata:
                    md = json.loads(metadata[b"pandas"])
                    if len(md["column_indexes"]) > 1:
                        has_pandas_multi_index = True

                if not has_pandas_multi_index:
                    # Ensure uniqueness, keeping order.
                    par_columns = list(dict.fromkeys(ensure_iterable(par_columns)))
                    file_columns = [name for name in schema.names if not name.startswith("__")]

                    for par_column in par_columns:
                        if par_column not in file_columns:
                            raise ValueError(
                                f"Column {par_column} specified in parameters not available in parquet file."
                            )
                else:
                    par_columns = _standardize_multi_index_columns(schema, par_columns)

            if len(self.fileDescriptor.parameters):
                raise ValueError(
                    f"Unsupported parameters {self.fileDescriptor.parameters} in ArrowTable read."
                )

        arrow_table = pq.read_table(
            self.fileDescriptor.location.path,
            columns=par_columns,
            use_threads=False,
            use_pandas_metadata=(b"pandas" in metadata),
        )

        return arrow_table

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        import numpy as np
        from astropy.table import Table as astropyTable

        arrow_table = None
        if isinstance(inMemoryDataset, pa.Table):
            # This will be the most likely match.
            arrow_table = inMemoryDataset
        elif isinstance(inMemoryDataset, astropyTable):
            arrow_table = astropy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, np.ndarray):
            arrow_table = numpy_to_arrow(inMemoryDataset)
        else:
            if hasattr(inMemoryDataset, "to_parquet"):
                # This may be a pandas DataFrame; import lazily since
                # pandas is an optional dependency here.
                try:
                    import pandas as pd
                except ImportError:
                    pd = None

                if pd is not None and isinstance(inMemoryDataset, pd.DataFrame):
                    arrow_table = pandas_to_arrow(inMemoryDataset)

        if arrow_table is None:
            raise ValueError(
                f"Unsupported type {get_full_type_name(inMemoryDataset)} of "
                "inMemoryDataset for ParquetFormatter."
            )

        location = self.makeUpdatedLocation(self.fileDescriptor.location)

        pq.write_table(arrow_table, location.path)

159 

160 

def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:
    """Convert a pyarrow table to a pandas DataFrame.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has ``pandas`` metadata
        in the schema it will be used in the construction of the
        ``DataFrame``.

    Returns
    -------
    dataframe : `pandas.DataFrame`
    """
    # Single-threaded conversion keeps results deterministic.
    dataframe = arrow_table.to_pandas(use_threads=False)
    return dataframe

176 

177 

def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
    """Convert a pyarrow table to an `astropy.Table`.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has astropy unit
        metadata in the schema it will be used in the construction
        of the ``astropy.Table``.

    Returns
    -------
    table : `astropy.Table`
    """
    from astropy.table import Table

    # Build the column data first, then layer any serialized astropy
    # metadata (units, descriptions, formats) on top.
    table = Table(arrow_to_numpy_dict(arrow_table))

    md = arrow_table.schema.metadata
    _apply_astropy_metadata(table, md if md is not None else {})

    return table

201 

202 

def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray:
    """Convert a pyarrow table to a structured numpy array.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and the same column names
        as the input arrow table.
    """
    import numpy as np

    numpy_dict = arrow_to_numpy_dict(arrow_table)

    # Structured dtype: one field per column, preserving column order.
    dtype = [(name, column.dtype) for name, column in numpy_dict.items()]

    return np.rec.fromarrays(numpy_dict.values(), dtype=dtype)

227 

228 

def arrow_to_numpy_dict(arrow_table: pa.Table) -> Dict[str, np.ndarray]:
    """Convert a pyarrow table to a dict of numpy arrays.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    schema = arrow_table.schema
    numpy_dict = {}

    for name in schema.names:
        column = arrow_table[name].to_numpy()

        if schema.field(name).type in (pa.string(), pa.binary()):
            # String/bytes columns come back as object arrays; cast to a
            # fixed-width dtype recovered from the schema metadata.
            column = column.astype(_arrow_string_to_numpy_dtype(schema, name, column))

        numpy_dict[name] = column

    return numpy_dict

254 

255 

def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
    """Convert a numpy array table to an arrow table.

    Parameters
    ----------
    np_array : `numpy.ndarray`

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    names = np_array.dtype.names

    type_list = [(name, pa.from_numpy_dtype(np_array.dtype[name].type)) for name in names]

    # Row count plus per-column string lengths go into the schema metadata.
    md = {b"lsst::arrow::rowcount": str(len(np_array))}
    for name in names:
        _append_numpy_string_metadata(md, name, np_array.dtype[name])

    schema = pa.schema(type_list, metadata=md)

    columns = [pa.array(np_array[name]) for name in names]
    return pa.Table.from_arrays(columns, schema=schema)

281 

282 

def numpy_dict_to_arrow(numpy_dict: Dict[str, np.ndarray]) -> pa.Table:
    """Convert a dict of numpy arrays to an arrow table.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
        Must contain at least one column; all columns are assumed to
        have the same length.

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    type_list = [(name, pa.from_numpy_dtype(col.dtype.type)) for name, col in numpy_dict.items()]

    md = {}
    # All columns share the same length; take the first value directly
    # instead of materializing an intermediate list of keys.
    md[b"lsst::arrow::rowcount"] = str(len(next(iter(numpy_dict.values()))))

    for name, col in numpy_dict.items():
        _append_numpy_string_metadata(md, name, col.dtype)

    schema = pa.schema(type_list, metadata=md)

    arrays = [pa.array(col) for col in numpy_dict.values()]
    arrow_table = pa.Table.from_arrays(arrays, schema=schema)

    return arrow_table

309 

310 

def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
    """Convert an astropy table to an arrow table.

    Parameters
    ----------
    astropy_table : `astropy.Table`

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    from astropy.table import meta

    dtype = astropy_table.dtype
    type_list = [(name, pa.from_numpy_dtype(dtype[name].type)) for name in dtype.names]

    md = {b"lsst::arrow::rowcount": str(len(astropy_table))}

    for name, col in astropy_table.columns.items():
        _append_numpy_string_metadata(md, name, col.dtype)

    # Serialize the full astropy table/column metadata (units, formats,
    # descriptions, ...) as yaml so it round-trips through parquet.
    md[b"table_meta_yaml"] = "\n".join(meta.get_yaml_from_table(astropy_table))

    schema = pa.schema(type_list, metadata=md)

    arrays = [pa.array(col) for col in astropy_table.itercols()]
    return pa.Table.from_arrays(arrays, schema=schema)

344 

345 

def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Table:
    """Convert a pandas dataframe to an arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    import numpy as np
    import pandas as pd

    arrow_table = pa.Table.from_pandas(dataframe)

    # Augment the metadata that from_pandas produced.
    md = arrow_table.schema.metadata
    md[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

    if not isinstance(dataframe.columns, pd.MultiIndex):
        for name in dataframe.columns:
            column = dataframe[name]
            if column.dtype.type is not np.object_:
                continue
            # Record the maximum string length for object (string) columns
            # so fixed-width numpy dtypes can be reconstructed on read.
            values = column.values
            strlen = max(len(row) for row in values) if len(values) > 0 else default_length
            md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

    return arrow_table.replace_schema_metadata(md)

382 

383 

def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
    """Convert a pandas dataframe to an astropy table, preserving indexes.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`

    Returns
    -------
    astropy_table : `astropy.table.Table`
    """
    import pandas as pd
    from astropy.table import Table

    columns = dataframe.columns
    if isinstance(columns, pd.MultiIndex):
        raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

    # index=True carries the dataframe index over as a regular column.
    return Table.from_pandas(dataframe, index=True)

402 

403 

def numpy_to_astropy(np_array: np.ndarray) -> atable.Table:
    """Convert a numpy table to an astropy table.

    Parameters
    ----------
    np_array : `numpy.ndarray`

    Returns
    -------
    astropy_table : `astropy.table.Table`
    """
    from astropy.table import Table

    # copy=False wraps the existing array rather than duplicating it.
    astropy_table = Table(data=np_array, copy=False)
    return astropy_table

418 

419 

def arrow_schema_to_pandas_index(schema: pa.Schema) -> pd.Index | pd.MultiIndex:
    """Convert an arrow schema to a pandas index/multiindex.

    Parameters
    ----------
    schema : `pyarrow.Schema`

    Returns
    -------
    index : `pandas.Index` or `pandas.MultiIndex`
    """
    import pandas as pd

    # Schemas from files written by other tools may carry no metadata at
    # all; the previous direct ``in schema.metadata`` test raised
    # TypeError in that case.
    metadata = schema.metadata if schema.metadata is not None else {}

    if b"pandas" in metadata:
        md = json.loads(metadata[b"pandas"])
        indexes = md["column_indexes"]
        len_indexes = len(indexes)
    else:
        len_indexes = 0

    if len_indexes <= 1:
        # Flat column index; internal "__*" columns (e.g. serialized
        # pandas index levels) are excluded.
        return pd.Index(name for name in schema.names if not name.startswith("__"))
    else:
        raw_columns = _split_multi_index_column_names(len_indexes, schema.names)
        return pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])

445 

446 

def arrow_schema_to_column_list(schema: pa.Schema) -> list[str]:
    """Convert an arrow schema to a list of string column names.

    Parameters
    ----------
    schema : `pyarrow.Schema`

    Returns
    -------
    column_list : `list` [`str`]
    """
    # A direct list() copy; the identity comprehension was unnecessary.
    return list(schema.names)

459 

460 

class DataFrameSchema:
    """Wrapper class for a schema for a pandas DataFrame.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Dataframe to turn into a schema.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        # An all-False boolean mask yields a zero-row dataframe that
        # still carries the column names, dtypes, and index structure.
        self._schema = dataframe.loc[[False] * len(dataframe)]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> DataFrameSchema:
        """Convert an arrow schema into a `DataFrameSchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            The pyarrow schema to convert.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        # A zero-row table with an explicit schema carries all the type
        # information.  (The previous ``[] * len(schema.names)`` was an
        # obfuscated way of writing an empty list.)
        empty_table = pa.Table.from_pylist([], schema=schema)

        return cls(empty_table.to_pandas())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        arrow_table = pa.Table.from_pandas(self._schema)

        return arrow_table.schema

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
        """
        return ArrowNumpySchema.from_arrow(self.to_arrow_schema())

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an ArrowAstropySchema.

        Returns
        -------
        arrow_astropy_schema : `ArrowAstropySchema`
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    @property
    def schema(self) -> pd.DataFrame:
        # The stored schema is an empty dataframe; the previous
        # ``np.dtype`` return annotation was incorrect.
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataFrameSchema):
            return NotImplemented

        # DataFrame.equals compares both structure and (empty) contents.
        return self._schema.equals(other._schema)

531 

532 

class ArrowAstropySchema:
    """Wrapper class for a schema for an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
    """

    def __init__(self, astropy_table: atable.Table) -> None:
        # A zero-row slice retains column definitions and metadata.
        self._schema = astropy_table[:0]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowAstropySchema:
        """Convert an arrow schema into a ArrowAstropySchema.

        Parameters
        ----------
        schema : `pyarrow.Schema`

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
        """
        import numpy as np
        from astropy.table import Table

        # String/binary columns need fixed-width numpy dtypes recovered
        # from the metadata; everything else maps directly.
        dtype = [
            _arrow_string_to_numpy_dtype(schema, name)
            if schema.field(name).type in (pa.string(), pa.binary())
            else schema.field(name).type.to_pandas_dtype()
            for name in schema.names
        ]

        data = np.zeros(0, dtype=list(zip(schema.names, dtype)))
        astropy_table = Table(data=data)

        _apply_astropy_metadata(
            astropy_table,
            schema.metadata if schema.metadata is not None else {},
        )

        return cls(astropy_table)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        return astropy_to_arrow(self._schema).schema

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a DataFrameSchema.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        arrow_schema = astropy_to_arrow(self._schema).schema
        return DataFrameSchema.from_arrow(arrow_schema)

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
        """
        arrow_schema = astropy_to_arrow(self._schema).schema
        return ArrowNumpySchema.from_arrow(arrow_schema)

    @property
    def schema(self) -> atable.Table:
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowAstropySchema):
            return NotImplemented

        # Equal dtypes implies the two tables have the same column names.
        if self._schema.dtype != other._schema.dtype:
            return False

        # Column-level metadata must match as well.
        for name in self._schema.columns:
            for attr in ("unit", "description", "format"):
                if getattr(self._schema[name], attr) != getattr(other._schema[name], attr):
                    return False

        return True

629 

630 

class ArrowNumpySchema:
    """Wrapper class for a schema for a numpy ndarray.

    Parameters
    ----------
    numpy_dtype : `numpy.dtype`
        Numpy dtype to convert.
    """

    def __init__(self, numpy_dtype: np.dtype) -> None:
        self._dtype = numpy_dtype

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowNumpySchema:
        """Convert an arrow schema into an `ArrowNumpySchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Pyarrow schema to convert.

        Returns
        -------
        numpy_schema : `ArrowNumpySchema`
        """
        import numpy as np

        dtype = []
        for name in schema.names:
            field_type = schema.field(name).type
            if field_type in (pa.string(), pa.binary()):
                # Fixed-width string dtype recovered from metadata.
                dtype.append((name, _arrow_string_to_numpy_dtype(schema, name)))
            else:
                dtype.append((name, field_type.to_pandas_dtype()))

        return cls(np.dtype(dtype))

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return ArrowAstropySchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a `DataFrameSchema`.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return DataFrameSchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to a `pyarrow.Schema`.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return numpy_to_arrow(empty).schema

    @property
    def schema(self) -> np.dtype:
        return self._dtype

    def __repr__(self) -> str:
        return repr(self._dtype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowNumpySchema):
            return NotImplemented

        return self._dtype == other._dtype

716 

717 

def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequence[str]]:
    """Split a string that represents a multi-index column.

    PyArrow maps Pandas' multi-index column names (which are tuples in Python)
    to flat strings on disk. This routine exists to reconstruct the original
    tuple.

    Parameters
    ----------
    n : `int`
        Number of levels in the `pandas.MultiIndex` that is being
        reconstructed.
    names : `~collections.abc.Iterable` [`str`]
        Strings to be split.

    Returns
    -------
    column_names : `list` [`tuple` [`str`]]
        A list of multi-index column name tuples.
    """
    # One quoted capture group per index level, e.g. "('a', 'b')" for n=2;
    # names that do not match the pattern are silently skipped.
    pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n)))

    column_names: List[Sequence[str]] = []
    for name in names:
        match = pattern.search(name)
        if match is not None:
            column_names.append(match.groups())

    return column_names

747 

748 

def _standardize_multi_index_columns(
    schema: pa.Schema, columns: Union[List[tuple], Dict[str, Union[str, List[str]]]]
) -> List[str]:
    """Transform a dictionary/iterable index from a multi-index column list
    into a string directly understandable by PyArrow.

    Parameters
    ----------
    schema : `pyarrow.Schema`
    columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]

    Returns
    -------
    names : `list` [`str`]
        Stringified representation of a multi-index column name.
    """
    pd_index = arrow_schema_to_pandas_index(schema)
    index_level_names = tuple(pd_index.names)

    names = []

    if isinstance(columns, list):
        # A list must contain full tuples naming every index level.
        for requested in columns:
            if not isinstance(requested, tuple):
                raise ValueError(
                    "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                    f"Instead got a {get_full_type_name(requested)}."
                )
            names.append(str(requested))
        return names

    if not isinstance(columns, collections.abc.Mapping):
        raise ValueError(
            "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
            f"Instead got a {get_full_type_name(columns)}."
        )
    if not set(index_level_names).issuperset(columns.keys()):
        raise ValueError(
            f"Cannot use dict with keys {set(columns.keys())} "
            f"to select columns from {index_level_names}."
        )

    # Any level missing from the dict selects every value at that level;
    # the cartesian product enumerates all requested combinations.
    factors = [
        ensure_iterable(columns.get(level, pd_index.levels[i]))
        for i, level in enumerate(index_level_names)
    ]
    for requested in itertools.product(*factors):
        for i, value in enumerate(requested):
            if value not in pd_index.levels[i]:
                raise ValueError(f"Unrecognized value {value!r} for index {index_level_names[i]!r}.")
        names.append(str(requested))

    return names

800 

801 

def _apply_astropy_metadata(astropy_table: atable.Table, metadata: Dict) -> None:
    """Apply any astropy metadata from the schema metadata.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Table to apply metadata; modified in place.
    metadata : `dict` [`bytes`]
        Metadata dict.
    """
    from astropy.table import meta

    meta_yaml = metadata.get(b"table_meta_yaml", None)
    if not meta_yaml:
        # Nothing serialized with the table; leave it untouched.
        return

    meta_hdr = meta.get_header_from_yaml(meta_yaml.decode("UTF8").split("\n"))

    # Set description, format, unit, meta from the column
    # metadata that was serialized with the table.
    header_cols = {x["name"]: x for x in meta_hdr["datatype"]}
    for col in astropy_table.columns.values():
        column_meta = header_cols[col.name]
        for attr in ("description", "format", "unit", "meta"):
            if attr in column_meta:
                setattr(col, attr, column_meta[attr])

827 

def _arrow_string_to_numpy_dtype(
    schema: pa.Schema, name: str, numpy_column: np.ndarray | None = None, default_length: int = 10
) -> str:
    """Get the numpy dtype string associated with an arrow column.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Arrow table schema.
    name : `str`
        Column name.
    numpy_column : `numpy.ndarray`, optional
        Column to determine numpy string dtype.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    dtype_str : `str`
        Numpy dtype string.
    """
    # Special-case for string and binary columns
    md_name = f"lsst::arrow::len::{name}"
    strlen = default_length
    metadata = schema.metadata if schema.metadata is not None else {}
    if (encoded := md_name.encode("UTF-8")) in metadata:
        # String/bytes length from header.  Look up in the guarded
        # ``metadata`` dict (previously this dereferenced
        # ``schema.metadata`` directly, inconsistent with the None guard).
        strlen = int(metadata[encoded])
    elif numpy_column is not None and len(numpy_column) > 0:
        # Fall back to the widest value actually present in the column.
        strlen = max(len(row) for row in numpy_column)

    # Unicode dtype for arrow strings, fixed-width bytes otherwise.
    return f"U{strlen}" if schema.field(name).type == pa.string() else f"|S{strlen}"

864 

865 

def _append_numpy_string_metadata(metadata: Dict[bytes, str], name: str, dtype: np.dtype) -> None:
    """Append numpy string length keys to arrow metadata.

    All column types are accepted, but the metadata is only modified for
    string and byte columns.

    Parameters
    ----------
    metadata : `dict` [`bytes`, `str`]
        Metadata dictionary; modified in place.
    name : `str`
        Column name.
    dtype : `np.dtype`
        Numpy dtype.
    """
    import numpy as np

    key = f"lsst::arrow::len::{name}".encode("UTF-8")

    if dtype.type is np.str_:
        # Numpy stores unicode as four bytes per character.
        metadata[key] = str(dtype.itemsize // 4)
    elif dtype.type is np.bytes_:
        metadata[key] = str(dtype.itemsize)