Coverage for python/lsst/daf/butler/formatters/parquet.py: 13%
390 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-12 02:05 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
# Public API of this module: the Butler formatter, the conversion functions
# between arrow/pandas/astropy/numpy representations, and the schema
# wrapper classes.
__all__ = (
    "ParquetFormatter",
    "arrow_to_pandas",
    "arrow_to_astropy",
    "arrow_to_numpy",
    "arrow_to_numpy_dict",
    "pandas_to_arrow",
    "pandas_to_astropy",
    "astropy_to_arrow",
    "numpy_to_arrow",
    "numpy_to_astropy",
    "numpy_dict_to_arrow",
    "arrow_schema_to_pandas_index",
    "DataFrameSchema",
    "ArrowAstropySchema",
    "ArrowNumpySchema",
)
42import collections.abc
43import itertools
44import json
45import re
46from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Sequence, Union, cast
48import pyarrow as pa
49import pyarrow.parquet as pq
50from lsst.daf.butler import Formatter
51from lsst.utils.introspection import get_full_type_name
52from lsst.utils.iteration import ensure_iterable
54if TYPE_CHECKING: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true
55 import astropy.table as atable
56 import numpy as np
57 import pandas as pd
class ParquetFormatter(Formatter):
    """Interface for reading and writing Arrow Table objects to and from
    Parquet files.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        schema = pq.read_schema(self.fileDescriptor.location.path)

        # Schema metadata may be absent entirely (e.g. for files written
        # by other tools), in which case ``schema.metadata`` is None.
        # Normalize once so the membership tests below are always safe.
        metadata = schema.metadata if schema.metadata is not None else {}

        if component in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return schema
        elif component == "rowcount":
            # Get the rowcount from the metadata if possible, otherwise count.
            if b"lsst::arrow::rowcount" in metadata:
                return int(metadata[b"lsst::arrow::rowcount"])

            # No rowcount metadata; read a single column (cheapest option)
            # and count its rows.
            temp_table = pq.read_table(
                self.fileDescriptor.location.path,
                columns=[schema.names[0]],
                use_threads=False,
                use_pandas_metadata=False,
            )

            return len(temp_table[schema.names[0]])

        par_columns = None
        if self.fileDescriptor.parameters:
            par_columns = self.fileDescriptor.parameters.pop("columns", None)
            if par_columns:
                has_pandas_multi_index = False
                if b"pandas" in metadata:
                    md = json.loads(metadata[b"pandas"])
                    if len(md["column_indexes"]) > 1:
                        has_pandas_multi_index = True

                if not has_pandas_multi_index:
                    # Ensure uniqueness, keeping order.
                    par_columns = list(dict.fromkeys(ensure_iterable(par_columns)))
                    # Columns starting with ``__`` are internal (e.g. index
                    # columns) and may not be requested directly.
                    file_columns = [name for name in schema.names if not name.startswith("__")]

                    for par_column in par_columns:
                        if par_column not in file_columns:
                            raise ValueError(
                                f"Column {par_column} specified in parameters not available in parquet file."
                            )
                else:
                    par_columns = _standardize_multi_index_columns(schema, par_columns)

            if len(self.fileDescriptor.parameters):
                raise ValueError(
                    f"Unsupported parameters {self.fileDescriptor.parameters} in ArrowTable read."
                )

        arrow_table = pq.read_table(
            self.fileDescriptor.location.path,
            columns=par_columns,
            use_threads=False,
            use_pandas_metadata=(b"pandas" in metadata),
        )

        return arrow_table

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        import numpy as np
        from astropy.table import Table as astropyTable

        arrow_table = None
        if isinstance(inMemoryDataset, pa.Table):
            # This will be the most likely match.
            arrow_table = inMemoryDataset
        elif isinstance(inMemoryDataset, astropyTable):
            arrow_table = astropy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, np.ndarray):
            arrow_table = numpy_to_arrow(inMemoryDataset)
        else:
            if hasattr(inMemoryDataset, "to_parquet"):
                # This may be a pandas DataFrame; import pandas lazily so
                # that it remains an optional dependency on this path.
                try:
                    import pandas as pd
                except ImportError:
                    pd = None

                if pd is not None and isinstance(inMemoryDataset, pd.DataFrame):
                    arrow_table = pandas_to_arrow(inMemoryDataset)

        if arrow_table is None:
            raise ValueError(
                f"Unsupported type {get_full_type_name(inMemoryDataset)} of "
                "inMemoryDataset for ParquetFormatter."
            )

        location = self.makeUpdatedLocation(self.fileDescriptor.location)

        pq.write_table(arrow_table, location.path)
def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:
    """Build a pandas DataFrame from a pyarrow table.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Arrow table to convert. Any ``pandas`` metadata present in the
        schema is honored by the conversion.

    Returns
    -------
    dataframe : `pandas.DataFrame`
        The resulting pandas dataframe.
    """
    # Single-threaded conversion keeps the result deterministic.
    dataframe = arrow_table.to_pandas(use_threads=False)
    return dataframe
def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
    """Build an `astropy.Table` from a pyarrow table.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Arrow table to convert. Astropy unit metadata stored in the
        schema is applied to the result.

    Returns
    -------
    table : `astropy.Table`
        The resulting astropy table.
    """
    from astropy.table import Table

    table = Table(arrow_to_numpy_dict(arrow_table))

    # Restore units/descriptions/formats serialized in the schema metadata.
    md = arrow_table.schema.metadata
    _apply_astropy_metadata(table, md if md is not None else {})

    return table
def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray:
    """Build a structured numpy array from a pyarrow table.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Arrow table to convert.

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Structured array with N rows and one field per input column.
    """
    import numpy as np

    columns = arrow_to_numpy_dict(arrow_table)

    # Assemble the record dtype; multi-dimensional columns become
    # subarray fields carrying their per-row shape.
    dtype = []
    for name, col in columns.items():
        shape = col.shape
        if len(shape) <= 1:
            dtype.append((name, col.dtype))
        else:
            dtype.append((name, (col.dtype, shape[1:])))

    return np.rec.fromarrays(columns.values(), dtype=dtype)
def arrow_to_numpy_dict(arrow_table: pa.Table) -> dict[str, np.ndarray]:
    """Convert a pyarrow table to a dict of numpy arrays.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    import numpy as np

    schema = arrow_table.schema
    # Metadata may be absent; fall back to an empty dict.
    metadata = schema.metadata if schema.metadata is not None else {}

    numpy_dict = {}

    for name in schema.names:
        col = arrow_table[name].to_numpy()

        t = schema.field(name).type
        if t in (pa.string(), pa.binary()):
            # String/bytes columns come back as object arrays; coerce to
            # fixed-width numpy string dtype using the stored length.
            col = col.astype(_arrow_string_to_numpy_dtype(schema, name, col))
        elif isinstance(t, pa.FixedSizeListType):
            # Fixed-size list columns are per-row subarrays.
            if len(col) > 0:
                col = np.stack(col)
            else:
                # this is an empty column, and needs to be coerced to type.
                col = col.astype(t.value_type.to_pandas_dtype())

            # Reshape using the multi-dimensional shape recorded in the
            # metadata (falls back to a flat (list_size,) shape).
            shape = _multidim_shape_from_metadata(metadata, t.list_size, name)
            col = col.reshape((len(arrow_table), *shape))

        numpy_dict[name] = col

    return numpy_dict
def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
    """Build an arrow table from a structured numpy array.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Structured numpy array with one or more fields.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        The resulting arrow table.
    """
    type_list = _numpy_dtype_to_arrow_types(np_array.dtype)

    # Record the row count plus per-column string lengths and
    # multi-dimensional shapes so the round trip is lossless.
    md = {b"lsst::arrow::rowcount": str(len(np_array))}
    for name in np_array.dtype.names:
        _append_numpy_string_metadata(md, name, np_array.dtype[name])
        _append_numpy_multidim_metadata(md, name, np_array.dtype[name])

    schema = pa.schema(type_list, metadata=md)

    arrays = _numpy_style_arrays_to_arrow_arrays(np_array.dtype, len(np_array), np_array, schema)

    return pa.Table.from_arrays(arrays, schema=schema)
def numpy_dict_to_arrow(numpy_dict: dict[str, np.ndarray]) -> pa.Table:
    """Build an arrow table from a dict of numpy arrays.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict mapping column names to arrays.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        The resulting arrow table.

    Raises
    ------
    ValueError if columns in numpy_dict have unequal numbers of rows.
    """
    dtype, rowcount = _numpy_dict_to_dtype(numpy_dict)

    # Record the row count plus per-column string lengths and
    # multi-dimensional shapes so the round trip is lossless.
    md = {b"lsst::arrow::rowcount": str(rowcount)}
    if dtype.names is not None:
        for name in dtype.names:
            _append_numpy_string_metadata(md, name, dtype[name])
            _append_numpy_multidim_metadata(md, name, dtype[name])

    schema = pa.schema(_numpy_dtype_to_arrow_types(dtype), metadata=md)

    arrays = _numpy_style_arrays_to_arrow_arrays(dtype, rowcount, numpy_dict, schema)

    return pa.Table.from_arrays(arrays, schema=schema)
def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
    """Build an arrow table from an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.Table`
        Astropy table to convert.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        The resulting arrow table.
    """
    from astropy.table import meta

    dtype = astropy_table.dtype
    type_list = _numpy_dtype_to_arrow_types(dtype)

    # Record the row count plus per-column string lengths and
    # multi-dimensional shapes so the round trip is lossless.
    md = {b"lsst::arrow::rowcount": str(len(astropy_table))}
    for name in dtype.names:
        _append_numpy_string_metadata(md, name, dtype[name])
        _append_numpy_multidim_metadata(md, name, dtype[name])

    # Serialize astropy units/descriptions/table meta as YAML so
    # arrow_to_astropy can reconstruct them.
    md[b"table_meta_yaml"] = "\n".join(meta.get_yaml_from_table(astropy_table))

    schema = pa.schema(type_list, metadata=md)

    arrays = _numpy_style_arrays_to_arrow_arrays(dtype, len(astropy_table), astropy_table, schema)

    return pa.Table.from_arrays(arrays, schema=schema)
def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Table:
    """Convert a pandas dataframe to an arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    arrow_table = pa.Table.from_pandas(dataframe)

    # Update the metadata
    md = arrow_table.schema.metadata

    md[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

    # We loop through the arrow table columns because the datatypes have
    # been checked and converted from pandas objects.
    for name in arrow_table.column_names:
        if not name.startswith("__"):
            if arrow_table[name].type == pa.string():
                # ``default=`` covers both an empty column and a non-empty
                # column whose rows are all null; previously an all-null
                # column made max() raise ValueError on an empty sequence.
                strlen = max(
                    (len(row.as_py()) for row in arrow_table[name] if row.is_valid),
                    default=default_length,
                )
                md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

    arrow_table = arrow_table.replace_schema_metadata(md)

    return arrow_table
def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
    """Build an astropy table from a pandas dataframe, keeping indexes.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Pandas dataframe to convert.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        The resulting astropy table.
    """
    import pandas as pd
    from astropy.table import Table

    # Guard clause: multi-index column layouts have no astropy equivalent.
    if isinstance(dataframe.columns, pd.MultiIndex):
        raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

    return Table.from_pandas(dataframe, index=True)
def numpy_to_astropy(np_array: np.ndarray) -> atable.Table:
    """Build an astropy table from a structured numpy array.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Structured numpy array with one or more fields.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        The resulting astropy table.
    """
    from astropy.table import Table

    # copy=False shares the numpy buffer rather than duplicating it.
    return Table(data=np_array, copy=False)
def arrow_schema_to_pandas_index(schema: pa.Schema) -> pd.Index | pd.MultiIndex:
    """Convert an arrow schema to a pandas index/multiindex.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    index : `pandas.Index` or `pandas.MultiIndex`
        Converted pandas index.
    """
    import pandas as pd

    # Schema metadata may be None for files without any metadata;
    # normalize so the membership test below never raises TypeError.
    metadata = schema.metadata if schema.metadata is not None else {}

    if b"pandas" in metadata:
        md = json.loads(metadata[b"pandas"])
        indexes = md["column_indexes"]
        len_indexes = len(indexes)
    else:
        len_indexes = 0

    if len_indexes <= 1:
        # Columns prefixed with "__" are internal (e.g. serialized index
        # columns) and excluded from the index.
        return pd.Index(name for name in schema.names if not name.startswith("__"))
    else:
        # Reconstruct the tuple column names that pyarrow flattened.
        raw_columns = _split_multi_index_column_names(len_indexes, schema.names)
        return pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
def arrow_schema_to_column_list(schema: pa.Schema) -> list[str]:
    """Convert an arrow schema to a list of string column names.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    column_list : `list` [`str`]
        Converted list of column names.
    """
    # list() copies the names directly; the previous identity
    # comprehension ([name for name in schema.names]) was redundant.
    return list(schema.names)
class DataFrameSchema:
    """Wrapper class for a schema for a pandas DataFrame.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Dataframe to turn into a schema.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        # A zero-row selection preserves the column names, dtypes, and
        # index structure of the input dataframe.
        self._schema = dataframe.loc[[False] * len(dataframe)]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> DataFrameSchema:
        """Convert an arrow schema into a `DataFrameSchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            The pyarrow schema to convert.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        # An empty row list produces a zero-row table with the full schema.
        # (The previous ``[] * len(schema.names)`` always evaluated to [].)
        empty_table = pa.Table.from_pylist([], schema=schema)

        return cls(empty_table.to_pandas())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        arrow_table = pa.Table.from_pandas(self._schema)

        return arrow_table.schema

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        return ArrowNumpySchema.from_arrow(self.to_arrow_schema())

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an ArrowAstropySchema.

        Returns
        -------
        arrow_astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    @property
    def schema(self) -> pd.DataFrame:
        # The underlying zero-row dataframe carrying the schema.
        # (The previous ``np.dtype`` return annotation was incorrect.)
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataFrameSchema):
            return NotImplemented

        return self._schema.equals(other._schema)
class ArrowAstropySchema:
    """Wrapper class for a schema for an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Input astropy table.
    """

    def __init__(self, astropy_table: atable.Table) -> None:
        # A zero-row slice keeps dtypes, units, and metadata without data.
        self._schema = astropy_table[:0]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowAstropySchema:
        """Convert an arrow schema into a ArrowAstropySchema.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Input pyarrow schema.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        import numpy as np
        from astropy.table import Table

        # Build an empty table with the matching record dtype, then
        # restore units/descriptions from the schema metadata.
        astropy_table = Table(data=np.zeros(0, dtype=_schema_to_dtype_list(schema)))

        md = schema.metadata
        _apply_astropy_metadata(astropy_table, md if md is not None else {})

        return cls(astropy_table)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        return astropy_to_arrow(self._schema).schema

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a DataFrameSchema.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        return DataFrameSchema.from_arrow(astropy_to_arrow(self._schema).schema)

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        return ArrowNumpySchema.from_arrow(astropy_to_arrow(self._schema).schema)

    @property
    def schema(self) -> atable.Table:
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowAstropySchema):
            return NotImplemented

        # Equal dtypes implies equal column names as well.
        if self._schema.dtype != other._schema.dtype:
            return False

        # Column-level attributes must also agree.
        for name in self._schema.columns:
            ours = self._schema[name]
            theirs = other._schema[name]
            for attr in ("unit", "description", "format"):
                if not getattr(ours, attr) == getattr(theirs, attr):
                    return False

        return True
class ArrowNumpySchema:
    """Wrapper class for a schema for a numpy ndarray.

    Parameters
    ----------
    numpy_dtype : `numpy.dtype`
        Numpy dtype to convert.
    """

    def __init__(self, numpy_dtype: np.dtype) -> None:
        self._dtype = numpy_dtype

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowNumpySchema:
        """Convert an arrow schema into an `ArrowNumpySchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Pyarrow schema to convert.

        Returns
        -------
        numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        import numpy as np

        return cls(np.dtype(_schema_to_dtype_list(schema)))

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        import numpy as np

        # Round-trip through an empty arrow table to carry the metadata.
        empty = np.zeros(0, dtype=self._dtype)
        return ArrowAstropySchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a `DataFrameSchema`.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return DataFrameSchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to a `pyarrow.Schema`.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return numpy_to_arrow(empty).schema

    @property
    def schema(self) -> np.dtype:
        return self._dtype

    def __repr__(self) -> str:
        return repr(self._dtype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowNumpySchema):
            return NotImplemented

        return self._dtype == other._dtype
def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequence[str]]:
    """Recover multi-index column tuples from their flattened names.

    PyArrow maps Pandas' multi-index column names (which are tuples in
    Python) to flat strings on disk; this reverses that mapping.

    Parameters
    ----------
    n : `int`
        Number of levels in the `pandas.MultiIndex` that is being
        reconstructed.
    names : `~collections.abc.Iterable` [`str`]
        Strings to be split.

    Returns
    -------
    column_names : `list` [`tuple` [`str`]]
        A list of multi-index column name tuples.
    """
    # One capture group per index level, e.g. "('(.*)', '(.*)')" for n=2.
    pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n)))

    column_names: List[Sequence[str]] = []
    for name in names:
        if (match := pattern.search(name)) is not None:
            column_names.append(match.groups())

    return column_names
def _standardize_multi_index_columns(
    schema: pa.Schema, columns: Union[List[tuple], dict[str, Union[str, List[str]]]]
) -> List[str]:
    """Translate a multi-index column selection (list of tuples or dict of
    level values) into the flat string names PyArrow stores on disk.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Pyarrow schema.
    columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]
        Columns to standardize.

    Returns
    -------
    names : `list` [`str`]
        Stringified representation of a multi-index column name.
    """
    pd_index = arrow_schema_to_pandas_index(schema)
    index_level_names = tuple(pd_index.names)

    names = []

    if isinstance(columns, list):
        # A list must contain one tuple per requested column.
        for requested in columns:
            if not isinstance(requested, tuple):
                raise ValueError(
                    "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                    f"Instead got a {get_full_type_name(requested)}."
                )
            names.append(str(requested))
        return names

    if not isinstance(columns, collections.abc.Mapping):
        raise ValueError(
            "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
            f"Instead got a {get_full_type_name(columns)}."
        )
    if not set(index_level_names).issuperset(columns.keys()):
        raise ValueError(
            f"Cannot use dict with keys {set(columns.keys())} "
            f"to select columns from {index_level_names}."
        )

    # Any level omitted from the dict expands to all of its values;
    # the cartesian product yields every matching column tuple.
    factors = [
        ensure_iterable(columns.get(level, pd_index.levels[i]))
        for i, level in enumerate(index_level_names)
    ]
    for requested in itertools.product(*factors):
        for i, value in enumerate(requested):
            if value not in pd_index.levels[i]:
                raise ValueError(f"Unrecognized value {value!r} for index {index_level_names[i]!r}.")
        names.append(str(requested))

    return names
def _apply_astropy_metadata(astropy_table: atable.Table, metadata: dict) -> None:
    """Apply any astropy metadata from the schema metadata.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Table to apply metadata.
    metadata : `dict` [`bytes`]
        Metadata dict.
    """
    from astropy.table import meta

    meta_yaml = metadata.get(b"table_meta_yaml", None)
    if not meta_yaml:
        return

    meta_hdr = meta.get_header_from_yaml(meta_yaml.decode("UTF8").split("\n"))

    # Restore description, format, unit, meta on each column from the
    # header that was serialized with the table.
    header_cols = {x["name"]: x for x in meta_hdr["datatype"]}
    for col in astropy_table.columns.values():
        col_info = header_cols[col.name]
        for attr in ("description", "format", "unit", "meta"):
            if attr in col_info:
                setattr(col, attr, col_info[attr])

    if "meta" in meta_hdr:
        astropy_table.meta.update(meta_hdr["meta"])
def _arrow_string_to_numpy_dtype(
    schema: pa.Schema, name: str, numpy_column: np.ndarray | None = None, default_length: int = 10
) -> str:
    """Get the numpy dtype string associated with an arrow column.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Arrow table schema.
    name : `str`
        Column name.
    numpy_column : `numpy.ndarray`, optional
        Column to determine numpy string dtype.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    dtype_str : `str`
        Numpy dtype string.
    """
    # Special-case for string and binary columns
    md_name = f"lsst::arrow::len::{name}"
    strlen = default_length
    metadata = schema.metadata if schema.metadata is not None else {}
    if (encoded := md_name.encode("UTF-8")) in metadata:
        # String/bytes length from header. Read through the guarded
        # ``metadata`` dict rather than ``schema.metadata``, which can be
        # None in general.
        strlen = int(metadata[encoded])
    elif numpy_column is not None:
        if len(numpy_column) > 0:
            strlen = max(len(row) for row in numpy_column)

    # UTF strings map to U<n>; binary maps to fixed-width bytes |S<n>.
    dtype = f"U{strlen}" if schema.field(name).type == pa.string() else f"|S{strlen}"

    return dtype
def _append_numpy_string_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None:
    """Record the string length of a column in the arrow metadata.

    All column types are accepted, but the metadata is only modified for
    string and byte columns.

    Parameters
    ----------
    metadata : `dict` [`bytes`, `str`]
        Metadata dictionary; modified in place.
    name : `str`
        Column name.
    dtype : `np.dtype`
        Numpy dtype.
    """
    import numpy as np

    if dtype.type is np.str_:
        # numpy unicode is stored as UCS-4: 4 bytes per character.
        length = dtype.itemsize // 4
    elif dtype.type is np.bytes_:
        length = dtype.itemsize
    else:
        return

    metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(length)
def _append_numpy_multidim_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None:
    """Record the shape of a multi-dimensional column in the arrow metadata.

    All column types are accepted, but the metadata is only modified for
    columns with more than one dimension per row.

    Parameters
    ----------
    metadata : `dict` [`bytes`, `str`]
        Metadata dictionary; modified in place.
    name : `str`
        Column name.
    dtype : `np.dtype`
        Numpy dtype.
    """
    shape = dtype.shape
    if len(shape) <= 1:
        return

    metadata[f"lsst::arrow::shape::{name}".encode("UTF-8")] = str(shape)
def _multidim_shape_from_metadata(metadata: dict[bytes, bytes], list_size: int, name: str) -> tuple[int, ...]:
    """Retrieve the shape from the metadata, if available.

    Parameters
    ----------
    metadata : `dict` [`bytes`, `bytes`]
        Metadata dictionary.
    list_size : `int`
        Size of the list datatype.
    name : `str`
        Column name.

    Returns
    -------
    shape : `tuple` [`int`]
        Shape associated with the column.

    Raises
    ------
    RuntimeError
        Raised if metadata is found but has incorrect format.
    """
    encoded = f"lsst::arrow::shape::{name}".encode("UTF-8")
    if encoded not in metadata:
        # No recorded shape; treat the column as a flat list.
        return (list_size,)

    # The shape is serialized as e.g. "(3, 4)"; parse the tuple back out.
    match = re.search(r"\((.*)\)", metadata[encoded].decode("UTF-8"))
    if match is None:
        raise RuntimeError("Illegal value found in metadata.")

    return tuple(int(dim) for dim in match[1].split(",") if dim != "")
def _schema_to_dtype_list(schema: pa.Schema) -> list[tuple[str, tuple[Any] | str]]:
    """Convert a pyarrow schema to a numpy dtype.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    dtype_list: `list` [`tuple`]
        A list with name, type pairs.
    """
    metadata = schema.metadata if schema.metadata is not None else {}

    dtype: list[Any] = []
    for name in schema.names:
        field_type = schema.field(name).type
        if isinstance(field_type, pa.FixedSizeListType):
            # Fixed-size lists map to numpy subarray fields with the
            # shape recorded in the metadata.
            shape = _multidim_shape_from_metadata(metadata, field_type.list_size, name)
            dtype.append((name, (field_type.value_type.to_pandas_dtype(), shape)))
        elif field_type in (pa.string(), pa.binary()):
            # String/bytes columns need a fixed-width numpy dtype string.
            dtype.append((name, _arrow_string_to_numpy_dtype(schema, name)))
        else:
            dtype.append((name, field_type.to_pandas_dtype()))

    return dtype
def _numpy_dtype_to_arrow_types(dtype: np.dtype) -> list[Any]:
    """Convert a numpy dtype to a list of arrow types.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype to convert.

    Returns
    -------
    type_list : `list` [`object`]
        Converted list of arrow types.
    """
    from math import prod

    import numpy as np

    if dtype.names is None:
        return []

    type_list: list[Any] = []
    for name in dtype.names:
        dt = dtype[name]
        arrow_type: Any
        if len(dt.shape) > 0:
            # Subarray fields become fixed-size lists whose length is the
            # product of the per-row shape.
            base = cast(tuple[np.dtype, tuple[int, ...]], dt.subdtype)[0]
            arrow_type = pa.list_(pa.from_numpy_dtype(base.type), prod(dt.shape))
        else:
            arrow_type = pa.from_numpy_dtype(dt.type)
        type_list.append((name, arrow_type))

    return type_list
def _numpy_dict_to_dtype(numpy_dict: dict[str, np.ndarray]) -> tuple[np.dtype, int]:
    """Extract equivalent table dtype from dict of numpy arrays.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.

    Returns
    -------
    dtype : `numpy.dtype`
        dtype of equivalent table.
    rowcount : `int`
        Number of rows in the table.

    Raises
    ------
    ValueError if columns in numpy_dict have unequal numbers of rows.
    """
    import numpy as np

    dtype_list = []
    # Use a None sentinel rather than 0: with 0, a zero-length first
    # column would silently re-seed the reference length from a later
    # column and a genuine row-count mismatch would go undetected.
    rowcount: int | None = None
    for name, col in numpy_dict.items():
        if rowcount is None:
            rowcount = len(col)
        if len(col) != rowcount:
            raise ValueError(f"Column {name} has a different number of rows.")
        if len(col.shape) == 1:
            dtype_list.append((name, col.dtype))
        else:
            # Multi-dimensional columns become subarray fields.
            dtype_list.append((name, (col.dtype, col.shape[1:])))
    dtype = np.dtype(dtype_list)

    return (dtype, rowcount if rowcount is not None else 0)
def _numpy_style_arrays_to_arrow_arrays(
    dtype: np.dtype,
    rowcount: int,
    np_style_arrays: dict[str, np.ndarray] | np.ndarray | atable.Table,
    schema: pa.Schema,
) -> list[pa.Array]:
    """Convert numpy-style arrays to arrow arrays.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype of input table/arrays.
    rowcount : `int`
        Number of rows in input table/arrays.
    np_style_arrays : `dict` [`str`, `np.ndarray`] or `np.ndarray`
        or `astropy.table.Table`
        Arrays to convert to arrow.
    schema : `pyarrow.Schema`
        Schema of arrow table.

    Returns
    -------
    arrow_arrays : `list` [`pyarrow.Array`]
        List of converted pyarrow arrays.
    """
    import numpy as np

    if dtype.names is None:
        return []

    arrow_arrays: list[pa.Array] = []
    for name in dtype.names:
        dt = dtype[name]
        val: Any
        if len(dt.shape) > 0:
            # Multi-dimensional columns are flattened and re-split into
            # one piece per row so arrow sees a list column.
            val = np.split(np_style_arrays[name].ravel(), rowcount) if rowcount > 0 else []
        else:
            val = np_style_arrays[name]
        arrow_arrays.append(pa.array(val, type=schema.field(name).type))

    return arrow_arrays