Coverage for python/lsst/daf/butler/formatters/parquet.py: 13%
415 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-04-14 09:22 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
# Names exported by ``from <this module> import *``.
__all__ = (
    "ParquetFormatter",
    "arrow_to_pandas",
    "arrow_to_astropy",
    "arrow_to_numpy",
    "arrow_to_numpy_dict",
    "pandas_to_arrow",
    "pandas_to_astropy",
    "astropy_to_arrow",
    "numpy_to_arrow",
    "numpy_to_astropy",
    "numpy_dict_to_arrow",
    "arrow_schema_to_pandas_index",
    "DataFrameSchema",
    "ArrowAstropySchema",
    "ArrowNumpySchema",
)
42import collections.abc
43import itertools
44import json
45import re
46from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Sequence, cast
48import pyarrow as pa
49import pyarrow.parquet as pq
50from lsst.daf.butler import Formatter
51from lsst.utils.introspection import get_full_type_name
52from lsst.utils.iteration import ensure_iterable
54if TYPE_CHECKING:
55 import astropy.table as atable
56 import numpy as np
57 import pandas as pd
class ParquetFormatter(Formatter):
    """Interface for reading and writing Arrow Table objects to and from
    Parquet files.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        path = self.fileDescriptor.location.path
        schema = pq.read_schema(path)

        # ``schema.metadata`` is `None` for files written without key/value
        # metadata; normalize once so membership tests below cannot raise.
        metadata = schema.metadata if schema.metadata is not None else {}

        if component in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return schema
        elif component == "rowcount":
            # Get the rowcount from the metadata if possible, otherwise count.
            if b"lsst::arrow::rowcount" in metadata:
                return int(metadata[b"lsst::arrow::rowcount"])

            # Only the first column is read to count the rows.
            temp_table = pq.read_table(
                path,
                columns=[schema.names[0]],
                use_threads=False,
                use_pandas_metadata=False,
            )

            return len(temp_table[schema.names[0]])

        par_columns = None
        if self.fileDescriptor.parameters:
            # ``pop`` removes the handled key so that anything left over can
            # be reported as unsupported below.
            par_columns = self.fileDescriptor.parameters.pop("columns", None)
            if par_columns:
                has_pandas_multi_index = False
                if b"pandas" in metadata:
                    md = json.loads(metadata[b"pandas"])
                    if len(md["column_indexes"]) > 1:
                        has_pandas_multi_index = True

                if not has_pandas_multi_index:
                    # Ensure uniqueness, keeping order.
                    par_columns = list(dict.fromkeys(ensure_iterable(par_columns)))
                    file_columns = [name for name in schema.names if not name.startswith("__")]

                    for par_column in par_columns:
                        if par_column not in file_columns:
                            raise ValueError(
                                f"Column {par_column} specified in parameters not available in parquet file."
                            )
                else:
                    par_columns = _standardize_multi_index_columns(schema, par_columns)

            if len(self.fileDescriptor.parameters):
                raise ValueError(
                    f"Unsupported parameters {self.fileDescriptor.parameters} in ArrowTable read."
                )

        arrow_table = pq.read_table(
            path,
            columns=par_columns,
            use_threads=False,
            use_pandas_metadata=(b"pandas" in metadata),
        )

        return arrow_table

    def write(self, inMemoryDataset: Any) -> None:
        """Convert the dataset to an arrow table and write it as parquet.

        Parameters
        ----------
        inMemoryDataset : `object`
            A `pyarrow.Table`, `astropy.table.Table`, `numpy.ndarray`,
            `dict` of numpy arrays, or `pandas.DataFrame`.

        Raises
        ------
        ValueError
            Raised if the dataset is of an unsupported type, or is a `dict`
            that cannot be converted as a dict of numpy arrays.
        """
        import numpy as np
        from astropy.table import Table as astropyTable

        arrow_table = None
        if isinstance(inMemoryDataset, pa.Table):
            # This will be the most likely match.
            arrow_table = inMemoryDataset
        elif isinstance(inMemoryDataset, astropyTable):
            arrow_table = astropy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, np.ndarray):
            arrow_table = numpy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, dict):
            try:
                arrow_table = numpy_dict_to_arrow(inMemoryDataset)
            except (TypeError, AttributeError) as e:
                raise ValueError(
                    "Input dict for inMemoryDataset does not appear to be a dict of numpy arrays."
                ) from e
        else:
            if hasattr(inMemoryDataset, "to_parquet"):
                # This may be a pandas DataFrame; pandas is only imported
                # when the object looks like one.
                try:
                    import pandas as pd
                except ImportError:
                    pd = None

                if pd is not None and isinstance(inMemoryDataset, pd.DataFrame):
                    arrow_table = pandas_to_arrow(inMemoryDataset)

        if arrow_table is None:
            raise ValueError(
                f"Unsupported type {get_full_type_name(inMemoryDataset)} of "
                "inMemoryDataset for ParquetFormatter."
            )

        location = self.makeUpdatedLocation(self.fileDescriptor.location)

        pq.write_table(arrow_table, location.path)
def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:
    """Build a pandas DataFrame from a pyarrow table.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Arrow table to convert. Any ``pandas`` metadata stored in the
        schema is used when constructing the ``DataFrame``.

    Returns
    -------
    dataframe : `pandas.DataFrame`
        The converted dataframe.
    """
    # Conversion runs single-threaded, and integer columns containing nulls
    # are requested as object columns.
    conversion_options = {"use_threads": False, "integer_object_nulls": True}
    dataframe = arrow_table.to_pandas(**conversion_options)
    return dataframe
def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
    """Build an `astropy.Table` from a pyarrow table.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Arrow table to convert. Astropy metadata stored in the schema
        (if any) is applied to the resulting table.

    Returns
    -------
    table : `astropy.Table`
        The converted astropy table.
    """
    from astropy.table import Table

    columns = arrow_to_numpy_dict(arrow_table)
    table = Table(columns)

    # The schema metadata may be absent; treat that as "no metadata".
    schema_metadata = arrow_table.schema.metadata
    if schema_metadata is None:
        schema_metadata = {}

    _apply_astropy_metadata(table, schema_metadata)
    return table
def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray:
    """Build a structured numpy array from a pyarrow table.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Arrow table to convert.

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and the same column names
        as the input arrow table.
    """
    import numpy as np

    columns = arrow_to_numpy_dict(arrow_table)

    field_specs = []
    for name, col in columns.items():
        col_shape = col.shape
        if len(col_shape) > 1:
            # Multi-dimensional column: the per-row shape becomes a
            # sub-array dtype.
            field_specs.append((name, (col.dtype, col_shape[1:])))
        else:
            field_specs.append((name, col.dtype))

    return np.rec.fromarrays(columns.values(), dtype=field_specs)
def arrow_to_numpy_dict(arrow_table: pa.Table) -> dict[str, np.ndarray]:
    """Build a dict of numpy arrays from a pyarrow table.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Arrow table to convert.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
        Columns containing nulls are returned as masked arrays.
    """
    import numpy as np

    schema = arrow_table.schema
    metadata = {} if schema.metadata is None else schema.metadata
    stringlike_types = (pa.string(), pa.binary())

    result = {}

    for name in schema.names:
        arrow_type = schema.field(name).type
        column = arrow_table[name]
        is_stringlike = arrow_type in stringlike_types

        if column.null_count != 0:
            # Nulls are filled with a typed placeholder before conversion,
            # and the null positions are carried as the mask.
            fill_value = "" if is_stringlike else arrow_type.to_pandas_dtype()(0)
            values = np.ma.masked_array(
                data=column.fill_null(fill_value).to_numpy(),
                mask=column.is_null().to_numpy(),
            )
        else:
            # Regular non-masked column.
            values = column.to_numpy()

        if is_stringlike:
            values = values.astype(_arrow_string_to_numpy_dtype(schema, name, values))
        elif isinstance(arrow_type, pa.FixedSizeListType):
            if len(values) > 0:
                values = np.stack(values)
            else:
                # An empty column needs to be coerced to the element type.
                values = values.astype(arrow_type.value_type.to_pandas_dtype())

            row_shape = _multidim_shape_from_metadata(metadata, arrow_type.list_size, name)
            values = values.reshape((len(arrow_table), *row_shape))

        result[name] = values

    return result
def _numpy_dict_to_numpy(numpy_dict: dict[str, np.ndarray]) -> np.ndarray:
    """Build a structured numpy array from a dict of numpy arrays.

    The conversion goes through an intermediate arrow table.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and columns names from the dict keys.
    """
    intermediate_table = numpy_dict_to_arrow(numpy_dict)
    return arrow_to_numpy(intermediate_table)
def _numpy_to_numpy_dict(np_array: np.ndarray) -> dict[str, np.ndarray]:
    """Build a dict of numpy arrays from a structured numpy array.

    The conversion goes through an intermediate arrow table.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    intermediate_table = numpy_to_arrow(np_array)
    return arrow_to_numpy_dict(intermediate_table)
def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
    """Build an arrow table from a structured numpy array.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    table_dtype = np_array.dtype
    arrow_fields = _numpy_dtype_to_arrow_types(table_dtype)
    n_rows = len(np_array)

    # Row count, string lengths and multi-dimensional shapes are recorded
    # in the schema metadata.
    metadata = {b"lsst::arrow::rowcount": str(n_rows)}
    for name in table_dtype.names:
        field_dtype = table_dtype[name]
        _append_numpy_string_metadata(metadata, name, field_dtype)
        _append_numpy_multidim_metadata(metadata, name, field_dtype)

    schema = pa.schema(arrow_fields, metadata=metadata)
    arrays = _numpy_style_arrays_to_arrow_arrays(table_dtype, n_rows, np_array, schema)

    return pa.Table.from_arrays(arrays, schema=schema)
def numpy_dict_to_arrow(numpy_dict: dict[str, np.ndarray]) -> pa.Table:
    """Build an arrow table from a dict of numpy arrays.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.

    Raises
    ------
    ValueError
        Raised if columns in ``numpy_dict`` have unequal numbers of rows.
    """
    table_dtype, n_rows = _numpy_dict_to_dtype(numpy_dict)
    arrow_fields = _numpy_dtype_to_arrow_types(table_dtype)

    # Row count, string lengths and multi-dimensional shapes are recorded
    # in the schema metadata.
    metadata = {b"lsst::arrow::rowcount": str(n_rows)}
    field_names = table_dtype.names
    if field_names is not None:
        for name in field_names:
            field_dtype = table_dtype[name]
            _append_numpy_string_metadata(metadata, name, field_dtype)
            _append_numpy_multidim_metadata(metadata, name, field_dtype)

    schema = pa.schema(arrow_fields, metadata=metadata)
    arrays = _numpy_style_arrays_to_arrow_arrays(table_dtype, n_rows, numpy_dict, schema)

    return pa.Table.from_arrays(arrays, schema=schema)
def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
    """Build an arrow table from an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.Table`
        Input astropy table.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    from astropy.table import meta

    table_dtype = astropy_table.dtype
    arrow_fields = _numpy_dtype_to_arrow_types(table_dtype)
    n_rows = len(astropy_table)

    metadata = {b"lsst::arrow::rowcount": str(n_rows)}
    for name in table_dtype.names:
        field_dtype = table_dtype[name]
        _append_numpy_string_metadata(metadata, name, field_dtype)
        _append_numpy_multidim_metadata(metadata, name, field_dtype)

    # The astropy table description is stored as a single YAML string so
    # that it can be re-applied when reading back.
    yaml_lines = meta.get_yaml_from_table(astropy_table)
    metadata[b"table_meta_yaml"] = "\n".join(yaml_lines)

    schema = pa.schema(arrow_fields, metadata=metadata)
    arrays = _numpy_style_arrays_to_arrow_arrays(table_dtype, n_rows, astropy_table, schema)

    return pa.Table.from_arrays(arrays, schema=schema)
def _astropy_to_numpy_dict(astropy_table: atable.Table) -> dict[str, np.ndarray]:
    """Build a dict of numpy arrays from an astropy table.

    The conversion goes through an intermediate arrow table.

    Parameters
    ----------
    astropy_table : `astropy.Table`
        Input astropy table.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    intermediate_table = astropy_to_arrow(astropy_table)
    return arrow_to_numpy_dict(intermediate_table)
def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Table:
    """Convert a pandas dataframe to an arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.
    default_length : `int`, optional
        String length recorded for a string column when it cannot be
        inferred from the column, i.e. the column is empty or contains
        only nulls.

    Returns
    -------
    arrow_table : `pyarrow.Table`
        Converted arrow table.
    """
    arrow_table = pa.Table.from_pandas(dataframe)

    # Update the metadata
    md = arrow_table.schema.metadata

    md[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

    # We loop through the arrow table columns because the datatypes have
    # been checked and converted from pandas objects.
    for name in arrow_table.column_names:
        if name.startswith("__") or arrow_table[name].type != pa.string():
            continue

        # ``default=`` covers both an empty column and a column holding only
        # nulls; without it ``max`` raises ValueError on the all-null case.
        strlen = max(
            (len(row.as_py()) for row in arrow_table[name] if row.is_valid),
            default=default_length,
        )
        md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

    arrow_table = arrow_table.replace_schema_metadata(md)

    return arrow_table
def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
    """Build an astropy table from a pandas dataframe, preserving indexes.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        Converted astropy table.

    Raises
    ------
    ValueError
        Raised if the dataframe columns are a `pandas.MultiIndex`.
    """
    import pandas as pd
    from astropy.table import Table

    has_multi_index_columns = isinstance(dataframe.columns, pd.MultiIndex)
    if has_multi_index_columns:
        raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

    astropy_table = Table.from_pandas(dataframe, index=True)
    return astropy_table
def _pandas_to_numpy_dict(dataframe: pd.DataFrame) -> dict[str, np.ndarray]:
    """Build a dict of numpy arrays from a pandas dataframe.

    The conversion goes through an intermediate arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Input pandas dataframe.

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    intermediate_table = pandas_to_arrow(dataframe)
    return arrow_to_numpy_dict(intermediate_table)
def numpy_to_astropy(np_array: np.ndarray) -> atable.Table:
    """Build an astropy table from a numpy table.

    Parameters
    ----------
    np_array : `numpy.ndarray`
        Input numpy array with multiple fields.

    Returns
    -------
    astropy_table : `astropy.table.Table`
        Converted astropy table, constructed with ``copy=False``.
    """
    from astropy.table import Table

    astropy_table = Table(data=np_array, copy=False)
    return astropy_table
def arrow_schema_to_pandas_index(schema: pa.Schema) -> pd.Index | pd.MultiIndex:
    """Convert an arrow schema to a pandas index/multiindex.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    index : `pandas.Index` or `pandas.MultiIndex`
        Converted pandas index. A `pandas.MultiIndex` is returned only when
        the ``pandas`` schema metadata declares more than one column index
        level; names starting with ``__`` are excluded from a flat index.
    """
    import pandas as pd

    # ``schema.metadata`` is `None` when the schema carries no metadata;
    # treat that the same as "no pandas metadata".
    metadata = schema.metadata if schema.metadata is not None else {}

    indexes = []
    if b"pandas" in metadata:
        indexes = json.loads(metadata[b"pandas"])["column_indexes"]

    if len(indexes) <= 1:
        return pd.Index([name for name in schema.names if not name.startswith("__")])

    raw_columns = _split_multi_index_column_names(len(indexes), schema.names)
    return pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
def arrow_schema_to_column_list(schema: pa.Schema) -> list[str]:
    """Return the column names of an arrow schema as a list.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    column_list : `list` [`str`]
        New list holding the names from ``schema.names``, in order.
    """
    return list(schema.names)
class DataFrameSchema:
    """Wrapper class for a schema for a pandas DataFrame.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Dataframe to turn into a schema.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        # An all-False boolean mask selects no rows, so only the zero-row
        # selection of the input dataframe is stored.
        self._schema = dataframe.loc[[False] * len(dataframe)]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> DataFrameSchema:
        """Convert an arrow schema into a `DataFrameSchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            The pyarrow schema to convert.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        # A table with no rows is built from an empty row list; the schema
        # supplies the columns.
        empty_table = pa.Table.from_pylist([], schema=schema)

        return cls(empty_table.to_pandas())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        arrow_table = pa.Table.from_pandas(self._schema)

        return arrow_table.schema

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        return ArrowNumpySchema.from_arrow(self.to_arrow_schema())

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an ArrowAstropySchema.

        Returns
        -------
        arrow_astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    @property
    def schema(self) -> pd.DataFrame:
        """The stored zero-row dataframe (`pandas.DataFrame`)."""
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataFrameSchema):
            return NotImplemented

        return self._schema.equals(other._schema)
class ArrowAstropySchema:
    """Wrapper class for a schema for an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Input astropy table.
    """

    def __init__(self, astropy_table: atable.Table) -> None:
        # Only a zero-row slice of the input table is stored.
        self._schema = astropy_table[:0]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowAstropySchema:
        """Convert an arrow schema into a ArrowAstropySchema.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Input pyarrow schema.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        import numpy as np
        from astropy.table import Table

        empty_data = np.zeros(0, dtype=_schema_to_dtype_list(schema))
        table = Table(data=empty_data)

        schema_metadata = {} if schema.metadata is None else schema.metadata
        _apply_astropy_metadata(table, schema_metadata)

        return cls(table)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        arrow_table = astropy_to_arrow(self._schema)
        return arrow_table.schema

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a DataFrameSchema.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        arrow_schema = astropy_to_arrow(self._schema).schema
        return DataFrameSchema.from_arrow(arrow_schema)

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        arrow_schema = astropy_to_arrow(self._schema).schema
        return ArrowNumpySchema.from_arrow(arrow_schema)

    @property
    def schema(self) -> atable.Table:
        """The stored zero-row table (`astropy.table.Table`)."""
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowAstropySchema):
            return NotImplemented

        mine, theirs = self._schema, other._schema

        # If this comparison passes then the two tables have the
        # same column names.
        if mine.dtype != theirs.dtype:
            return False

        # Per column, the unit, description and format must all agree.
        for name in mine.columns:
            for attr in ("unit", "description", "format"):
                if not getattr(mine[name], attr) == getattr(theirs[name], attr):
                    return False

        return True
class ArrowNumpySchema:
    """Wrapper class for a schema for a numpy ndarray.

    Parameters
    ----------
    numpy_dtype : `numpy.dtype`
        Numpy dtype to convert.
    """

    def __init__(self, numpy_dtype: np.dtype) -> None:
        self._dtype = numpy_dtype

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowNumpySchema:
        """Convert an arrow schema into an `ArrowNumpySchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Pyarrow schema to convert.

        Returns
        -------
        numpy_schema : `ArrowNumpySchema`
            Converted arrow numpy schema.
        """
        import numpy as np

        dtype_list = _schema_to_dtype_list(schema)
        return cls(np.dtype(dtype_list))

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
            Converted arrow astropy schema.
        """
        import numpy as np

        # A zero-length array of the stored dtype is converted to arrow to
        # obtain the equivalent arrow schema.
        empty = np.zeros(0, dtype=self._dtype)
        return ArrowAstropySchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a `DataFrameSchema`.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
            Converted dataframe schema.
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return DataFrameSchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to a `pyarrow.Schema`.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
            Converted pyarrow schema.
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return numpy_to_arrow(empty).schema

    @property
    def schema(self) -> np.dtype:
        """The wrapped dtype (`numpy.dtype`)."""
        return self._dtype

    def __repr__(self) -> str:
        return repr(self._dtype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowNumpySchema):
            return NotImplemented

        return bool(self._dtype == other._dtype)
866def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequence[str]]:
867 """Split a string that represents a multi-index column.
869 PyArrow maps Pandas' multi-index column names (which are tuples in Python)
870 to flat strings on disk. This routine exists to reconstruct the original
871 tuple.
873 Parameters
874 ----------
875 n : `int`
876 Number of levels in the `pandas.MultiIndex` that is being
877 reconstructed.
878 names : `~collections.abc.Iterable` [`str`]
879 Strings to be split.
881 Returns
882 -------
883 column_names : `list` [`tuple` [`str`]]
884 A list of multi-index column name tuples.
885 """
886 column_names: List[Sequence[str]] = []
888 pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n)))
889 for name in names:
890 m = re.search(pattern, name)
891 if m is not None:
892 column_names.append(m.groups())
894 return column_names
def _standardize_multi_index_columns(
    schema: pa.Schema,
    columns: Any,
    stringify: bool = True,
) -> list[str | Sequence[Any]]:
    """Transform a dictionary/iterable index from a multi-index column list
    into a form directly understandable by PyArrow.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Pyarrow schema.
    columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]
        Columns to standardize. A `dict` maps index level names to the
        value(s) to select at that level; levels not mentioned select
        every value of that level.
    stringify : `bool`, optional
        Should the column names be stringified?

    Returns
    -------
    names : `list` [`str`] or `list` [`tuple`]
        Multi-index column names, stringified with `str` when ``stringify``
        is `True` and left as tuples otherwise.

    Raises
    ------
    ValueError
        Raised if ``columns`` is neither a list of tuples nor a mapping,
        if a mapping key is not an index level name, or if a requested
        value is not present in the corresponding index level.
    """
    pd_index = arrow_schema_to_pandas_index(schema)
    level_names = tuple(pd_index.names)

    def _as_output(column: Sequence[Any]) -> str | Sequence[Any]:
        return str(column) if stringify else column

    if isinstance(columns, list):
        selected: list[str | Sequence[Any]] = []
        for requested in columns:
            if not isinstance(requested, tuple):
                raise ValueError(
                    "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                    f"Instead got a {get_full_type_name(requested)}."
                )
            selected.append(_as_output(requested))
        return selected

    if not isinstance(columns, collections.abc.Mapping):
        raise ValueError(
            "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
            f"Instead got a {get_full_type_name(columns)}."
        )
    if not set(level_names).issuperset(columns.keys()):
        raise ValueError(
            f"Cannot use dict with keys {set(columns.keys())} to select columns from {level_names}."
        )

    # One iterable of candidate values per index level, in level order.
    factors = [
        ensure_iterable(columns.get(level, pd_index.levels[level_pos]))
        for level_pos, level in enumerate(level_names)
    ]

    selected = []
    for requested in itertools.product(*factors):
        for level_pos, value in enumerate(requested):
            if value not in pd_index.levels[level_pos]:
                raise ValueError(f"Unrecognized value {value!r} for index {level_names[level_pos]!r}.")
        selected.append(_as_output(requested))

    return selected
def _apply_astropy_metadata(astropy_table: atable.Table, metadata: dict) -> None:
    """Apply any astropy metadata from the schema metadata.

    Nothing is done when the ``table_meta_yaml`` key is missing or empty.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Table to apply metadata; modified in place.
    metadata : `dict` [`bytes`]
        Metadata dict.
    """
    from astropy.table import meta

    serialized = metadata.get(b"table_meta_yaml", None)
    if not serialized:
        return

    yaml_lines = serialized.decode("UTF8").split("\n")
    header = meta.get_header_from_yaml(yaml_lines)

    # Set description, format, unit, meta from the column
    # metadata that was serialized with the table.
    columns_by_name = {entry["name"]: entry for entry in header["datatype"]}
    for col in astropy_table.columns.values():
        column_header = columns_by_name[col.name]
        for attr in ("description", "format", "unit", "meta"):
            if attr in column_header:
                setattr(col, attr, column_header[attr])

    if "meta" in header:
        astropy_table.meta.update(header["meta"])
def _arrow_string_to_numpy_dtype(
    schema: pa.Schema, name: str, numpy_column: np.ndarray | None = None, default_length: int = 10
) -> str:
    """Get the numpy dtype string associated with an arrow column.

    The length is taken from the ``lsst::arrow::len::<name>`` metadata key
    when present, otherwise from the longest entry of ``numpy_column`` when
    that is given and non-empty, otherwise from ``default_length``.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Arrow table schema.
    name : `str`
        Column name.
    numpy_column : `numpy.ndarray`, optional
        Column to determine numpy string dtype.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    dtype_str : `str`
        Numpy dtype string: ``U<len>`` for arrow string columns and
        ``|S<len>`` otherwise.
    """
    length_key = f"lsst::arrow::len::{name}".encode("UTF-8")
    metadata = {} if schema.metadata is None else schema.metadata

    if length_key in metadata:
        # String/bytes length from header.
        strlen = int(schema.metadata[length_key])
    elif numpy_column is not None and len(numpy_column) > 0:
        strlen = max(len(row) for row in numpy_column)
    else:
        strlen = default_length

    if schema.field(name).type == pa.string():
        return f"U{strlen}"
    return f"|S{strlen}"
1028def _append_numpy_string_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None:
1029 """Append numpy string length keys to arrow metadata.
1031 All column types are handled, but the metadata is only modified for
1032 string and byte columns.
1034 Parameters
1035 ----------
1036 metadata : `dict` [`bytes`, `str`]
1037 Metadata dictionary; modified in place.
1038 name : `str`
1039 Column name.
1040 dtype : `np.dtype`
1041 Numpy dtype.
1042 """
1043 import numpy as np
1045 if dtype.type is np.str_:
1046 metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(dtype.itemsize // 4)
1047 elif dtype.type is np.bytes_:
1048 metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(dtype.itemsize)
1051def _append_numpy_multidim_metadata(metadata: dict[bytes, str], name: str, dtype: np.dtype) -> None:
1052 """Append numpy multi-dimensional shapes to arrow metadata.
1054 All column types are handled, but the metadata is only modified for
1055 multi-dimensional columns.
1057 Parameters
1058 ----------
1059 metadata : `dict` [`bytes`, `str`]
1060 Metadata dictionary; modified in place.
1061 name : `str`
1062 Column name.
1063 dtype : `np.dtype`
1064 Numpy dtype.
1065 """
1066 if len(dtype.shape) > 1:
1067 metadata[f"lsst::arrow::shape::{name}".encode("UTF-8")] = str(dtype.shape)
1070def _multidim_shape_from_metadata(metadata: dict[bytes, bytes], list_size: int, name: str) -> tuple[int, ...]:
1071 """Retrieve the shape from the metadata, if available.
1073 Parameters
1074 ----------
1075 metadata : `dict` [`bytes`, `bytes`]
1076 Metadata dictionary.
1077 list_size : `int`
1078 Size of the list datatype.
1079 name : `str`
1080 Column name.
1082 Returns
1083 -------
1084 shape : `tuple` [`int`]
1085 Shape associated with the column.
1087 Raises
1088 ------
1089 RuntimeError
1090 Raised if metadata is found but has incorrect format.
1091 """
1092 md_name = f"lsst::arrow::shape::{name}"
1093 if (encoded := md_name.encode("UTF-8")) in metadata:
1094 groups = re.search(r"\((.*)\)", metadata[encoded].decode("UTF-8"))
1095 if groups is None:
1096 raise RuntimeError("Illegal value found in metadata.")
1097 shape = tuple(int(x) for x in groups[1].split(",") if x != "")
1098 else:
1099 shape = (list_size,)
1101 return shape
def _schema_to_dtype_list(schema: pa.Schema) -> list[tuple[str, tuple[Any] | str]]:
    """Convert a pyarrow schema to a numpy dtype specification.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Input pyarrow schema.

    Returns
    -------
    dtype_list: `list` [`tuple`]
        A list with name, type pairs.
    """
    metadata = {} if schema.metadata is None else schema.metadata
    stringlike_types = (pa.string(), pa.binary())

    fields: list[Any] = []
    for name in schema.names:
        arrow_type = schema.field(name).type
        spec: Any
        if isinstance(arrow_type, pa.FixedSizeListType):
            # Fixed-size lists become sub-array dtypes of the element type.
            shape = _multidim_shape_from_metadata(metadata, arrow_type.list_size, name)
            spec = (arrow_type.value_type.to_pandas_dtype(), shape)
        elif arrow_type in stringlike_types:
            spec = _arrow_string_to_numpy_dtype(schema, name)
        else:
            spec = arrow_type.to_pandas_dtype()
        fields.append((name, spec))

    return fields
def _numpy_dtype_to_arrow_types(dtype: np.dtype) -> list[Any]:
    """Convert a numpy dtype to a list of arrow types.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype to convert.

    Returns
    -------
    type_list : `list` [`object`]
        List of ``(name, arrow_type)`` pairs, one per field; empty when
        the dtype has no named fields.
    """
    from math import prod

    import numpy as np

    field_names = dtype.names
    if field_names is None:
        return []

    type_list: list[Any] = []
    for name in field_names:
        field_dtype = dtype[name]
        arrow_type: Any
        if len(field_dtype.shape) == 0:
            arrow_type = pa.from_numpy_dtype(field_dtype.type)
        else:
            # Sub-array fields map to a fixed-size list holding the
            # flattened elements of the base dtype.
            base_dtype = cast(tuple[np.dtype, tuple[int, ...]], field_dtype.subdtype)[0]
            arrow_type = pa.list_(pa.from_numpy_dtype(base_dtype.type), prod(field_dtype.shape))
        type_list.append((name, arrow_type))

    return type_list
1169def _numpy_dict_to_dtype(numpy_dict: dict[str, np.ndarray]) -> tuple[np.dtype, int]:
1170 """Extract equivalent table dtype from dict of numpy arrays.
1172 Parameters
1173 ----------
1174 numpy_dict : `dict` [`str`, `numpy.ndarray`]
1175 Dict with keys as the column names, values as the arrays.
1177 Returns
1178 -------
1179 dtype : `numpy.dtype`
1180 dtype of equivalent table.
1181 rowcount : `int`
1182 Number of rows in the table.
1184 Raises
1185 ------
1186 ValueError if columns in numpy_dict have unequal numbers of rows.
1187 """
1188 import numpy as np
1190 dtype_list = []
1191 rowcount = 0
1192 for name, col in numpy_dict.items():
1193 if rowcount == 0:
1194 rowcount = len(col)
1195 if len(col) != rowcount:
1196 raise ValueError(f"Column {name} has a different number of rows.")
1197 if len(col.shape) == 1:
1198 dtype_list.append((name, col.dtype))
1199 else:
1200 dtype_list.append((name, (col.dtype, col.shape[1:])))
1201 dtype = np.dtype(dtype_list)
1203 return (dtype, rowcount)
def _numpy_style_arrays_to_arrow_arrays(
    dtype: np.dtype,
    rowcount: int,
    np_style_arrays: dict[str, np.ndarray] | np.ndarray | atable.Table,
    schema: pa.Schema,
) -> list[pa.Array]:
    """Convert numpy-style arrays to arrow arrays.

    Parameters
    ----------
    dtype : `numpy.dtype`
        Numpy dtype of input table/arrays.
    rowcount : `int`
        Number of rows in input table/arrays.
    np_style_arrays : `dict` [`str`, `np.ndarray`] or `np.ndarray`
            or `astropy.table.Table`
        Arrays to convert to arrow.
    schema : `pyarrow.Schema`
        Schema of arrow table.

    Returns
    -------
    arrow_arrays : `list` [`pyarrow.Array`]
        List of converted pyarrow arrays, one per field of ``dtype``;
        empty when ``dtype`` has no named fields.
    """
    import numpy as np

    arrow_arrays: list[pa.Array] = []
    if dtype.names is None:
        return arrow_arrays

    for name in dtype.names:
        dt = dtype[name]
        arrow_type = schema.field(name).type
        val: Any
        if len(dt.shape) > 0:
            # Multi-dimensional column: flatten and cut into one flat
            # array per row.
            if rowcount > 0:
                val = np.split(np_style_arrays[name].ravel(), rowcount)
            else:
                val = []
        else:
            val = np_style_arrays[name]

        try:
            arrow_arrays.append(pa.array(val, type=arrow_type))
        except pa.ArrowNotImplementedError:
            # ``val`` is a plain list for multi-dimensional columns and has
            # no ``dtype``; in that case the original error is re-raised
            # instead of being masked by an AttributeError.
            byteorder = getattr(getattr(val, "dtype", None), "byteorder", None)
            is_big_endian = (np.little_endian and byteorder == ">") or (
                not np.little_endian and byteorder == "="
            )
            if not is_big_endian:
                # This failed for some other reason so raise the exception.
                raise

            # We need to convert the array to little-endian.
            val2 = val.byteswap()
            val2.dtype = val2.dtype.newbyteorder("<")
            arrow_arrays.append(pa.array(val2, type=arrow_type))

    return arrow_arrays