Coverage for python/lsst/daf/butler/formatters/parquet.py: 14%
317 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-15 02:03 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
# Public API of this module. ``arrow_schema_to_column_list`` is a public
# helper defined below and is now exported alongside the other converters.
__all__ = (
    "ParquetFormatter",
    "arrow_to_pandas",
    "arrow_to_astropy",
    "arrow_to_numpy",
    "arrow_to_numpy_dict",
    "pandas_to_arrow",
    "pandas_to_astropy",
    "astropy_to_arrow",
    "numpy_to_arrow",
    "numpy_to_astropy",
    "numpy_dict_to_arrow",
    "arrow_schema_to_pandas_index",
    "arrow_schema_to_column_list",
    "DataFrameSchema",
    "ArrowAstropySchema",
    "ArrowNumpySchema",
)
42import collections.abc
43import itertools
44import json
45import re
46from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
48import pyarrow as pa
49import pyarrow.parquet as pq
50from lsst.daf.butler import Formatter
51from lsst.utils.introspection import get_full_type_name
52from lsst.utils.iteration import ensure_iterable
54if TYPE_CHECKING: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true
55 import astropy.table as atable
56 import numpy as np
57 import pandas as pd
class ParquetFormatter(Formatter):
    """Interface for reading and writing Arrow Table objects to and from
    Parquet files.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        # Read only the schema first; full table data is loaded lazily below
        # so that "columns"/"schema"/"rowcount" components avoid a full read.
        schema = pq.read_schema(self.fileDescriptor.location.path)

        if component in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return schema
        elif component == "rowcount":
            # Get the rowcount from the metadata if possible, otherwise count.
            # NOTE(review): assumes schema.metadata is not None here — confirm
            # all files written by this formatter carry metadata.
            if b"lsst::arrow::rowcount" in schema.metadata:
                return int(schema.metadata[b"lsst::arrow::rowcount"])

            # Fall back to reading a single column and counting its rows.
            temp_table = pq.read_table(
                self.fileDescriptor.location.path,
                columns=[schema.names[0]],
                use_threads=False,
                use_pandas_metadata=False,
            )

            return len(temp_table[schema.names[0]])

        # Column selection from read parameters, validated against the file.
        par_columns = None
        if self.fileDescriptor.parameters:
            par_columns = self.fileDescriptor.parameters.pop("columns", None)
            if par_columns:
                has_pandas_multi_index = False
                # NOTE(review): assumes schema.metadata is not None when
                # parameters are supplied — confirm against writers.
                if b"pandas" in schema.metadata:
                    md = json.loads(schema.metadata[b"pandas"])
                    if len(md["column_indexes"]) > 1:
                        has_pandas_multi_index = True

                if not has_pandas_multi_index:
                    # Ensure uniqueness, keeping order.
                    par_columns = list(dict.fromkeys(ensure_iterable(par_columns)))
                    # Columns starting with "__" are internal (e.g. pandas
                    # index columns) and not user-selectable.
                    file_columns = [name for name in schema.names if not name.startswith("__")]

                    for par_column in par_columns:
                        if par_column not in file_columns:
                            raise ValueError(
                                f"Column {par_column} specified in parameters not available in parquet file."
                            )
                else:
                    # Multi-index columns must be stringified to match the
                    # flattened on-disk names.
                    par_columns = _standardize_multi_index_columns(schema, par_columns)

            # "columns" was popped above; anything left over is unsupported.
            if len(self.fileDescriptor.parameters):
                raise ValueError(
                    f"Unsupported parameters {self.fileDescriptor.parameters} in ArrowTable read."
                )

        metadata = schema.metadata if schema.metadata is not None else {}
        arrow_table = pq.read_table(
            self.fileDescriptor.location.path,
            columns=par_columns,
            use_threads=False,
            use_pandas_metadata=(b"pandas" in metadata),
        )

        return arrow_table

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        import numpy as np
        from astropy.table import Table as astropyTable

        # Convert the in-memory dataset to an arrow table based on its type.
        arrow_table = None
        if isinstance(inMemoryDataset, pa.Table):
            # This will be the most likely match.
            arrow_table = inMemoryDataset
        elif isinstance(inMemoryDataset, astropyTable):
            arrow_table = astropy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, np.ndarray):
            arrow_table = numpy_to_arrow(inMemoryDataset)
        else:
            if hasattr(inMemoryDataset, "to_parquet"):
                # This may be a pandas DataFrame
                try:
                    import pandas as pd
                except ImportError:
                    pd = None

                if pd is not None and isinstance(inMemoryDataset, pd.DataFrame):
                    arrow_table = pandas_to_arrow(inMemoryDataset)

        if arrow_table is None:
            raise ValueError(
                f"Unsupported type {get_full_type_name(inMemoryDataset)} of "
                "inMemoryDataset for ParquetFormatter."
            )

        location = self.makeUpdatedLocation(self.fileDescriptor.location)

        pq.write_table(arrow_table, location.path)
def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:
    """Convert a pyarrow table to a pandas DataFrame.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has ``pandas`` metadata
        in the schema it will be used in the construction of the
        ``DataFrame``.

    Returns
    -------
    dataframe : `pandas.DataFrame`
    """
    # Single-threaded conversion for deterministic behavior.
    dataframe = arrow_table.to_pandas(use_threads=False)
    return dataframe
def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
    """Convert a pyarrow table to an `astropy.Table`.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert. If the table has astropy unit
        metadata in the schema it will be used in the construction
        of the ``astropy.Table``.

    Returns
    -------
    table : `astropy.Table`
    """
    from astropy.table import Table

    # Build the table from plain numpy columns, then restore any astropy
    # metadata (units, descriptions, formats) serialized in the schema.
    table = Table(arrow_to_numpy_dict(arrow_table))

    md = arrow_table.schema.metadata
    _apply_astropy_metadata(table, md if md is not None else {})

    return table
def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray:
    """Convert a pyarrow table to a structured numpy array.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and the same column names
        as the input arrow table.
    """
    import numpy as np

    numpy_dict = arrow_to_numpy_dict(arrow_table)

    # Assemble the structured dtype from the per-column dtypes.
    dtype = [(name, col.dtype) for name, col in numpy_dict.items()]

    return np.rec.fromarrays(numpy_dict.values(), dtype=dtype)
def arrow_to_numpy_dict(arrow_table: pa.Table) -> Dict[str, np.ndarray]:
    """Convert a pyarrow table to a dict of numpy arrays.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    schema = arrow_table.schema

    columns = {}
    for name in schema.names:
        array = arrow_table[name].to_numpy()

        # String/binary columns come back as object arrays; cast them to
        # fixed-width numpy string dtypes using the recorded lengths.
        if schema.field(name).type in (pa.string(), pa.binary()):
            array = array.astype(_arrow_string_to_numpy_dtype(schema, name, array))

        columns[name] = array

    return columns
def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
    """Convert a numpy array table to an arrow table.

    Parameters
    ----------
    np_array : `numpy.ndarray`

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    dtype = np_array.dtype
    fields = [(name, pa.from_numpy_dtype(dtype[name].type)) for name in dtype.names]

    # Record the row count plus string-length metadata for each column.
    metadata = {b"lsst::arrow::rowcount": str(len(np_array))}
    for name in dtype.names:
        _append_numpy_string_metadata(metadata, name, dtype[name])

    schema = pa.schema(fields, metadata=metadata)

    columns = [pa.array(np_array[name]) for name in dtype.names]
    return pa.Table.from_arrays(columns, schema=schema)
def numpy_dict_to_arrow(numpy_dict: Dict[str, np.ndarray]) -> pa.Table:
    """Convert a dict of numpy arrays to an arrow table.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    fields = [(name, pa.from_numpy_dtype(col.dtype.type)) for name, col in numpy_dict.items()]

    # Row count is taken from the first column; all columns are assumed to
    # share the same length.
    first_column = list(numpy_dict.values())[0]
    metadata = {b"lsst::arrow::rowcount": str(len(first_column))}
    for name, col in numpy_dict.items():
        _append_numpy_string_metadata(metadata, name, col.dtype)

    schema = pa.schema(fields, metadata=metadata)

    columns = [pa.array(col) for col in numpy_dict.values()]
    return pa.Table.from_arrays(columns, schema=schema)
def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
    """Convert an astropy table to an arrow table.

    Parameters
    ----------
    astropy_table : `astropy.Table`

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    from astropy.table import meta

    fields = [
        (name, pa.from_numpy_dtype(astropy_table.dtype[name].type)) for name in astropy_table.dtype.names
    ]

    # Row count plus per-column string-length metadata.
    metadata = {b"lsst::arrow::rowcount": str(len(astropy_table))}
    for name, col in astropy_table.columns.items():
        _append_numpy_string_metadata(metadata, name, col.dtype)

    # Serialize the astropy table metadata (units, descriptions, formats)
    # as YAML so it can be restored on read.
    metadata[b"table_meta_yaml"] = "\n".join(meta.get_yaml_from_table(astropy_table))

    schema = pa.schema(fields, metadata=metadata)

    columns = [pa.array(col) for col in astropy_table.itercols()]
    return pa.Table.from_arrays(columns, schema=schema)
def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Table:
    """Convert a pandas dataframe to an arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    arrow_table = pa.Table.from_pandas(dataframe)

    # Update the metadata
    md = arrow_table.schema.metadata

    md[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

    # We loop through the arrow table columns because the datatypes have
    # been checked and converted from pandas objects.
    for name in arrow_table.column_names:
        if not name.startswith("__"):
            if arrow_table[name].type == pa.string():
                # Use the maximum length of the valid (non-null) entries.
                # ``default=`` guards against the previous ValueError when
                # the column is empty or entirely null.
                strlen = max(
                    (len(row.as_py()) for row in arrow_table[name] if row.is_valid),
                    default=default_length,
                )
                md[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

    arrow_table = arrow_table.replace_schema_metadata(md)

    return arrow_table
def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
    """Convert a pandas dataframe to an astropy table, preserving indexes.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`

    Returns
    -------
    astropy_table : `astropy.table.Table`
    """
    import pandas as pd
    from astropy.table import Table

    # Multi-index columns have no astropy equivalent.
    columns = dataframe.columns
    if isinstance(columns, pd.MultiIndex):
        raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

    return Table.from_pandas(dataframe, index=True)
def numpy_to_astropy(np_array: np.ndarray) -> atable.Table:
    """Convert a numpy table to an astropy table.

    Parameters
    ----------
    np_array : `numpy.ndarray`

    Returns
    -------
    astropy_table : `astropy.table.Table`
    """
    from astropy.table import Table

    # Wrap the structured array without copying the data.
    table = Table(data=np_array, copy=False)
    return table
def arrow_schema_to_pandas_index(schema: pa.Schema) -> pd.Index | pd.MultiIndex:
    """Convert an arrow schema to a pandas index/multiindex.

    Parameters
    ----------
    schema : `pyarrow.Schema`

    Returns
    -------
    index : `pandas.Index` or `pandas.MultiIndex`
    """
    import pandas as pd

    # Guard against a schema with no metadata at all; previously
    # ``b"pandas" in schema.metadata`` raised TypeError when metadata was
    # None (every other function in this module applies this guard).
    metadata = schema.metadata if schema.metadata is not None else {}

    if b"pandas" in metadata:
        md = json.loads(metadata[b"pandas"])
        indexes = md["column_indexes"]
        len_indexes = len(indexes)
    else:
        len_indexes = 0

    if len_indexes <= 1:
        # Columns starting with "__" are internal (e.g. serialized pandas
        # index columns) and are excluded.
        return pd.Index([name for name in schema.names if not name.startswith("__")])
    else:
        # Reconstruct the multi-index tuples from the flattened names.
        raw_columns = _split_multi_index_column_names(len(indexes), schema.names)
        return pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
def arrow_schema_to_column_list(schema: pa.Schema) -> list[str]:
    """Convert an arrow schema to a list of string column names.

    Parameters
    ----------
    schema : `pyarrow.Schema`

    Returns
    -------
    column_list : `list` [`str`]
    """
    return list(schema.names)
class DataFrameSchema:
    """Wrapper class for a schema for a pandas DataFrame.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Dataframe to turn into a schema.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        # A zero-row slice keeps column names, dtypes, and index structure
        # while dropping all data.
        self._schema = dataframe.loc[[False] * len(dataframe)]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> DataFrameSchema:
        """Convert an arrow schema into a `DataFrameSchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            The pyarrow schema to convert.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        # A zero-row table with this schema carries all the type
        # information.  (The previous ``[] * len(schema.names)`` was always
        # just ``[]``.)
        empty_table = pa.Table.from_pylist([], schema=schema)

        return cls(empty_table.to_pandas())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        arrow_table = pa.Table.from_pandas(self._schema)

        return arrow_table.schema

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
        """
        return ArrowNumpySchema.from_arrow(self.to_arrow_schema())

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an ArrowAstropySchema.

        Returns
        -------
        arrow_astropy_schema : `ArrowAstropySchema`
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    @property
    def schema(self) -> pd.DataFrame:
        # The zero-row dataframe holding the schema.  (Annotation fixed:
        # this was previously declared ``np.dtype`` but has always returned
        # a ``pandas.DataFrame``.)
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataFrameSchema):
            return NotImplemented

        return self._schema.equals(other._schema)
class ArrowAstropySchema:
    """Wrapper class for a schema for an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
    """

    def __init__(self, astropy_table: atable.Table) -> None:
        # A zero-row slice keeps column names, dtypes, units, and other
        # per-column metadata while dropping all data.
        self._schema = astropy_table[:0]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowAstropySchema:
        """Convert an arrow schema into a ArrowAstropySchema.

        Parameters
        ----------
        schema : `pyarrow.Schema`

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
        """
        import numpy as np
        from astropy.table import Table

        # Build a numpy dtype per column; string/binary columns need a
        # fixed-width dtype derived from the schema metadata.
        dtype = []
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                dtype.append(schema.field(name).type.to_pandas_dtype())
                continue

            dtype.append(_arrow_string_to_numpy_dtype(schema, name))

        # Zero-row structured array carries the full column type info.
        data = np.zeros(0, dtype=list(zip(schema.names, dtype)))

        astropy_table = Table(data=data)

        metadata = schema.metadata if schema.metadata is not None else {}

        # Restore units/descriptions/formats serialized in the metadata.
        _apply_astropy_metadata(astropy_table, metadata)

        return cls(astropy_table)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        return astropy_to_arrow(self._schema).schema

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a DataFrameSchema.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        return DataFrameSchema.from_arrow(astropy_to_arrow(self._schema).schema)

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
        """
        return ArrowNumpySchema.from_arrow(astropy_to_arrow(self._schema).schema)

    @property
    def schema(self) -> atable.Table:
        # The zero-row astropy table holding the schema.
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowAstropySchema):
            return NotImplemented

        # If this comparison passes then the two tables have the
        # same column names.
        if self._schema.dtype != other._schema.dtype:
            return False

        # dtype equality does not cover per-column metadata; compare unit,
        # description, and format explicitly.
        for name in self._schema.columns:
            if not self._schema[name].unit == other._schema[name].unit:
                return False
            if not self._schema[name].description == other._schema[name].description:
                return False
            if not self._schema[name].format == other._schema[name].format:
                return False

        return True
class ArrowNumpySchema:
    """Wrapper class for a schema for a numpy ndarray.

    Parameters
    ----------
    numpy_dtype : `numpy.dtype`
        Numpy dtype to convert.
    """

    def __init__(self, numpy_dtype: np.dtype) -> None:
        self._dtype = numpy_dtype

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowNumpySchema:
        """Convert an arrow schema into an `ArrowNumpySchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Pyarrow schema to convert.

        Returns
        -------
        numpy_schema : `ArrowNumpySchema`
        """
        import numpy as np

        fields = []
        for name in schema.names:
            arrow_type = schema.field(name).type
            if arrow_type in (pa.string(), pa.binary()):
                # Fixed-width string/bytes dtype from the schema metadata.
                fields.append((name, _arrow_string_to_numpy_dtype(schema, name)))
            else:
                fields.append((name, arrow_type.to_pandas_dtype()))

        return cls(np.dtype(fields))

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
        """
        return ArrowAstropySchema.from_arrow(self._empty_arrow_schema())

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a `DataFrameSchema`.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        return DataFrameSchema.from_arrow(self._empty_arrow_schema())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to a `pyarrow.Schema`.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        return self._empty_arrow_schema()

    def _empty_arrow_schema(self) -> pa.Schema:
        # Round-trip a zero-row array of this dtype through numpy_to_arrow
        # to obtain the equivalent arrow schema.
        import numpy as np

        return numpy_to_arrow(np.zeros(0, dtype=self._dtype)).schema

    @property
    def schema(self) -> np.dtype:
        return self._dtype

    def __repr__(self) -> str:
        return repr(self._dtype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowNumpySchema):
            return NotImplemented

        return self._dtype == other._dtype
def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequence[str]]:
    """Split a string that represents a multi-index column.

    PyArrow maps Pandas' multi-index column names (which are tuples in Python)
    to flat strings on disk. This routine exists to reconstruct the original
    tuple.

    Parameters
    ----------
    n : `int`
        Number of levels in the `pandas.MultiIndex` that is being
        reconstructed.
    names : `~collections.abc.Iterable` [`str`]
        Strings to be split.

    Returns
    -------
    column_names : `list` [`tuple` [`str`]]
        A list of multi-index column name tuples.
    """
    # Pattern like \('(.*)', '(.*)'\) with one capture group per level.
    pattern = re.compile(r"\({}\)".format(", ".join(["'(.*)'"] * n)))

    column_names: List[Sequence[str]] = []
    for name in names:
        match = pattern.search(name)
        # Names that do not look like stringified tuples are skipped.
        if match is not None:
            column_names.append(match.groups())

    return column_names
def _standardize_multi_index_columns(
    schema: pa.Schema, columns: Union[List[tuple], Dict[str, Union[str, List[str]]]]
) -> List[str]:
    """Transform a dictionary/iterable index from a multi-index column list
    into a string directly understandable by PyArrow.

    Parameters
    ----------
    schema : `pyarrow.Schema`
    columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]

    Returns
    -------
    names : `list` [`str`]
        Stringified representation of a multi-index column name.
    """
    pd_index = arrow_schema_to_pandas_index(schema)
    index_level_names = tuple(pd_index.names)

    names = []

    if isinstance(columns, list):
        # Each entry must already be a complete multi-index tuple; it is
        # stringified to match the flattened on-disk column name.
        for requested in columns:
            if not isinstance(requested, tuple):
                raise ValueError(
                    "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                    f"Instead got a {get_full_type_name(requested)}."
                )
            names.append(str(requested))
        return names

    if not isinstance(columns, collections.abc.Mapping):
        raise ValueError(
            "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
            f"Instead got a {get_full_type_name(columns)}."
        )
    if not set(index_level_names).issuperset(columns.keys()):
        raise ValueError(
            f"Cannot use dict with keys {set(columns.keys())} "
            f"to select columns from {index_level_names}."
        )

    # Levels not constrained by the dict allow all of their values; the
    # requested columns are the cartesian product across levels.
    factors = [
        ensure_iterable(columns.get(level, pd_index.levels[i]))
        for i, level in enumerate(index_level_names)
    ]
    for requested in itertools.product(*factors):
        for i, value in enumerate(requested):
            if value not in pd_index.levels[i]:
                raise ValueError(f"Unrecognized value {value!r} for index {index_level_names[i]!r}.")
        names.append(str(requested))

    return names
def _apply_astropy_metadata(astropy_table: atable.Table, metadata: Dict) -> None:
    """Apply any astropy metadata from the schema metadata.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Table to apply metadata.
    metadata : `dict` [`bytes`]
        Metadata dict.
    """
    from astropy.table import meta

    meta_yaml = metadata.get(b"table_meta_yaml", None)
    if not meta_yaml:
        # Nothing serialized for this table; leave it untouched.
        return

    yaml_lines = meta_yaml.decode("UTF8").split("\n")
    meta_hdr = meta.get_header_from_yaml(yaml_lines)

    # Set description, format, unit, meta from the column
    # metadata that was serialized with the table.
    header_cols = {x["name"]: x for x in meta_hdr["datatype"]}
    for col in astropy_table.columns.values():
        col_info = header_cols[col.name]
        for attr in ("description", "format", "unit", "meta"):
            if attr in col_info:
                setattr(col, attr, col_info[attr])
def _arrow_string_to_numpy_dtype(
    schema: pa.Schema, name: str, numpy_column: np.ndarray | None = None, default_length: int = 10
) -> str:
    """Get the numpy dtype string associated with an arrow column.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Arrow table schema.
    name : `str`
        Column name.
    numpy_column : `numpy.ndarray`, optional
        Column to determine numpy string dtype.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    dtype_str : `str`
        Numpy dtype string.
    """
    # Special-case for string and binary columns
    metadata = schema.metadata if schema.metadata is not None else {}
    key = f"lsst::arrow::len::{name}".encode("UTF-8")

    if key in metadata:
        # String/bytes length from header.
        strlen = int(metadata[key])
    elif numpy_column is not None and len(numpy_column) > 0:
        # Infer the length from the longest entry in the column.
        strlen = max(len(row) for row in numpy_column)
    else:
        strlen = default_length

    prefix = "U" if schema.field(name).type == pa.string() else "|S"
    return f"{prefix}{strlen}"
def _append_numpy_string_metadata(metadata: Dict[bytes, str], name: str, dtype: np.dtype) -> None:
    """Append numpy string length keys to arrow metadata.

    All column types are handled, but only the metadata is only modified for
    string and byte columns.

    Parameters
    ----------
    metadata : `dict` [`bytes`, `str`]
        Metadata dictionary; modified in place.
    name : `str`
        Column name.
    dtype : `np.dtype`
        Numpy dtype.
    """
    import numpy as np

    key = f"lsst::arrow::len::{name}".encode("UTF-8")
    if dtype.type is np.str_:
        # Unicode dtypes use 4 bytes per character.
        metadata[key] = str(dtype.itemsize // 4)
    elif dtype.type is np.bytes_:
        metadata[key] = str(dtype.itemsize)