Coverage for python/lsst/daf/butler/formatters/parquet.py: 14%
319 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-10-28 09:59 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "ParquetFormatter",
26 "arrow_to_pandas",
27 "arrow_to_astropy",
28 "arrow_to_numpy",
29 "arrow_to_numpy_dict",
30 "pandas_to_arrow",
31 "pandas_to_astropy",
32 "astropy_to_arrow",
33 "numpy_to_arrow",
34 "numpy_to_astropy",
35 "numpy_dict_to_arrow",
36 "arrow_schema_to_pandas_index",
37 "DataFrameSchema",
38 "ArrowAstropySchema",
39 "ArrowNumpySchema",
40)
42import collections.abc
43import itertools
44import json
45import re
46from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
48import pyarrow as pa
49import pyarrow.parquet as pq
50from lsst.daf.butler import Formatter
51from lsst.utils.introspection import get_full_type_name
52from lsst.utils.iteration import ensure_iterable
54if TYPE_CHECKING:
55 import astropy.table as atable
56 import numpy as np
57 import pandas as pd
class ParquetFormatter(Formatter):
    """Interface for reading and writing Arrow Table objects to and from
    Parquet files.
    """

    extension = ".parq"

    def read(self, component: Optional[str] = None) -> Any:
        # Docstring inherited from Formatter.read.
        schema = pq.read_schema(self.fileDescriptor.location.path)

        # schema.metadata is None (not an empty dict) when the file carries
        # no metadata; normalize once so every lookup below is safe.
        metadata = schema.metadata if schema.metadata is not None else {}

        if component in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return schema
        elif component == "rowcount":
            # Get the rowcount from the metadata if possible, otherwise count.
            if b"lsst::arrow::rowcount" in metadata:
                return int(metadata[b"lsst::arrow::rowcount"])

            # No rowcount metadata; read a single column and count the rows.
            temp_table = pq.read_table(
                self.fileDescriptor.location.path,
                columns=[schema.names[0]],
                use_threads=False,
                use_pandas_metadata=False,
            )

            return len(temp_table[schema.names[0]])

        par_columns = None
        if self.fileDescriptor.parameters:
            par_columns = self.fileDescriptor.parameters.pop("columns", None)
            if par_columns:
                has_pandas_multi_index = False
                if b"pandas" in metadata:
                    md = json.loads(metadata[b"pandas"])
                    if len(md["column_indexes"]) > 1:
                        has_pandas_multi_index = True

                if not has_pandas_multi_index:
                    # Ensure uniqueness, keeping order.
                    par_columns = list(dict.fromkeys(ensure_iterable(par_columns)))
                    # Columns starting with "__" are internal (e.g. serialized
                    # pandas index columns) and are never user-selectable.
                    file_columns = [name for name in schema.names if not name.startswith("__")]

                    for par_column in par_columns:
                        if par_column not in file_columns:
                            raise ValueError(
                                f"Column {par_column} specified in parameters not available in parquet file."
                            )
                else:
                    par_columns = _standardize_multi_index_columns(schema, par_columns)

            if len(self.fileDescriptor.parameters):
                raise ValueError(
                    f"Unsupported parameters {self.fileDescriptor.parameters} in ArrowTable read."
                )

        arrow_table = pq.read_table(
            self.fileDescriptor.location.path,
            columns=par_columns,
            use_threads=False,
            use_pandas_metadata=(b"pandas" in metadata),
        )

        return arrow_table

    def write(self, inMemoryDataset: Any) -> None:
        # Docstring inherited from Formatter.write.
        import numpy as np
        from astropy.table import Table as astropyTable

        arrow_table = None
        if isinstance(inMemoryDataset, pa.Table):
            # This will be the most likely match.
            arrow_table = inMemoryDataset
        elif isinstance(inMemoryDataset, astropyTable):
            arrow_table = astropy_to_arrow(inMemoryDataset)
        elif isinstance(inMemoryDataset, np.ndarray):
            arrow_table = numpy_to_arrow(inMemoryDataset)
        else:
            if hasattr(inMemoryDataset, "to_parquet"):
                # This may be a pandas DataFrame
                try:
                    import pandas as pd
                except ImportError:
                    pd = None

                if pd is not None and isinstance(inMemoryDataset, pd.DataFrame):
                    arrow_table = pandas_to_arrow(inMemoryDataset)

        if arrow_table is None:
            raise ValueError(
                f"Unsupported type {get_full_type_name(inMemoryDataset)} of "
                "inMemoryDataset for ParquetFormatter."
            )

        location = self.makeUpdatedLocation(self.fileDescriptor.location)

        pq.write_table(arrow_table, location.path)
def arrow_to_pandas(arrow_table: pa.Table) -> pd.DataFrame:
    """Convert a pyarrow table to a pandas DataFrame.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert.  Any ``pandas`` metadata present in
        the schema is used when building the ``DataFrame``.

    Returns
    -------
    dataframe : `pandas.DataFrame`
    """
    # Single-threaded conversion keeps results deterministic.
    dataframe = arrow_table.to_pandas(use_threads=False)
    return dataframe
def arrow_to_astropy(arrow_table: pa.Table) -> atable.Table:
    """Convert a pyarrow table to an `astropy.Table`.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`
        Input arrow table to convert.  If the table has astropy unit
        metadata in the schema it will be used in the construction
        of the ``astropy.Table``.

    Returns
    -------
    table : `astropy.Table`
    """
    from astropy.table import Table

    # Build the table from plain numpy columns first, then re-attach any
    # astropy-specific metadata (units, descriptions, etc.).
    table = Table(arrow_to_numpy_dict(arrow_table))

    schema_metadata = arrow_table.schema.metadata
    _apply_astropy_metadata(table, schema_metadata if schema_metadata is not None else {})

    return table
def arrow_to_numpy(arrow_table: pa.Table) -> np.ndarray:
    """Convert a pyarrow table to a structured numpy array.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`

    Returns
    -------
    array : `numpy.ndarray` (N,)
        Numpy array table with N rows and the same column names
        as the input arrow table.
    """
    import numpy as np

    numpy_dict = arrow_to_numpy_dict(arrow_table)

    # Assemble a structured dtype from the per-column dtypes, preserving
    # column order.
    dtype = [(name, column.dtype) for name, column in numpy_dict.items()]

    return np.rec.fromarrays(numpy_dict.values(), dtype=dtype)
def arrow_to_numpy_dict(arrow_table: pa.Table) -> Dict[str, np.ndarray]:
    """Convert a pyarrow table to a dict of numpy arrays.

    Parameters
    ----------
    arrow_table : `pyarrow.Table`

    Returns
    -------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
    """
    schema = arrow_table.schema
    string_types = (pa.string(), pa.binary())

    numpy_dict: Dict[str, np.ndarray] = {}
    for name in schema.names:
        column = arrow_table[name].to_numpy()

        # String/binary columns come back as object arrays; cast them to
        # fixed-width numpy string dtypes.
        if schema.field(name).type in string_types:
            column = column.astype(_arrow_string_to_numpy_dtype(schema, name, column))

        numpy_dict[name] = column

    return numpy_dict
def numpy_to_arrow(np_array: np.ndarray) -> pa.Table:
    """Convert a numpy array table to an arrow table.

    Parameters
    ----------
    np_array : `numpy.ndarray`

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    names = np_array.dtype.names
    type_list = [(name, pa.from_numpy_dtype(np_array.dtype[name].type)) for name in names]

    # Record the row count, plus string lengths for any string/bytes
    # columns, in the schema metadata.
    metadata = {b"lsst::arrow::rowcount": str(len(np_array))}
    for name in names:
        _append_numpy_string_metadata(metadata, name, np_array.dtype[name])

    schema = pa.schema(type_list, metadata=metadata)

    columns = [pa.array(np_array[name]) for name in names]
    return pa.Table.from_arrays(columns, schema=schema)
def numpy_dict_to_arrow(numpy_dict: Dict[str, np.ndarray]) -> pa.Table:
    """Convert a dict of numpy arrays to an arrow table.

    Parameters
    ----------
    numpy_dict : `dict` [`str`, `numpy.ndarray`]
        Dict with keys as the column names, values as the arrays.
        All arrays must have the same length.

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    type_list = [(name, pa.from_numpy_dtype(col.dtype.type)) for name, col in numpy_dict.items()]

    md = {}
    # Row count from the first column; next(iter(...)) avoids building a
    # throwaway key list just to fetch one element.
    md[b"lsst::arrow::rowcount"] = str(len(next(iter(numpy_dict.values()))))

    for name, col in numpy_dict.items():
        _append_numpy_string_metadata(md, name, col.dtype)

    schema = pa.schema(type_list, metadata=md)

    arrays = [pa.array(col) for col in numpy_dict.values()]
    arrow_table = pa.Table.from_arrays(arrays, schema=schema)

    return arrow_table
def astropy_to_arrow(astropy_table: atable.Table) -> pa.Table:
    """Convert an astropy table to an arrow table.

    Parameters
    ----------
    astropy_table : `astropy.Table`

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    from astropy.table import meta

    names = astropy_table.dtype.names
    type_list = [(name, pa.from_numpy_dtype(astropy_table.dtype[name].type)) for name in names]

    md = {b"lsst::arrow::rowcount": str(len(astropy_table))}

    for name, column in astropy_table.columns.items():
        _append_numpy_string_metadata(md, name, column.dtype)

    # Serialize the astropy table metadata (units, descriptions, etc.)
    # as yaml so it can be restored on read.
    md[b"table_meta_yaml"] = "\n".join(meta.get_yaml_from_table(astropy_table))

    schema = pa.schema(type_list, metadata=md)

    arrays = [pa.array(column) for column in astropy_table.itercols()]
    return pa.Table.from_arrays(arrays, schema=schema)
def pandas_to_arrow(dataframe: pd.DataFrame, default_length: int = 10) -> pa.Table:
    """Convert a pandas dataframe to an arrow table.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    arrow_table : `pyarrow.Table`
    """
    import numpy as np
    import pandas as pd

    arrow_table = pa.Table.from_pandas(dataframe)

    # Augment the pandas-generated metadata with our own keys.
    metadata = arrow_table.schema.metadata

    metadata[b"lsst::arrow::rowcount"] = str(arrow_table.num_rows)

    if not isinstance(dataframe.columns, pd.MultiIndex):
        for name in dataframe.columns:
            if dataframe[name].dtype.type is not np.object_:
                continue
            # Record the maximum string length of object (string) columns,
            # falling back to the default for empty columns.
            strlen = max(
                (len(row) for row in dataframe[name].values), default=default_length
            )
            metadata[f"lsst::arrow::len::{name}".encode("UTF-8")] = str(strlen)

    return arrow_table.replace_schema_metadata(metadata)
def pandas_to_astropy(dataframe: pd.DataFrame) -> atable.Table:
    """Convert a pandas dataframe to an astropy table, preserving indexes.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`

    Returns
    -------
    astropy_table : `astropy.table.Table`

    Raises
    ------
    ValueError
        Raised if the dataframe columns are a multi-index.
    """
    import pandas as pd
    from astropy.table import Table

    if isinstance(dataframe.columns, pd.MultiIndex):
        raise ValueError("Cannot convert a multi-index dataframe to an astropy table.")

    # index=True carries the dataframe index over as a table column.
    return Table.from_pandas(dataframe, index=True)
def numpy_to_astropy(np_array: np.ndarray) -> atable.Table:
    """Convert a numpy table to an astropy table.

    Parameters
    ----------
    np_array : `numpy.ndarray`

    Returns
    -------
    astropy_table : `astropy.table.Table`
    """
    from astropy.table import Table

    # copy=False wraps the existing array rather than duplicating it.
    return Table(data=np_array, copy=False)
def arrow_schema_to_pandas_index(schema: pa.Schema) -> pd.Index | pd.MultiIndex:
    """Convert an arrow schema to a pandas index/multiindex.

    Parameters
    ----------
    schema : `pyarrow.Schema`

    Returns
    -------
    index : `pandas.Index` or `pandas.MultiIndex`
    """
    import pandas as pd

    # schema.metadata may be None when no metadata was written at all.
    metadata = schema.metadata if schema.metadata is not None else {}

    if b"pandas" in metadata:
        md = json.loads(metadata[b"pandas"])
        indexes = md["column_indexes"]
        len_indexes = len(indexes)
    else:
        len_indexes = 0

    if len_indexes <= 1:
        # Columns whose names start with "__" are internal index columns.
        return pd.Index([name for name in schema.names if not name.startswith("__")])
    else:
        raw_columns = _split_multi_index_column_names(len_indexes, schema.names)
        return pd.MultiIndex.from_tuples(raw_columns, names=[f["name"] for f in indexes])
def arrow_schema_to_column_list(schema: pa.Schema) -> list[str]:
    """Convert an arrow schema to a list of string column names.

    Parameters
    ----------
    schema : `pyarrow.Schema`

    Returns
    -------
    column_list : `list` [`str`]
    """
    # A direct list() copy replaces the redundant identity comprehension.
    return list(schema.names)
class DataFrameSchema:
    """Wrapper class for a schema for a pandas DataFrame.

    Parameters
    ----------
    dataframe : `pandas.DataFrame`
        Dataframe to turn into a schema.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        # An all-False boolean selection keeps the column names, dtypes and
        # index type while dropping every row.
        self._schema = dataframe.loc[[False] * len(dataframe)]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> DataFrameSchema:
        """Convert an arrow schema into a `DataFrameSchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            The pyarrow schema to convert.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        # An empty row list yields a zero-row table carrying the full
        # schema.  (The original code multiplied [] by the column count,
        # which is always just [].)
        empty_table = pa.Table.from_pylist([], schema=schema)

        return cls(empty_table.to_pandas())

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        arrow_table = pa.Table.from_pandas(self._schema)

        return arrow_table.schema

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
        """
        return ArrowNumpySchema.from_arrow(self.to_arrow_schema())

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an ArrowAstropySchema.

        Returns
        -------
        arrow_astropy_schema : `ArrowAstropySchema`
        """
        return ArrowAstropySchema.from_arrow(self.to_arrow_schema())

    @property
    def schema(self) -> pd.DataFrame:
        # The stored schema is an empty DataFrame (the original annotation
        # of np.dtype was incorrect).
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, DataFrameSchema):
            return NotImplemented

        return self._schema.equals(other._schema)
class ArrowAstropySchema:
    """Wrapper class for a schema for an astropy table.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
    """

    def __init__(self, astropy_table: atable.Table) -> None:
        # A zero-row slice keeps column names, dtypes, units and metadata.
        self._schema = astropy_table[:0]

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowAstropySchema:
        """Convert an arrow schema into a ArrowAstropySchema.

        Parameters
        ----------
        schema : `pyarrow.Schema`

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
        """
        import numpy as np
        from astropy.table import Table

        string_types = (pa.string(), pa.binary())

        dtype = []
        for name in schema.names:
            field_type = schema.field(name).type
            if field_type in string_types:
                # String/bytes columns need a fixed-width numpy dtype.
                dtype.append(_arrow_string_to_numpy_dtype(schema, name))
            else:
                dtype.append(field_type.to_pandas_dtype())

        empty = np.zeros(0, dtype=list(zip(schema.names, dtype)))
        astropy_table = Table(data=empty)

        _apply_astropy_metadata(
            astropy_table, schema.metadata if schema.metadata is not None else {}
        )

        return cls(astropy_table)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to an arrow schema.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        return astropy_to_arrow(self._schema).schema

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a DataFrameSchema.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        return DataFrameSchema.from_arrow(astropy_to_arrow(self._schema).schema)

    def to_arrow_numpy_schema(self) -> ArrowNumpySchema:
        """Convert to an `ArrowNumpySchema`.

        Returns
        -------
        arrow_numpy_schema : `ArrowNumpySchema`
        """
        return ArrowNumpySchema.from_arrow(astropy_to_arrow(self._schema).schema)

    @property
    def schema(self) -> atable.Table:
        return self._schema

    def __repr__(self) -> str:
        return repr(self._schema)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowAstropySchema):
            return NotImplemented

        # Matching structured dtypes implies matching column names.
        if self._schema.dtype != other._schema.dtype:
            return False

        # Units, descriptions and formats must also agree column by column.
        for name in self._schema.columns:
            for attr in ("unit", "description", "format"):
                if getattr(self._schema[name], attr) != getattr(other._schema[name], attr):
                    return False

        return True
class ArrowNumpySchema:
    """Wrapper class for a schema for a numpy ndarray.

    Parameters
    ----------
    numpy_dtype : `numpy.dtype`
        Numpy dtype to convert.
    """

    def __init__(self, numpy_dtype: np.dtype) -> None:
        self._dtype = numpy_dtype

    @classmethod
    def from_arrow(cls, schema: pa.Schema) -> ArrowNumpySchema:
        """Convert an arrow schema into an `ArrowNumpySchema`.

        Parameters
        ----------
        schema : `pyarrow.Schema`
            Pyarrow schema to convert.

        Returns
        -------
        numpy_schema : `ArrowNumpySchema`
        """
        import numpy as np

        string_types = (pa.string(), pa.binary())

        dtype = []
        for name in schema.names:
            field_type = schema.field(name).type
            if field_type in string_types:
                # String/bytes columns need a fixed-width numpy dtype.
                dtype.append((name, _arrow_string_to_numpy_dtype(schema, name)))
            else:
                dtype.append((name, field_type.to_pandas_dtype()))

        return cls(np.dtype(dtype))

    def to_arrow_astropy_schema(self) -> ArrowAstropySchema:
        """Convert to an `ArrowAstropySchema`.

        Returns
        -------
        astropy_schema : `ArrowAstropySchema`
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return ArrowAstropySchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_dataframe_schema(self) -> DataFrameSchema:
        """Convert to a `DataFrameSchema`.

        Returns
        -------
        dataframe_schema : `DataFrameSchema`
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return DataFrameSchema.from_arrow(numpy_to_arrow(empty).schema)

    def to_arrow_schema(self) -> pa.Schema:
        """Convert to a `pyarrow.Schema`.

        Returns
        -------
        arrow_schema : `pyarrow.Schema`
        """
        import numpy as np

        empty = np.zeros(0, dtype=self._dtype)
        return numpy_to_arrow(empty).schema

    @property
    def schema(self) -> np.dtype:
        return self._dtype

    def __repr__(self) -> str:
        return repr(self._dtype)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, ArrowNumpySchema):
            return NotImplemented

        return self._dtype == other._dtype
def _split_multi_index_column_names(n: int, names: Iterable[str]) -> List[Sequence[str]]:
    """Split a string that represents a multi-index column.

    PyArrow maps Pandas' multi-index column names (which are tuples in
    Python) to flat strings on disk.  This routine exists to reconstruct
    the original tuple.

    Parameters
    ----------
    n : `int`
        Number of levels in the `pandas.MultiIndex` that is being
        reconstructed.
    names : `~collections.abc.Iterable` [`str`]
        Strings to be split.

    Returns
    -------
    column_names : `list` [`tuple` [`str`]]
        A list of multi-index column name tuples.
    """
    # For n=2 this builds the pattern \('(.*)', '(.*)'\), matching the
    # stringified tuple form written by pyarrow.
    inner = ", ".join(["'(.*)'"] * n)
    pattern = re.compile(rf"\({inner}\)")

    matches = (pattern.search(name) for name in names)
    return [m.groups() for m in matches if m is not None]
def _standardize_multi_index_columns(
    schema: pa.Schema, columns: Union[List[tuple], Dict[str, Union[str, List[str]]]]
) -> List[str]:
    """Transform a dictionary/iterable index from a multi-index column list
    into a string directly understandable by PyArrow.

    Parameters
    ----------
    schema : `pyarrow.Schema`
    columns : `list` [`tuple`] or `dict` [`str`, `str` or `list` [`str`]]

    Returns
    -------
    names : `list` [`str`]
        Stringified representation of a multi-index column name.
    """
    pd_index = arrow_schema_to_pandas_index(schema)
    index_level_names = tuple(pd_index.names)

    names = []

    if isinstance(columns, list):
        # A list must contain one tuple per requested column.
        for requested in columns:
            if not isinstance(requested, tuple):
                raise ValueError(
                    "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
                    f"Instead got a {get_full_type_name(requested)}."
                )
            names.append(str(requested))
        return names

    if not isinstance(columns, collections.abc.Mapping):
        raise ValueError(
            "Columns parameter for multi-index data frame must be a dictionary or list of tuples. "
            f"Instead got a {get_full_type_name(columns)}."
        )
    if not set(index_level_names).issuperset(columns.keys()):
        raise ValueError(
            f"Cannot use dict with keys {set(columns.keys())} "
            f"to select columns from {index_level_names}."
        )

    # Expand the dict into the cross-product of the requested (or, when a
    # level is omitted, all) values at each index level.
    factors = [
        ensure_iterable(columns.get(level, pd_index.levels[i]))
        for i, level in enumerate(index_level_names)
    ]
    for requested in itertools.product(*factors):
        for i, value in enumerate(requested):
            if value not in pd_index.levels[i]:
                raise ValueError(f"Unrecognized value {value!r} for index {index_level_names[i]!r}.")
        names.append(str(requested))

    return names
def _apply_astropy_metadata(astropy_table: atable.Table, metadata: Dict) -> None:
    """Apply any astropy metadata from the schema metadata.

    Parameters
    ----------
    astropy_table : `astropy.table.Table`
        Table to apply metadata; modified in place.
    metadata : `dict` [`bytes`]
        Metadata dict.
    """
    from astropy.table import meta

    meta_yaml = metadata.get(b"table_meta_yaml", None)
    if not meta_yaml:
        return

    header = meta.get_header_from_yaml(meta_yaml.decode("UTF8").split("\n"))

    # Set description, format, unit, meta from the column
    # metadata that was serialized with the table.
    header_cols = {entry["name"]: entry for entry in header["datatype"]}
    for col in astropy_table.columns.values():
        col_info = header_cols[col.name]
        for attr in ("description", "format", "unit", "meta"):
            if attr in col_info:
                setattr(col, attr, col_info[attr])
def _arrow_string_to_numpy_dtype(
    schema: pa.Schema, name: str, numpy_column: np.ndarray | None = None, default_length: int = 10
) -> str:
    """Get the numpy dtype string associated with an arrow column.

    Parameters
    ----------
    schema : `pyarrow.Schema`
        Arrow table schema.
    name : `str`
        Column name.
    numpy_column : `numpy.ndarray`, optional
        Column to determine numpy string dtype.
    default_length : `int`, optional
        Default string length when not in metadata or can be inferred
        from column.

    Returns
    -------
    dtype_str : `str`
        Numpy dtype string.
    """
    # Preference order for the string length: explicit metadata key,
    # then the longest value in the supplied column, then the default.
    metadata = schema.metadata if schema.metadata is not None else {}
    md_key = f"lsst::arrow::len::{name}".encode("UTF-8")

    if md_key in metadata:
        # String/bytes length from header.
        strlen = int(metadata[md_key])
    elif numpy_column is not None and len(numpy_column) > 0:
        strlen = max(len(row) for row in numpy_column)
    else:
        strlen = default_length

    # Unicode ("U") for arrow strings, raw bytes ("|S") otherwise.
    prefix = "U" if schema.field(name).type == pa.string() else "|S"
    return f"{prefix}{strlen}"
def _append_numpy_string_metadata(metadata: Dict[bytes, str], name: str, dtype: np.dtype) -> None:
    """Append numpy string length keys to arrow metadata.

    All column types are handled, but the metadata is only modified for
    string and byte columns.

    Parameters
    ----------
    metadata : `dict` [`bytes`, `str`]
        Metadata dictionary; modified in place.
    name : `str`
        Column name.
    dtype : `np.dtype`
        Numpy dtype.
    """
    import numpy as np

    key = f"lsst::arrow::len::{name}".encode("UTF-8")

    if dtype.type is np.str_:
        # Numpy unicode itemsize is 4 bytes per character.
        metadata[key] = str(dtype.itemsize // 4)
    elif dtype.type is np.bytes_:
        metadata[key] = str(dtype.itemsize)