Coverage for python/lsst/daf/butler/delegates/arrowtable.py: 31%

35 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-25 15:14 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading Arrow tables.""" 

23from __future__ import annotations 

24 

25from collections.abc import Mapping 

26from typing import Any 

27 

28import pyarrow as pa 

29from lsst.daf.butler import StorageClassDelegate 

30from lsst.utils.introspection import get_full_type_name 

31from lsst.utils.iteration import ensure_iterable 

32 

33__all__ = ["ArrowTableDelegate"] 

34 

35 

class ArrowTableDelegate(StorageClassDelegate):
    """Delegate that understands the ``ArrowTable`` storage class."""

    # Type that in-memory datasets must be an instance of; checked by
    # ``handleParameters`` before any parameter is applied.
    _datasetType = pa.Table

    def getComponent(self, composite: pa.Table, componentName: str) -> Any:
        """Get a component from an Arrow table.

        Parameters
        ----------
        composite : `~pyarrow.Table`
            Arrow table to access component.
        componentName : `str`
            Name of component to retrieve. One of ``"columns"``,
            ``"schema"`` or ``"rowcount"``.

        Returns
        -------
        component : `object`
            The table schema for ``"columns"`` and ``"schema"``; the
            number of rows for ``"rowcount"``.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return composite.schema
        if componentName == "rowcount":
            # Ask the table directly rather than measuring its first column,
            # so that a table without any columns does not raise IndexError.
            return composite.num_rows

        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(self, inMemoryDataset: Any, parameters: Mapping[str, Any] | None = None) -> Any:
        """Apply parameters to the in-memory dataset.

        Only the ``columns`` parameter is acted upon; it selects a subset
        of the table columns.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to modify. Must be an instance of ``_datasetType``.
        parameters : `~collections.abc.Mapping` [`str`, `object`], optional
            Parameters to apply. If `None`, or if there is no ``columns``
            key, the dataset is returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            The input dataset itself if no column selection was requested,
            otherwise the result of selecting the requested columns
            (duplicates removed, request order preserved).

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not an instance of
            ``_datasetType`` or if a requested column is not in the dataset.
        NotImplementedError
            Raised if a requested column name is not a string.
        """
        if not isinstance(inMemoryDataset, self._datasetType):
            raise ValueError(
                f"inMemoryDataset must be a {get_full_type_name(self._datasetType)} and "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        read_columns = list(ensure_iterable(parameters["columns"]))

        # The set of available columns does not change while validating, so
        # compute it once instead of once per requested column.
        available_columns = set(self._getColumns(inMemoryDataset))
        for column in read_columns:
            if not isinstance(column, str):
                raise NotImplementedError(
                    "InMemoryDataset of an Arrow Table only supports string column names."
                )
            if column not in available_columns:
                raise ValueError(f"Unrecognized column name {column!r}.")

        # Ensure uniqueness, keeping order.
        read_columns = list(dict.fromkeys(read_columns))

        return self._selectColumns(inMemoryDataset, read_columns)

    def _getColumns(self, inMemoryDataset: pa.Table) -> list[str]:
        """Get the column names from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `~pyarrow.Table`
            Dataset to extract columns.

        Returns
        -------
        columns : `list` [`str`]
            List of columns, as reported by the table schema.
        """
        return inMemoryDataset.schema.names

    def _selectColumns(self, inMemoryDataset: pa.Table, columns: list[str]) -> pa.Table:
        """Select a subset of columns from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `~pyarrow.Table`
            Dataset to extract columns.
        columns : `list` [`str`]
            List of columns to extract.

        Returns
        -------
        subDataset : `~pyarrow.Table`
            Subselection of inMemoryDataset.
        """
        return inMemoryDataset.select(columns)