Coverage for python/lsst/daf/butler/delegates/arrowtable.py: 29%

34 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-17 09:33 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading Arrow tables.""" 

23from __future__ import annotations 

24 

25from typing import Any, Mapping, Optional 

26 

27import pyarrow as pa 

28from lsst.daf.butler import StorageClassDelegate 

29from lsst.utils.introspection import get_full_type_name 

30from lsst.utils.iteration import ensure_iterable 

31 

32__all__ = ["ArrowTableDelegate"] 

33 

34 

class ArrowTableDelegate(StorageClassDelegate):
    """Storage class delegate for in-memory Arrow tables.

    Provides component access (``columns``, ``schema``, ``rowcount``) and
    column sub-selection through the ``columns`` parameter.
    """

    # Type that ``handleParameters`` requires of the in-memory dataset.
    _datasetType = pa.Table

    def getComponent(self, composite: pa.Table, componentName: str) -> Any:
        """Get a component from an Arrow table.

        Parameters
        ----------
        composite : `~pyarrow.Table`
            Arrow table to access component.
        componentName : `str`
            Name of component to retrieve. Supported names are
            ``"columns"``, ``"schema"`` and ``"rowcount"``.

        Returns
        -------
        component : `object`
            The component. The table schema is returned for both
            ``"columns"`` and ``"schema"``; the number of rows is returned
            for ``"rowcount"``.

        Raises
        ------
        AttributeError
            The component cannot be found.
        """
        if componentName in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return composite.schema
        if componentName == "rowcount":
            # Ask the table directly rather than measuring its first
            # column, so that a table with no columns does not fail.
            return composite.num_rows

        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(self, inMemoryDataset: Any, parameters: Optional[Mapping[str, Any]] = None) -> Any:
        """Apply parameters to an in-memory dataset.

        Only the ``"columns"`` parameter is examined; any other keys in
        ``parameters`` are ignored by this method.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to modify. Must be an instance of ``_datasetType``.
        parameters : `~collections.abc.Mapping` [`str`, `object`], optional
            Parameters to apply. If `None`, or if there is no ``"columns"``
            key, the dataset is returned unchanged.

        Returns
        -------
        result : `object`
            The input dataset if no column selection was requested,
            otherwise the result of selecting the requested columns
            (duplicates removed, first-occurrence order preserved).

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not of the expected type, or if
            a requested column is not present in the dataset.
        NotImplementedError
            Raised if a requested column name is not a string.
        """
        if not isinstance(inMemoryDataset, self._datasetType):
            raise ValueError(
                f"inMemoryDataset must be a {get_full_type_name(self._datasetType)} and "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        read_columns = list(ensure_iterable(parameters["columns"]))

        # The available columns do not change while validating, so look
        # them up once instead of once per requested column. Skip the lookup
        # entirely when nothing was requested.
        available_columns = self._getColumns(inMemoryDataset) if read_columns else ()
        for column in read_columns:
            if not isinstance(column, str):
                raise NotImplementedError(
                    "InMemoryDataset of an Arrow Table only supports string column names."
                )
            if column not in available_columns:
                raise ValueError(f"Unrecognized column name {column!r}.")

        # Ensure uniqueness, keeping order.
        read_columns = list(dict.fromkeys(read_columns))

        return self._selectColumns(inMemoryDataset, read_columns)

    def _getColumns(self, inMemoryDataset: pa.Table) -> list[str]:
        """Get the column names from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `~pyarrow.Table`
            Dataset to extract columns.

        Returns
        -------
        columns : `list` [`str`]
            List of columns.
        """
        return inMemoryDataset.schema.names

    def _selectColumns(self, inMemoryDataset: pa.Table, columns: list[str]) -> pa.Table:
        """Select a subset of columns from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `~pyarrow.Table`
            Dataset to extract columns.
        columns : `list` [`str`]
            List of columns to extract.

        Returns
        -------
        subDataset : `~pyarrow.Table`
            Subselection of inMemoryDataset.
        """
        return inMemoryDataset.select(columns)