Coverage for python/lsst/daf/butler/delegates/arrowtable.py: 31%

35 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-15 09:13 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading Arrow tables.""" 

23from __future__ import annotations 

24 

25from collections.abc import Mapping 

26from typing import Any 

27 

28import pyarrow as pa 

29from lsst.daf.butler import StorageClassDelegate 

30from lsst.utils.introspection import get_full_type_name 

31from lsst.utils.iteration import ensure_iterable 

32 

33__all__ = ["ArrowTableDelegate"] 

34 

35 

class ArrowTableDelegate(StorageClassDelegate):
    """Delegate providing component access and read-parameter handling
    for `pyarrow.Table` datasets.
    """

    # Type that ``handleParameters`` requires the in-memory dataset to be.
    _datasetType = pa.Table

    def getComponent(self, composite: pa.Table, componentName: str) -> Any:
        """Get a component from an Arrow table.

        Parameters
        ----------
        composite : `~pyarrow.Table`
            Arrow table to access component.
        componentName : `str`
            Name of component to retrieve. Recognized names are
            ``"columns"``, ``"schema"`` and ``"rowcount"``.

        Returns
        -------
        component : `object`
            The component: the table schema for ``"columns"`` and
            ``"schema"``, or the number of rows for ``"rowcount"``.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return composite.schema
        elif componentName == "rowcount":
            # Ask the table for its row count directly instead of measuring
            # its first column, so a table with no columns does not fail
            # with an IndexError.
            return composite.num_rows

        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(self, inMemoryDataset: Any, parameters: Mapping[str, Any] | None = None) -> Any:
        """Apply read parameters to an in-memory dataset.

        Only the ``"columns"`` parameter is acted upon; any other keys in
        ``parameters`` are ignored by this method.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to modify; must be an instance of ``_datasetType``.
        parameters : `~collections.abc.Mapping` [`str`, `~typing.Any`] \
                or `None`, optional
            Parameters to apply. If `None`, or if there is no ``"columns"``
            key, the dataset is returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            The input dataset, or the subset of it holding only the
            requested columns (duplicates removed, request order kept).

        Raises
        ------
        ValueError
            ``inMemoryDataset`` is not of the expected type, or a requested
            column name is not present in the dataset.
        NotImplementedError
            A requested column name is not a `str`.
        """
        if not isinstance(inMemoryDataset, self._datasetType):
            raise ValueError(
                f"inMemoryDataset must be a {get_full_type_name(self._datasetType)} and "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        read_columns = list(ensure_iterable(parameters["columns"]))

        # Look up the available names once rather than once per requested
        # column; a frozenset keeps each membership test cheap.
        available_columns = frozenset(self._getColumns(inMemoryDataset))

        for column in read_columns:
            if not isinstance(column, str):
                raise NotImplementedError(
                    "InMemoryDataset of an Arrow Table only supports string column names."
                )
            if column not in available_columns:
                raise ValueError(f"Unrecognized column name {column!r}.")

        # Ensure uniqueness, keeping order.
        read_columns = list(dict.fromkeys(read_columns))

        return self._selectColumns(inMemoryDataset, read_columns)

    def _getColumns(self, inMemoryDataset: pa.Table) -> list[str]:
        """Get the column names from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to extract columns.

        Returns
        -------
        columns : `list` [`str`]
            List of columns.
        """
        return inMemoryDataset.schema.names

    def _selectColumns(self, inMemoryDataset: pa.Table, columns: list[str]) -> pa.Table:
        """Select a subset of columns from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to extract columns.
        columns : `list` [`str`]
            List of columns to extract.

        Returns
        -------
        subDataset : `object`
            Subselection of inMemoryDataset.
        """
        return inMemoryDataset.select(columns)