Coverage for python/lsst/daf/butler/delegates/arrowtable.py: 31%

35 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-25 10:50 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Support for reading Arrow tables.""" 

29from __future__ import annotations 

30 

31from collections.abc import Mapping 

32from typing import Any 

33 

34import pyarrow as pa 

35from lsst.daf.butler import StorageClassDelegate 

36from lsst.utils.introspection import get_full_type_name 

37from lsst.utils.iteration import ensure_iterable 

38 

39__all__ = ["ArrowTableDelegate"] 

40 

41 

class ArrowTableDelegate(StorageClassDelegate):
    """Delegate that understands the ``ArrowTable`` storage class."""

    # Type that ``handleParameters`` requires of the in-memory dataset.
    _datasetType = pa.Table

    def getComponent(self, composite: pa.Table, componentName: str) -> Any:
        """Get a component from an Arrow table.

        Parameters
        ----------
        composite : `~pyarrow.Table`
            Arrow table to access component.
        componentName : `str`
            Name of component to retrieve. One of ``"columns"``,
            ``"schema"`` or ``"rowcount"``.

        Returns
        -------
        component : `object`
            The table schema for ``"columns"`` and ``"schema"``, or the
            number of rows (`int`) for ``"rowcount"``.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName in ("columns", "schema"):
            # The schema will be translated to column format
            # depending on the input type.
            return composite.schema
        if componentName == "rowcount":
            # Ask the table directly rather than measuring its first
            # column, so that a table without any columns reports a row
            # count instead of failing with IndexError.
            return composite.num_rows

        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(self, inMemoryDataset: Any, parameters: Mapping[str, Any] | None = None) -> Any:
        """Apply parameters to an in-memory dataset.

        Only the ``columns`` parameter is acted upon; it selects a subset
        of the columns of the dataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            Dataset to modify; must be an instance of ``_datasetType``.
        parameters : `~collections.abc.Mapping` [`str`, `~typing.Any`] \
                or `None`, optional
            Parameters to apply. If `None`, or if there is no ``columns``
            key, the dataset is returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            The original dataset, or the subselection holding only the
            requested columns (duplicates removed, request order kept).

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not of the expected type or a
            requested column is not present in the dataset.
        NotImplementedError
            Raised if a requested column name is not a string.
        """
        if not isinstance(inMemoryDataset, self._datasetType):
            raise ValueError(
                f"inMemoryDataset must be a {get_full_type_name(self._datasetType)} and "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        read_columns = list(ensure_iterable(parameters["columns"]))

        # The available columns do not change while validating, so look
        # them up once rather than once per requested column.
        available_columns = self._getColumns(inMemoryDataset)
        for column in read_columns:
            if not isinstance(column, str):
                raise NotImplementedError(
                    "InMemoryDataset of an Arrow Table only supports string column names."
                )
            if column not in available_columns:
                raise ValueError(f"Unrecognized column name {column!r}.")

        # Ensure uniqueness, keeping order.
        read_columns = list(dict.fromkeys(read_columns))

        return self._selectColumns(inMemoryDataset, read_columns)

    def _getColumns(self, inMemoryDataset: pa.Table) -> list[str]:
        """Get the column names from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `~pyarrow.Table`
            Dataset to extract columns.

        Returns
        -------
        columns : `list` [`str`]
            List of columns, as named in the dataset's schema.
        """
        return inMemoryDataset.schema.names

    def _selectColumns(self, inMemoryDataset: pa.Table, columns: list[str]) -> pa.Table:
        """Select a subset of columns from the inMemoryDataset.

        Parameters
        ----------
        inMemoryDataset : `~pyarrow.Table`
            Dataset to extract columns.
        columns : `list` [`str`]
            List of columns to extract.

        Returns
        -------
        subDataset : `~pyarrow.Table`
            Subselection of inMemoryDataset.
        """
        return inMemoryDataset.select(columns)