Coverage for python/lsst/daf/butler/delegates/dataframe.py: 23%

47 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-04-14 09:22 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading DataFrames.""" 

23from __future__ import annotations 

24 

25import collections.abc 

26from typing import Any, Mapping, Optional 

27 

28import pandas 

29import pyarrow as pa 

30from lsst.daf.butler import StorageClassDelegate 

31from lsst.daf.butler.formatters.parquet import DataFrameSchema 

32from lsst.utils.introspection import get_full_type_name 

33from lsst.utils.iteration import ensure_iterable 

34 

35from ..formatters.parquet import _standardize_multi_index_columns 

36 

37__all__ = ["DataFrameDelegate"] 

38 

39 

class DataFrameDelegate(StorageClassDelegate):
    """Delegate for component access and parameter handling of
    `pandas.DataFrame` datasets.
    """

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. Supported names are
            ``"columns"``, ``"rowcount"`` and ``"schema"``.

        Returns
        -------
        component : `object`
            The component.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            # A multi-index is returned untouched; for a flat column index
            # the named row-index levels are reported as columns as well.
            if isinstance(composite.columns, pandas.MultiIndex):
                return composite.columns
            return pandas.Index(self._getAllColumns(composite))

        if componentName == "rowcount":
            return len(composite)

        if componentName == "schema":
            # The schema is built from a zero-row slice of the DataFrame.
            return DataFrameSchema(composite.iloc[:0])

        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Optional[Mapping[str, Any]] = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not a `~pandas.DataFrame` or if
            a requested column name is not present in the DataFrame.
        NotImplementedError
            Raised if the ``columns`` parameter is not iterable, or if a
            requested column of a non-multi-index DataFrame is not a `str`.
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        # "columns" is the only parameter acted upon here.
        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        requested = parameters["columns"]
        if not isinstance(requested, collections.abc.Iterable):
            raise NotImplementedError(
                "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
            )

        if isinstance(inMemoryDataset.columns, pandas.MultiIndex):
            # We have a multi-index dataframe which needs special handling.
            arrow_table = pa.Table.from_pandas(inMemoryDataset)
            readColumns = _standardize_multi_index_columns(
                arrow_table.schema,
                requested,
                stringify=False,
            )
        else:
            allColumns = self._getAllColumns(inMemoryDataset)

            # Materialize the request exactly once: it is walked twice below
            # (validation, then subsetting) and a one-shot iterable such as
            # a generator would otherwise be empty on the second pass.
            requestedNames = list(ensure_iterable(requested))

            for column in requestedNames:
                if not isinstance(column, str):
                    raise NotImplementedError(
                        "InMemoryDataset of a DataFrame only supports string column names."
                    )
                if column not in allColumns:
                    raise ValueError(f"Unrecognized column name {column!r}.")

            # Exclude index columns from the subset.
            indexNames = inMemoryDataset.index.names
            readColumns = [name for name in requestedNames if name not in indexNames]

        # Ensure uniqueness, keeping order.
        readColumns = list(dict.fromkeys(readColumns))

        return inMemoryDataset[readColumns]

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            DataFrame whose column and index names are collected.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns. The index names are appended only when
            the first index level has a name.
        """
        allColumns = list(inMemoryDataset.columns)
        # An index whose first level is unnamed contributes no column names.
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns

156 return allColumns