Coverage for python/lsst/daf/butler/delegates/dataframe.py: 22%

45 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-17 09:33 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading DataFrames.""" 

23from __future__ import annotations 

24 

25import collections.abc 

26from typing import Any, Mapping, Optional 

27 

28import pandas 

29from lsst.daf.butler import StorageClassDelegate 

30from lsst.daf.butler.formatters.parquet import DataFrameSchema 

31from lsst.utils.introspection import get_full_type_name 

32from lsst.utils.iteration import ensure_iterable 

33 

34from ..formatters.parquet import _standardize_multi_index_columns 

35 

36__all__ = ["DataFrameDelegate"] 

37 

38 

class DataFrameDelegate(StorageClassDelegate):
    """Delegate providing component access and parameter handling for
    `pandas.DataFrame` datasets.
    """

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. Recognized names are
            ``"columns"``, ``"rowcount"`` and ``"schema"``.

        Returns
        -------
        component : `object`
            The component. For ``"columns"`` this is the frame's own
            `pandas.MultiIndex` when the columns are multi-indexed, otherwise
            a `pandas.Index` built from `_getAllColumns`. For ``"rowcount"``
            it is ``len(composite)``. For ``"schema"`` it is a
            ``DataFrameSchema`` constructed from a zero-row slice of the
            frame.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            columnIndex = composite.columns
            if isinstance(columnIndex, pandas.MultiIndex):
                return columnIndex
            return pandas.Index(self._getAllColumns(composite))

        if componentName == "rowcount":
            return len(composite)

        if componentName == "schema":
            # A zero-row slice is passed so no row data is handed over.
            return DataFrameSchema(composite.iloc[:0])

        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Optional[Mapping[str, Any]] = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged. Only the
            ``"columns"`` parameter is acted upon here.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not a `pandas.DataFrame`, or if
            a requested column name is not recognized.
        NotImplementedError
            Raised if the ``"columns"`` parameter is not iterable, or (for
            frames without multi-index columns) contains a non-string entry.
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        requested = parameters["columns"]
        if not isinstance(requested, collections.abc.Iterable):
            raise NotImplementedError(
                "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
            )

        if isinstance(inMemoryDataset.columns, pandas.MultiIndex):
            # We have a multi-index dataframe which needs special handling.
            readColumns = _standardize_multi_index_columns(
                inMemoryDataset.columns,
                requested,
                stringify=False,
            )
        else:
            # Materialize the request exactly once: it is needed both for
            # validation and for building the selection, and a one-shot
            # iterable (e.g. a generator) would otherwise be exhausted by the
            # validation pass and silently select zero columns.
            requestedNames = list(ensure_iterable(requested))

            knownColumns = set(self._getAllColumns(inMemoryDataset))
            for column in requestedNames:
                if not isinstance(column, str):
                    raise NotImplementedError(
                        "InMemoryDataset of a DataFrame only supports string column names."
                    )
                if column not in knownColumns:
                    raise ValueError(f"Unrecognized column name {column!r}.")

            # Exclude index columns from the subset.
            indexNames = set(inMemoryDataset.index.names)
            readColumns = [name for name in requestedNames if name not in indexNames]

        # Ensure uniqueness, keeping order.
        readColumns = list(dict.fromkeys(readColumns))

        return inMemoryDataset[readColumns]

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            ``DataFrame`` whose column and index names are collected.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns. The index level names are appended after
            the regular columns, but only when the first index level has a
            name (i.e. it is not `None`).
        """
        allColumns = list(inMemoryDataset.columns)
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns