Coverage for python/lsst/daf/butler/delegates/dataframe.py: 23%

46 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-08-05 01:26 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading DataFrames.""" 

23from __future__ import annotations 

24 

25import collections.abc 

26from collections.abc import Mapping 

27from typing import Any 

28 

29import pandas 

30from lsst.daf.butler import StorageClassDelegate 

31from lsst.daf.butler.formatters.parquet import DataFrameSchema 

32from lsst.utils.introspection import get_full_type_name 

33from lsst.utils.iteration import ensure_iterable 

34 

35from ..formatters.parquet import _standardize_multi_index_columns 

36 

37__all__ = ["DataFrameDelegate"] 

38 

39 

class DataFrameDelegate(StorageClassDelegate):
    """Delegate that understands the ``DataFrame`` storage class."""

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. Supported names are
            ``"columns"``, ``"rowcount"`` and ``"schema"``.

        Returns
        -------
        component : `object`
            The component.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            if isinstance(composite.columns, pandas.MultiIndex):
                # Multi-index columns are handed back untouched.
                return composite.columns
            else:
                # Flat columns are reported together with any named index.
                return pandas.Index(self._getAllColumns(composite))
        elif componentName == "rowcount":
            return len(composite)
        elif componentName == "schema":
            # A zero-row slice is enough to build the schema object.
            return DataFrameSchema(composite.iloc[:0])
        else:
            raise AttributeError(
                f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
            )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Mapping[str, Any] | None = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not a `~pandas.DataFrame`, or
            if a requested column is not present in a single-index
            ``DataFrame``.
        NotImplementedError
            Raised if the ``columns`` parameter is not iterable, or if a
            requested column of a single-index ``DataFrame`` is not a
            string.
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        # "columns" is the only parameter acted upon here.
        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        requested = parameters["columns"]
        if not isinstance(requested, collections.abc.Iterable):
            raise NotImplementedError(
                "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
            )

        if isinstance(inMemoryDataset.columns, pandas.MultiIndex):
            # We have a multi-index dataframe which needs special handling.
            readColumns = _standardize_multi_index_columns(
                inMemoryDataset.columns,
                requested,
                stringify=False,
            )
        else:
            allColumns = self._getAllColumns(inMemoryDataset)

            # Materialize the request exactly once. The names are walked
            # twice below (validation, then selection), and a one-shot
            # iterable such as a generator would otherwise be exhausted by
            # the validation pass and silently select no columns at all.
            requestedNames = list(ensure_iterable(requested))

            for column in requestedNames:
                if not isinstance(column, str):
                    raise NotImplementedError(
                        "InMemoryDataset of a DataFrame only supports string column names."
                    )
                if column not in allColumns:
                    raise ValueError(f"Unrecognized column name {column!r}.")

            # Exclude index columns from the subset.
            readColumns = [name for name in requestedNames if name not in inMemoryDataset.index.names]

        # Ensure uniqueness, keeping order.
        readColumns = list(dict.fromkeys(readColumns))

        return inMemoryDataset[readColumns]

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            ``DataFrame`` whose column and index names are collected.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns.
        """
        allColumns = list(inMemoryDataset.columns)
        # Only the first index name is inspected to decide whether the
        # index names are appended.
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns