Coverage for python/lsst/daf/butler/delegates/dataframe.py: 23%

46 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-15 09:13 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading DataFrames.""" 

23from __future__ import annotations 

24 

25import collections.abc 

26from collections.abc import Mapping 

27from typing import Any 

28 

29import pandas 

30from lsst.daf.butler import StorageClassDelegate 

31from lsst.daf.butler.formatters.parquet import DataFrameSchema 

32from lsst.utils.introspection import get_full_type_name 

33from lsst.utils.iteration import ensure_iterable 

34 

35from ..formatters.parquet import _standardize_multi_index_columns 

36 

37__all__ = ["DataFrameDelegate"] 

38 

39 

class DataFrameDelegate(StorageClassDelegate):
    """Storage class delegate for in-memory `~pandas.DataFrame` datasets.

    Provides component extraction (``columns``, ``rowcount``, ``schema``)
    and application of the ``columns`` read parameter to a DataFrame that
    is already in memory.
    """

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. Supported values are
            ``"columns"``, ``"rowcount"`` and ``"schema"``.

        Returns
        -------
        component : `object`
            The component. For ``"columns"`` this is the column
            `~pandas.MultiIndex` itself when the DataFrame has multi-index
            columns, otherwise a `~pandas.Index` of all column names
            including any named index columns. For ``"rowcount"`` this is
            the number of rows. For ``"schema"`` this is a
            ``DataFrameSchema`` built from a zero-row slice of the
            DataFrame.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            if isinstance(composite.columns, pandas.MultiIndex):
                return composite.columns
            return pandas.Index(self._getAllColumns(composite))
        elif componentName == "rowcount":
            return len(composite)
        elif componentName == "schema":
            # Only the structure is needed, so hand over an empty slice
            # rather than the full table.
            return DataFrameSchema(composite.iloc[:0])
        else:
            raise AttributeError(
                f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
            )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Mapping[str, Any] | None = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not a `~pandas.DataFrame`, or
            if a requested column name is not a column or named index of
            the DataFrame.
        NotImplementedError
            Raised if the ``columns`` parameter is not iterable, or (for
            DataFrames without multi-index columns) contains a column name
            that is not a `str`.
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        # "columns" is the only parameter handled here; anything else leaves
        # the dataset untouched.
        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        requested = parameters["columns"]
        if not isinstance(requested, collections.abc.Iterable):
            raise NotImplementedError(
                "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
            )

        if isinstance(inMemoryDataset.columns, pandas.MultiIndex):
            # We have a multi-index dataframe which needs special handling;
            # the helper is responsible for interpreting the request.
            readColumns = _standardize_multi_index_columns(
                inMemoryDataset.columns,
                requested,
                stringify=False,
            )
        else:
            # Materialize the request exactly once.  It is walked twice below
            # (validation, then selection) and a one-shot iterable such as a
            # generator would otherwise be exhausted by the validation pass,
            # silently producing an empty selection.
            requestedNames = list(ensure_iterable(requested))
            knownColumns = set(self._getAllColumns(inMemoryDataset))

            for column in requestedNames:
                if not isinstance(column, str):
                    raise NotImplementedError(
                        "InMemoryDataset of a DataFrame only supports string column names."
                    )
                if column not in knownColumns:
                    raise ValueError(f"Unrecognized column name {column!r}.")

            # Exclude index columns from the subset.
            indexNames = inMemoryDataset.index.names
            readColumns = [name for name in requestedNames if name not in indexNames]

        # Ensure uniqueness, keeping order.
        readColumns = list(dict.fromkeys(readColumns))

        return inMemoryDataset[readColumns]

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            DataFrame to inspect.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns. The index names are appended after the
            regular columns, but only when the first index name is not
            `None`.
        """
        allColumns = list(inMemoryDataset.columns)
        # Only the first index name is tested; if it is set, every index
        # name is appended.
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns