Coverage for python/lsst/daf/butler/delegates/dataframe.py: 23%

46 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-27 09:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Support for reading DataFrames.""" 

29from __future__ import annotations 

30 

31import collections.abc 

32from collections.abc import Mapping 

33from typing import Any 

34 

35import pandas 

36from lsst.daf.butler import StorageClassDelegate 

37from lsst.daf.butler.formatters.parquet import DataFrameSchema 

38from lsst.utils.introspection import get_full_type_name 

39from lsst.utils.iteration import ensure_iterable 

40 

41from ..formatters.parquet import _standardize_multi_index_columns 

42 

43__all__ = ["DataFrameDelegate"] 

44 

45 

class DataFrameDelegate(StorageClassDelegate):
    """Delegate that understands the ``DataFrame`` storage class."""

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. Recognized names are
            ``"columns"``, ``"rowcount"`` and ``"schema"``.

        Returns
        -------
        component : `object`
            The component. For ``"columns"`` this is the frame's own
            `~pandas.MultiIndex` when the columns are multi-indexed, and
            otherwise a `~pandas.Index` built from all columns including any
            named index columns. For ``"rowcount"`` it is ``len(composite)``.
            For ``"schema"`` it is a ``DataFrameSchema`` built from a
            zero-row slice of the frame.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            if isinstance(composite.columns, pandas.MultiIndex):
                return composite.columns
            return pandas.Index(self._getAllColumns(composite))
        if componentName == "rowcount":
            return len(composite)
        if componentName == "schema":
            # A zero-row slice is passed so no row data goes to the schema.
            return DataFrameSchema(composite.iloc[:0])
        raise AttributeError(
            f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
        )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Mapping[str, Any] | None = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not a `~pandas.DataFrame`, or if
            a requested column name is not present in the DataFrame.
        NotImplementedError
            Raised if the ``columns`` parameter is not iterable, or (for a
            DataFrame without multi-index columns) contains a non-string
            column name.
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        requested = parameters["columns"]
        if not isinstance(requested, collections.abc.Iterable):
            raise NotImplementedError(
                "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
            )

        if isinstance(inMemoryDataset.columns, pandas.MultiIndex):
            # We have a multi-index dataframe which needs special handling.
            readColumns = _standardize_multi_index_columns(
                inMemoryDataset.columns,
                requested,
                stringify=False,
            )
        else:
            # Materialize the request once: it is walked twice below (validate,
            # then filter), and a single-pass iterable such as a generator
            # would otherwise be exhausted by validation and silently select
            # no columns.
            requestedColumns = list(ensure_iterable(requested))
            allColumns = self._getAllColumns(inMemoryDataset)

            for column in requestedColumns:
                if not isinstance(column, str):
                    raise NotImplementedError(
                        "InMemoryDataset of a DataFrame only supports string column names."
                    )
                if column not in allColumns:
                    raise ValueError(f"Unrecognized column name {column!r}.")

            # Exclude index columns from the subset.
            indexNames = inMemoryDataset.index.names
            readColumns = [name for name in requestedColumns if name not in indexNames]

        # Ensure uniqueness, keeping order.
        readColumns = list(dict.fromkeys(readColumns))

        return inMemoryDataset[readColumns]

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            DataFrame whose column and index names are collected.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns. The index names are appended after the
            regular columns, but only when the first index name is not
            `None`.
        """
        allColumns = list(inMemoryDataset.columns)
        # Only the first index name is inspected to decide whether the index
        # is named; when it is, every index name is appended.
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns

160 if inMemoryDataset.index.names[0] is not None: 

161 allColumns.extend(inMemoryDataset.index.names) 

162 

163 return allColumns