Coverage for python/lsst/daf/butler/delegates/dataframe.py: 21%

42 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-02-07 10:26 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading DataFrames.""" 

23from __future__ import annotations 

24 

25import collections.abc 

26from typing import Any, Mapping, Optional 

27 

28import pandas 

29from lsst.daf.butler import StorageClassDelegate 

30from lsst.daf.butler.formatters.parquet import DataFrameSchema 

31from lsst.utils.introspection import get_full_type_name 

32from lsst.utils.iteration import ensure_iterable 

33 

34__all__ = ["DataFrameDelegate"] 

35 

36 

class DataFrameDelegate(StorageClassDelegate):
    """Delegate for in-memory `~pandas.DataFrame` datasets.

    Supports retrieving the ``columns``, ``rowcount`` and ``schema``
    components and applying the ``columns`` parameter to select a subset
    of columns.
    """

    def getComponent(self, composite: pandas.DataFrame, componentName: str) -> Any:
        """Get a component from a DataFrame.

        Parameters
        ----------
        composite : `~pandas.DataFrame`
            ``DataFrame`` to access component.
        componentName : `str`
            Name of component to retrieve. One of ``"columns"``,
            ``"rowcount"`` or ``"schema"``.

        Returns
        -------
        component : `object`
            The component.

        Raises
        ------
        AttributeError
            The component can not be found.
        """
        if componentName == "columns":
            # Index columns are reported alongside the regular columns.
            return pandas.Index(self._getAllColumns(composite))
        elif componentName == "rowcount":
            return len(composite)
        elif componentName == "schema":
            # A zero-row slice is passed so no row data is handed over.
            return DataFrameSchema(composite.iloc[:0])
        else:
            raise AttributeError(
                f"Do not know how to retrieve component {componentName} from {get_full_type_name(composite)}"
            )

    def handleParameters(
        self, inMemoryDataset: pandas.DataFrame, parameters: Optional[Mapping[str, Any]] = None
    ) -> Any:
        """Return possibly new in-memory dataset using the supplied parameters.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            Object to modify based on the parameters.
        parameters : `dict`, optional
            Parameters to apply. Values are specific to the parameter.
            Supported parameters are defined in the associated
            `StorageClass`. If no relevant parameters are specified the
            ``inMemoryDataset`` will be returned unchanged. Only the
            ``"columns"`` parameter is examined here.

        Returns
        -------
        inMemoryDataset : `object`
            Original in-memory dataset, or updated form after parameters
            have been used.

        Raises
        ------
        ValueError
            Raised if ``inMemoryDataset`` is not a `~pandas.DataFrame` or
            if a requested column name is not recognized.
        NotImplementedError
            Raised if the ``"columns"`` parameter is not iterable or
            contains an entry that is not a `str`.
        """
        if not isinstance(inMemoryDataset, pandas.DataFrame):
            raise ValueError(
                "handleParameters for a DataFrame must get a DataFrame, "
                f"not {get_full_type_name(inMemoryDataset)}."
            )

        if parameters is None or "columns" not in parameters:
            return inMemoryDataset

        requested = parameters["columns"]
        if not isinstance(requested, collections.abc.Iterable):
            raise NotImplementedError(
                "InMemoryDataset of a DataFrame only supports list/tuple of string column names"
            )

        # Materialize exactly once. A one-shot iterable (e.g. a generator)
        # would otherwise be exhausted by the validation loop below and the
        # subsequent selection would silently pick no columns at all.
        requestedColumns = list(ensure_iterable(requested))

        allColumns = self._getAllColumns(inMemoryDataset)
        for column in requestedColumns:
            if not isinstance(column, str):
                raise NotImplementedError(
                    "InMemoryDataset of a DataFrame only supports string column names."
                )
            if column not in allColumns:
                raise ValueError(f"Unrecognized column name {column!r}.")

        # Exclude index columns from the subset.
        indexNames = inMemoryDataset.index.names
        readColumns = [name for name in requestedColumns if name not in indexNames]
        # Ensure uniqueness, keeping order.
        readColumns = list(dict.fromkeys(readColumns))

        return inMemoryDataset[readColumns]

    def _getAllColumns(self, inMemoryDataset: pandas.DataFrame) -> list[str]:
        """Get all columns, including index columns.

        Parameters
        ----------
        inMemoryDataset : `~pandas.DataFrame`
            ``DataFrame`` whose column and index names are collected.

        Returns
        -------
        columns : `list` [`str`]
            List of all columns.
        """
        allColumns = list(inMemoryDataset.columns)
        # Only the first index level's name decides whether the index names
        # are appended; an index whose first name is `None` adds nothing.
        if inMemoryDataset.index.names[0] is not None:
            allColumns.extend(inMemoryDataset.index.names)

        return allColumns

137 if inMemoryDataset.index.names[0] is not None: 

138 allColumns.extend(inMemoryDataset.index.names) 

139 

140 return allColumns