Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import numpy as np 

2from .baseMetric import BaseMetric 

3from collections import Counter 

4 

5__all__ = ['StringCountMetric'] 

6 

7 

class keylookerupper(object):
    """Callable that extracts one named entry from a metric result.

    Instances are meant to be used as reduce functions: calling one with a
    dictionary-like object (anything indexable by ``key``) returns the
    maximum of the values stored under that key.
    """

    def __init__(self, key='blank', name=None):
        """
        Parameters
        ----------
        key : str ('blank')
            Key (or field name) looked up in the object passed to the call.
        name : str (None)
            Stored as ``__name__`` on the instance so the callable carries a
            function-like name.
        """
        self.__name__ = name
        self.key = key

    def __call__(self, indict):
        """Return the maximum of ``indict[self.key]``."""
        selected = indict[self.key]
        return np.max(selected)

17 

18 

class StringCountMetric(BaseMetric):
    """Count up the number of times each string appears in a column.

    Dynamically builds reduce functions for each unique string value, so summary stats can be
    named the same as strings in the simData array without knowing the values of those strings
    ahead of time.
    """

    def __init__(self, metricName='stringCountMetric',
                 col='filter', percent=False, **kwargs):
        """
        Parameters
        ----------
        metricName : str ('stringCountMetric')
            Metric name, passed on to ``BaseMetric.__init__``.
        col : str ('filter')
            Column name that has strings to look at
        percent : bool (False)
            Normalize and return results as percents rather than raw count
        **kwargs
            Any further keyword arguments are forwarded to ``BaseMetric.__init__``.
        """
        units = 'percent' if percent else 'count'
        self.percent = percent
        cols = [col]
        super(StringCountMetric, self).__init__(cols, metricName, units=units,
                                                metricDtype=object, **kwargs)
        self.col = col

    def run(self, dataslice, slicePoint=None):
        """Count the distinct strings in ``dataslice[self.col]``.

        As a side effect, one reduce function per distinct string is registered in
        ``self.reduceFuncs`` / ``self.reduceOrder`` (assumed to be dict-like attributes
        provided by BaseMetric -- confirm against the base class).

        Parameters
        ----------
        dataslice : indexable by column name
            Data to count; ``dataslice[self.col]`` must be an iterable of strings.
        slicePoint : optional (None)
            Not used by this metric.

        Returns
        -------
        numpy.ndarray
            One-element structured array with a float field per distinct string, holding
            its count (or its percentage of the total when ``percent`` is True). The empty
            string is reported under the field name 'blank'.
        """
        # Numpy can't handle empty string as a field name, so '' is reported as 'blank'.
        # Counts are folded while renaming so that a column containing both '' and a
        # literal 'blank' yields a single field instead of a duplicate-name dtype error.
        counter = Counter()
        for value, count in Counter(dataslice[self.col]).items():
            counter['blank' if value == '' else value] += count

        # convert to a numpy array
        labels = list(counter)
        metricValue = np.zeros(1, dtype=list(zip(labels, [float] * len(labels))))
        for key, count in counter.items():
            metricValue[key] = count

        if self.percent:
            norm = sum(counter.values()) / 100.
            # Not sure I really like having to loop here, but the dtype is inflexible
            for key in metricValue.dtype.names:
                metricValue[key] = metricValue[key] / norm

        # Now to dynamically set up the reduce functions
        for i, key in enumerate(metricValue.dtype.names):
            self.reduceFuncs[key] = keylookerupper(key=key, name=key)
            self.reduceOrder[key] = i

        return metricValue