22 """Tools to help you iterate over a set of repositories. 24 Helpful while creating them or harvesting data from them. 26 from __future__
import absolute_import, division, print_function
27 from builtins
import zip
28 from builtins
import object
37 def _getDTypeList(keyTuple, valTuple):
38 """Construct a numpy dtype for a data ID or repository ID 40 @param[in] keyTuple: ID key names, in order 41 @param[in] valTuple: a value tuple 42 @return numpy dtype as a list 44 @warning: this guesses at string length (STR_PADDING + length of string in valTuple); 45 longer strings will be truncated when inserted into numpy structured arrays 48 for name, val
in zip(keyTuple, valTuple):
49 if isinstance(val, str):
50 predLen = len(val) + STR_PADDING
51 typeList.append((name, str, predLen))
53 typeList.append((name, numpy.array([val]).dtype))
58 """Accumulate a set of measurements from a set of source tables 61 - specify the desired source measurements when constructing this object 62 - call addSourceMetrics for each repository you harvest data from 63 - call finalize to produce the final data 65 Data available after calling finalize: 66 - self.sourceArr: a numpy structured array of shape (num repositories, num sources) 67 containing named columns for: 70 - each item of data extracted from the source table 71 - self.sourceIdDict: a dict of (source ID: index of axis 1 of self.sourceArr) 72 - self.repoArr: a numpy structured array of shape (num repositories,) 73 containing a named column for each repository key (see RepositoryIterator) 75 @note: sources that had non-finite data (e.g. NaN) for every value extracted are silently omitted 78 def __init__(self, datasetType, sourceKeyTuple):
80 @param[in] datasetType: dataset type for source 81 @param[in] sourceKeyTuple: list of keys of data items to extract from the source tables 83 @raise RuntimeError if sourceKeyTuple is empty 85 if len(sourceKeyTuple) < 1:
86 raise RuntimeError(
"Must specify at least one key in sourceKeyTuple")
105 def _getSourceMetrics(self, idKeyTuple, idValList, sourceTableList):
106 """Obtain the desired source measurements from a list of source tables 108 Extracts a set of source measurements (specified by sourceKeyTuple) from a list of source tables 109 (one per data ID) and saves them as a dict of source ID: list of data 111 @param[in] idKeyTuple: a tuple of data ID keys; must be the same for each call 112 @param[in] idValList: a list of data ID value tuples; 113 each tuple contains values in the order in idKeyTuple 114 @param[in] sourceTableList: a list of source tables, one per entry in idValList 116 @return a dict of source id: data id tuple + source data tuple 117 where source data tuple order matches sourceKeyTuple 118 and data id tuple matches self._idKeyTuple (which is set from the first idKeyTuple) 120 @raise RuntimeError if idKeyTuple is different than it was for the first call. 122 GetRepositoryDataTask.run returns idKeyTuple and idValList; you can easily make 123 a subclass of GetRepositoryDataTask that also returns sourceTableList. 125 Updates instance variables: 126 - self._idKeyTuple if not already set. 131 valTuple=idValList[0])
134 raise RuntimeError(
"idKeyTuple = %s != %s = first idKeyTuple; must be the same each time" %
138 for idTuple, sourceTable
in zip(idValList, sourceTableList):
139 if len(sourceTable) == 0:
142 idList = sourceTable.get(
"id")
149 transposedDataList = list(zip(*dataList))
152 dataDict.update((srcId, idTuple + tuple(data))
153 for srcId, data
in zip(idList, transposedDataList))
157 """Accumulate source measurements from a list of source tables. 159 Once you have accumulated all source measurements, call finalize to process the data. 161 @param[in] repoInfo: a RepositoryInfo instance 162 @param[in] idKeyTuple: a tuple of data ID keys; must be the same for each call 163 @param[in] idValList: a list of data ID value tuples; 164 each tuple contains values in the order in idKeyTuple 165 @param[in] sourceTableList: a list of source tables, one per entry in idValList 167 @raise RuntimeError if idKeyTuple is different than it was for the first call. 169 Accumulates the data in temporary cache self._tempDataList. 171 @return number of sources 184 """Process the accumulated source measurements to create the final data products. 186 Only call this after you have added all source metrics using addSourceMetrics. 188 Reads temporary cache self._tempDataList and then deletes it. 191 raise RuntimeError(
"No data found")
195 fullSrcIdSet.update(iter(dataIdDict.keys()))
202 sourceData = [[(srcId,) + srcDataDict.get(srcId, nullSourceTuple)
for srcId
in fullSrcIdSet]
205 self.
sourceArr = numpy.array(sourceData, dtype=sourceArrDType)
208 self.
sourceIdDict = dict((srcId, i)
for i, srcId
in enumerate(fullSrcIdSet))
211 repoData = [repoInfo.valTuple
for repoInfo
in self.
repoInfoList]
218 """Information about one data repository 220 Constructed by RepositoryIterator and used by SourceData. 223 def __init__(self, keyTuple, valTuple, dtype, name):
224 if len(keyTuple) != len(valTuple):
225 raise RuntimeError(
"lengths of keyTuple=%s and valTuple=%s do not match" % (keyTuple, valTuple))
233 """Iterate over a set of data repositories that use a naming convention based on parameter values 237 """Construct a repository iterator from a dict of name: valueList 239 @param[in] formatStr: format string using dictionary notation, e.g.: "%(foo)s_%(bar)d" 240 @param[in] **dataDict: name=valueList pairs 243 self.
_keyTuple = tuple(sorted(dataDict.keys()))
249 """Retrieve next RepositoryInfo object 252 valDict = dict(zip(self.
_keyTuple, valTuple))
253 name = self.
format(valDict)
257 """Return the number of items in the iterator""" 264 """Return formatted string for a specified value dictionary 266 @param[in] valDict: a dict of key: value pairs that identify a repository 271 """Return a tuple of keys in the same order as items in value tuples 275 def _getDTypeList(self):
276 """Get a dtype for a structured array of repository keys
def _getSourceMetrics(self, idKeyTuple, idValList, sourceTableList)
def __init__(self, keyTuple, valTuple, dtype, name)
def addSourceMetrics(self, repoInfo, idKeyTuple, idValList, sourceTableList)
def __init__(self, datasetType, sourceKeyTuple)
def __init__(self, formatStr, dataDict)
def format(self, valDict)