Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

from __future__ import with_statement 

from __future__ import print_function 

from future import standard_library 

standard_library.install_aliases() 

from builtins import str 

from builtins import range 

import numpy as np 

from io import BytesIO 

from sqlalchemy import (types as satypes, Column, Table, Index, 

create_engine, MetaData) 

import string 

import random 

 

 

def np_to_sql_type(input_type):
    """
    Returns the SQL data type (as encoded by sqlalchemy)
    corresponding to a numpy dtype

    Parameters
    ----------
    input_type is an element of a numpy.dtype array (the dtype of one column)

    Returns
    -------
    A sqlalchemy type instance:
    Float(precision=16) for any dtype whose name starts with 'float',
    BIGINT for 'int64', Integer for 'int32', and String(length=itemsize)
    for string dtypes

    Raises
    ------
    RuntimeError if input_type matches none of the cases above
    """
    name = input_type.name
    dtype_str = str(input_type)

    if name.startswith('float'):
        return satypes.Float(precision=16)
    if name == 'int64':
        return satypes.BIGINT()
    if name == 'int32':
        return satypes.Integer()

    # String columns are recognised either by the dtype name or by the
    # dtype's string form ('S10' / '|S10'); the SQL column is sized from
    # the dtype's itemsize.
    if name.startswith('str') or dtype_str.startswith(('S', '|S')):
        return satypes.String(length=input_type.itemsize)

    raise RuntimeError("Do not know how to map %s to SQL" % dtype_str)

 

 

# from http://stackoverflow.com/questions/2257441/python-random-string-generation-with-upper-case-letters-and-digits 

def id_generator(size=8, chars=string.ascii_lowercase):
    """
    Build a random identifier.

    Returns a string of `size` characters, each drawn independently with
    random.choice from `chars` (lowercase ASCII letters by default).
    """
    picks = [random.choice(chars) for _ in range(size)]
    return ''.join(picks)

 

 

def make_engine(dbAddress):
    """
    create and connect to a database engine

    dbAddress is passed to sqlalchemy's create_engine (with echo=False).

    Returns the tuple (engine, metadata), where metadata is a sqlalchemy
    MetaData object constructed with bind=engine.
    """
    db_engine = create_engine(dbAddress, echo=False)
    return db_engine, MetaData(bind=db_engine)

 

 

def guessDtype(dataPath, numGuess, delimiter, **kwargs):
    """
    Guess the numpy dtype of a delimited text file from its leading lines.

    Parameters
    ----------
    dataPath is the path of the text file to inspect
    numGuess is the number of lines read from the top of the file
    (the first of them supplies the column names)
    delimiter is the column separator handed to numpy.genfromtxt
    kwargs are passed through to numpy.genfromtxt

    Returns
    -------
    The dtype of the array numpy.genfromtxt builds from those lines
    (called with dtype=None and names=True)
    """
    head = []
    with open(dataPath) as fh:
        # readline() returns '' past end-of-file, so short files are fine
        while len(head) < numGuess:
            head.append(fh.readline())

    sample = BytesIO(''.join(head).encode())
    parsed = np.genfromtxt(sample, dtype=None, names=True,
                           delimiter=delimiter, **kwargs)
    return parsed.dtype

 

 

def createSQLTable(dtype, tableid, idCol, metadata):
    """
    create a sqlalchemy Table object.

    Parameters
    ----------
    dtype is a numpy dtype describing the columns in the table
    tableid is the name of the table to be created; if None, a name is
    obtained from id_generator()
    idCol is the name of the column to flag as the Table's primary key
    metadata is the sqlalchemy MetaData object associated with the database connection

    Returns
    -------
    A sqlalchemy Table object with the columns specified by dtype.
    metadata.create_all() is called before returning.
    """
    # One Column per named field; a dtype without fields yields no columns.
    columns = [
        Column(field, np_to_sql_type(dtype[field]),
               primary_key=(idCol == field))
        for field in (dtype.names or ())
    ]

    tableName = id_generator() if tableid is None else tableid
    table = Table(tableName, metadata, *columns)
    metadata.create_all()
    return table

 

 

def _records_from_text(text, dtype, delimiter, **kwargs):
    """Parse delimited text into a list of {column name: python scalar} dicts."""
    parsed = np.genfromtxt(BytesIO(text.encode()), dtype=dtype,
                           delimiter=delimiter, **kwargs)
    # A single input line parses to a 0-d array, which cannot be iterated;
    # promote it to 1-d so every chunk is handled the same way.
    parsed = np.atleast_1d(parsed)
    names = parsed.dtype.names
    # .item() converts numpy scalars to native python values for the DB driver
    return [dict((name, row[name].item()) for name in names)
            for row in parsed]


def loadTable(datapath, datatable, delimiter, dtype, engine,
              indexCols=(), skipLines=1, chunkSize=100000, **kwargs):
    """
    Load a delimited text file into an existing table, chunk by chunk.

    Parameters
    ----------
    datapath is the path of the text file to load
    datatable is the sqlalchemy Table object to insert into
    delimiter is the column separator handed to numpy.genfromtxt
    dtype is the numpy dtype describing the columns of the file
    engine is the object whose execute() method runs the inserts and on
    which the indexes are created
    indexCols is an iterable of index specifications; each entry is either
    a single column name or an iterable of column names (multi-column index)
    skipLines is the number of leading lines to skip (1 by default)
    chunkSize is the number of lines parsed and inserted per execute() call
    kwargs are passed through to numpy.genfromtxt

    Returns
    -------
    None
    """
    def insert_lines(lines):
        rows = _records_from_text(''.join(lines), dtype, delimiter, **kwargs)
        # Nothing to insert if the chunk held no data rows
        if rows:
            engine.execute(datatable.insert(), rows)

    with open(datapath) as fh:
        for _ in range(skipLines):
            fh.readline()

        pending = []
        chunkNum = 0
        for line in fh:
            pending.append(line)
            if len(pending) == chunkSize:
                chunkNum += 1
                print("Loading chunk #%i"%(chunkNum))
                insert_lines(pending)
                pending = []

        # Clean up the last (partial) chunk
        if pending:
            insert_lines(pending)

    for col in indexCols:
        # Strings are iterable too, so test for them explicitly: a plain
        # column name must not be split into characters.
        if isinstance(col, (str, bytes)) or not hasattr(col, "__iter__"):
            print("Creating index on %s"%(col))
            index = Index('%sidx'%col, datatable.c[col])
        else:
            names = list(col)
            print("Creating index on %s"%(",".join(names)))
            index = Index('%sidx'%''.join(names),
                          *[datatable.c[c] for c in names])

        index.create(engine)

 

 

def loadData(dataPath, dtype, delimiter, tableId, idCol, engine, metaData, numGuess, append=False, **kwargs):
    """
    Load a delimited text file into a database table, creating the table
    if it does not exist yet.

    Parameters
    ----------
    dataPath is the path of the text file to load
    dtype is the numpy dtype describing the columns; if None it is guessed
    with guessDtype from the first numGuess lines of the file
    delimiter is the column separator of the file
    tableId is the name of the table; if None a new table is created by
    createSQLTable
    idCol is the column used as primary key when the table is created
    engine is the sqlalchemy engine connected to the database
    metaData is the sqlalchemy MetaData object associated with engine
    numGuess is the number of lines used to guess dtype
    append must be True to load into a table that already exists
    kwargs are passed through to loadTable

    Returns
    -------
    The name of the table the data was loaded into

    Raises
    ------
    ValueError if append is True but tableId is None, or if the table
    already exists and append is False
    """
    if dtype is None:
        dtype = guessDtype(dataPath, numGuess, delimiter)

    tableExists = False
    if tableId is not None:
        # Close the probing connection once the check is done rather than
        # leaving it open.
        conn = engine.connect()
        try:
            tableExists = engine.dialect.has_table(conn, tableId)
        finally:
            conn.close()

    if append and tableId is None:
        raise ValueError("Cannot append if the table name is missing")
    if tableExists and not append:
        raise ValueError("Append is False but table exists")

    if tableExists:
        # Appending: reflect the existing table definition from the database
        dataTable = Table(tableId, metaData, autoload=True)
    else:
        dataTable = createSQLTable(dtype, tableId, idCol, metaData)

    loadTable(dataPath, dataTable, delimiter, dtype, engine, **kwargs)
    return dataTable.name