Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from __future__ import with_statement 

2from __future__ import print_function 

3from future import standard_library 

4standard_library.install_aliases() 

5from builtins import str 

6from builtins import range 

7import numpy as np 

8from io import BytesIO 

9from sqlalchemy import (types as satypes, Column, Table, Index, 

10 create_engine, MetaData) 

11import string 

12import random 

13 

14 

def np_to_sql_type(input_type):
    """
    Translate a single numpy field dtype into a sqlalchemy column type.

    input_type is an element of a numpy.dtype array

    Float dtypes map to Float(precision=16), int64 to BIGINT, int32 to
    Integer, and string dtypes to a String whose length is the dtype's
    itemsize.  Any other dtype raises RuntimeError.
    """
    dtype_name = input_type.name
    byte_width = input_type.itemsize

    if dtype_name.startswith('float'):
        return satypes.Float(precision=16)

    if dtype_name == 'int64':
        return satypes.BIGINT()

    if dtype_name == 'int32':
        return satypes.Integer()

    # String dtypes are recognised either by name or by their 'S' / '|S'
    # textual representation.
    looks_like_string = (dtype_name.startswith('str')
                         or str(input_type).startswith(('S', '|S')))
    if looks_like_string:
        return satypes.String(length=byte_width)

    raise RuntimeError("Do not know how to map %s to SQL" % str(input_type))

34 

35 

36# from http://stackoverflow.com/questions/2257441/python-random-string-generation-with-upper-case-letters-and-digits 

def id_generator(size=8, chars=string.ascii_lowercase):
    """Return a random string made of `size` characters picked from `chars`."""
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)

39 

40 

def make_engine(dbAddress):
    """
    Create a sqlalchemy engine for dbAddress (statement echoing disabled)
    together with a MetaData object bound to it.

    Returns the pair (engine, metadata).
    """
    db_engine = create_engine(dbAddress, echo=False)
    return db_engine, MetaData(bind=db_engine)

46 

47 

def guessDtype(dataPath, numGuess, delimiter, **kwargs):
    """
    Infer a numpy dtype for a delimited text file from its leading lines.

    The first numGuess lines of dataPath are read and handed to
    numpy.genfromtxt with dtype=None and names=True, split on delimiter.
    Any extra keyword arguments are forwarded to genfromtxt.

    Returns the dtype of the array genfromtxt produced.
    """
    head_lines = []
    remaining = numGuess
    with open(dataPath) as source:
        while remaining > 0:
            head_lines.append(source.readline())
            remaining -= 1

    # genfromtxt is fed an in-memory byte stream holding only the sampled lines.
    sample = BytesIO(''.join(head_lines).encode())
    parsed = np.genfromtxt(sample, dtype=None, names=True,
                           delimiter=delimiter, **kwargs)
    return parsed.dtype

57 

58 

def createSQLTable(dtype, tableid, idCol, metadata):
    """
    Build a sqlalchemy Table from a numpy dtype and create it in the database.

    Parameters
    ----------
    dtype is a numpy dtype describing the columns in the table
    tableid is the name of the table to be created; when it is None a
    name is produced by id_generator()
    idCol is the name of the column that becomes the Table's primary key
    metadata is the sqlalchemy MetaData object associated with the database connection

    Returns
    -------
    A sqlalchemy Table object with one column per field of dtype
    """
    # One Column per dtype field; only the field named idCol is the primary key.
    columns = [
        Column(field, np_to_sql_type(dtype[position]),
               primary_key=(idCol == field))
        for position, field in enumerate(dtype.names or ())
    ]

    table_name = id_generator() if tableid is None else tableid
    table = Table(table_name, metadata, *columns)
    metadata.create_all()
    return table

85 

86 

def _insert_chunk(lines, datatable, delimiter, dtype, engine, genfromtxt_kwargs):
    """Parse a list of raw text lines with genfromtxt and insert the rows."""
    parsed = np.genfromtxt(BytesIO(''.join(lines).encode()), dtype=dtype,
                           delimiter=delimiter, **genfromtxt_kwargs)
    # A single input row makes genfromtxt return a 0-d array, which cannot be
    # iterated; promote it so every chunk is handled the same way.
    parsed = np.atleast_1d(parsed)
    names = parsed.dtype.names
    # .item() converts numpy scalars to native Python values for the database
    # driver (np.asscalar did the same but no longer exists in current NumPy).
    rows = [dict((name, record[name].item()) for name in names)
            for record in parsed]
    # A chunk holding only blank lines yields no rows; do not send an empty
    # parameter list to the database.
    if rows:
        engine.execute(datatable.insert(), rows)


def loadTable(datapath, datatable, delimiter, dtype, engine,
              indexCols=(), skipLines=1, chunkSize=100000, **kwargs):
    """
    Bulk-load a delimited text file into an existing table, then build indexes.

    Parameters
    ----------
    datapath is the path of the text file to read
    datatable is the sqlalchemy Table receiving the rows
    delimiter is the column delimiter passed to numpy.genfromtxt
    dtype is the numpy dtype used to parse each row; its field names are
    used as the column names of the inserted rows
    engine is the sqlalchemy engine used to execute the inserts and to
    create the indexes
    indexCols is an iterable of index specifications; each entry is either
    a single column name or an iterable of column names (composite index)
    skipLines is the number of leading lines of the file to skip
    chunkSize is the number of lines parsed and inserted per batch
    kwargs are forwarded to numpy.genfromtxt

    Raises
    ------
    ValueError if chunkSize is smaller than 1
    """
    if chunkSize < 1:
        raise ValueError("chunkSize must be at least 1")

    with open(datapath) as fh:
        for _ in range(skipLines):
            fh.readline()

        pending = []
        chunk_number = 0
        for line in fh:
            pending.append(line)
            if len(pending) == chunkSize:
                chunk_number += 1
                print("Loading chunk #%i" % chunk_number)
                _insert_chunk(pending, datatable, delimiter, dtype, engine, kwargs)
                pending = []

        # Clean up the last (partial) chunk
        if pending:
            _insert_chunk(pending, datatable, delimiter, dtype, engine, kwargs)

    for col in indexCols:
        # Strings are iterable on Python 3, so they must be recognised
        # explicitly; otherwise a single column name would be split into
        # one "column" per character.
        if isinstance(col, (str, bytes)) or not hasattr(col, "__iter__"):
            print("Creating index on %s" % (col))
            index = Index('%sidx' % col, datatable.c[col])
        else:
            # Materialise once so generators can be used for both the
            # index name and the column lookup.
            col_names = tuple(col)
            print("Creating index on %s" % (",".join(col_names)))
            index = Index('%sidx' % ''.join(col_names),
                          *[datatable.c[c] for c in col_names])

        index.create(engine)

128 

129 

def loadData(dataPath, dtype, delimiter, tableId, idCol, engine, metaData, numGuess, append=False, **kwargs):
    """
    Load a delimited text file into a database table and return the table name.

    Parameters
    ----------
    dataPath is the path of the text file to load
    dtype is the numpy dtype describing the columns; when None it is
    guessed from the first numGuess lines of the file
    delimiter is the column delimiter of the file
    tableId is the name of the target table, or None to have a new table
    created under a generated name
    idCol is the column used as primary key when a new table is created
    engine is the sqlalchemy engine connected to the database
    metaData is the sqlalchemy MetaData object associated with engine
    numGuess is the number of lines used when guessing dtype
    append, when True, allows loading into an already existing table
    kwargs are forwarded to loadTable

    Returns
    -------
    The name of the table the data was loaded into

    Raises
    ------
    ValueError if append is requested without a table name, or if the
    table already exists and append is False
    """
    if dtype is None:
        dtype = guessDtype(dataPath, numGuess, delimiter)

    tableExists = False
    if tableId is not None:
        # Release the probing connection as soon as the check is done
        # instead of leaving it open.
        with engine.connect() as connection:
            tableExists = engine.dialect.has_table(connection, tableId)

    if append and tableId is None:
        raise ValueError("Cannot append if the table name is missing")
    if tableExists and not append:
        raise ValueError("Append is False but table exists")

    if tableExists:
        # Appending: reflect the existing table definition from the database.
        dataTable = Table(tableId, metaData, autoload=True)
    else:
        dataTable = createSQLTable(dtype, tableId, idCol, metaData)

    loadTable(dataPath, dataTable, delimiter, dtype, engine, **kwargs)
    return dataTable.name