forked from sjev/trading-with-python
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcsvDatabase.py
More file actions
187 lines (128 loc) · 5.19 KB
/
csvDatabase.py
File metadata and controls
187 lines (128 loc) · 5.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# -*- coding: utf-8 -*-
"""
intraday data handlers in csv format.
@author: jev
"""
import pandas as pd
import datetime as dt
import os
import numpy as np
from .extra import ProgressBar
# strptime/strftime patterns for the timestamp part of data file names
dateFormat = "%Y%m%d" # date format for converting filenames to dates
dateTimeFormat = "%Y%m%d%H%M%S" # date + time of day; tried first by fileName2date, used by saveData for new file names
def fileName2date(fName):
    '''Convert a data file name (or path) to the timestamp encoded in it.

    The name is expected to look like ``<symbol>_<stamp>.csv``. The stamp is
    the text after the LAST underscore of the base name, so symbols that
    contain underscores and names passed with a directory prefix are parsed
    correctly. The stamp is tried as dateTimeFormat first, then dateFormat.

    Returns a datetime.datetime.
    Raises IndexError when the name contains no underscore and ValueError
    when the stamp matches neither format.
    '''
    name = os.path.splitext(os.path.basename(fName))[0]
    stamp = name.rsplit('_', 1)[1]  # text after the last underscore
    try:
        return dt.datetime.strptime(stamp, dateTimeFormat)
    except ValueError:
        # not a full date-time stamp, fall back to the date-only format
        return dt.datetime.strptime(stamp, dateFormat)
def parseDateTime(dateTimeStr):
    '''Parse a string laid out as dateTimeFormat into a datetime.datetime.'''
    parse = dt.datetime.strptime
    return parse(dateTimeStr, dateTimeFormat)
def loadCsv(fName):
    ''' Load a DataFrame from a csv file.

    The first column becomes the index and is parsed as dates, which is
    what ``pd.DataFrame.from_csv`` did by default.
    '''
    # DataFrame.from_csv has been removed from pandas; read_csv with these
    # arguments is its documented replacement.
    return pd.read_csv(fName, index_col=0, parse_dates=True)
class HistDataCsv(object):
    '''Handler for the historic data of one symbol, stored as .csv files.

    The files of a symbol live in ``<dbDir>/<symbol>`` and are named
    ``<symbol>_<timestamp>.csv`` (see saveData).
    '''

    def __init__(self, symbol, dbDir, autoCreateDir=False):
        '''
        symbol        : symbol name, also the name of the data sub-directory
        dbDir         : database root directory
        autoCreateDir : create the symbol directory when it does not exist
        '''
        self.symbol = symbol
        self.dbDir = os.path.normpath(os.path.join(dbDir, symbol))

        if not os.path.exists(self.dbDir) and autoCreateDir:
            print('Creating data directory ', self.dbDir)
            # makedirs (unlike mkdir) also creates a missing database root
            os.makedirs(self.dbDir)

        self.dates = []  # not filled in anywhere in this class

    @property
    def files(self):
        """ sorted list of the .csv files present in the symbol directory """
        # skip anything that is not a csv file so stray files cannot break loading
        return sorted(f for f in os.listdir(self.dbDir)
                      if f.lower().endswith('.csv'))

    def loadAll(self):
        """ load all files from the database and return as one DataFrame.

        Raises ValueError (from pd.concat) when the database has no files.
        """
        data = pd.concat([self._loadCsv(f) for f in self.files])
        # files may overlap in time: keep a single row per index value
        return data.groupby(data.index).first()

    def to_hdf(self, fName):
        """
        write all data of this symbol to the hdf5 file fName, using the
        symbol name as key """
        df = self.loadAll()
        df.to_hdf(fName, key=self.symbol)

    @property
    def dateRange(self):
        """ get min and max values of the timestamps in database.

        Returns (None, None) when there are no files. Only the files with
        the earliest and the latest date in their names are read.
        """
        files = self.files
        if not files:
            return (None, None)

        earliest = min(files, key=fileName2date)
        latest = max(files, key=fileName2date)

        t0 = self._loadCsv(earliest).index[0]
        t1 = self._loadCsv(latest).index[-1]
        return (t0, t1)

    def _loadCsv(self, fName):
        """ convenience function, prepending right path """
        path = os.path.join(self.dbDir, fName)
        # replacement for the removed pd.DataFrame.from_csv: first column
        # is the index, parsed as dates
        return pd.read_csv(path, index_col=0, parse_dates=True)

    def saveData(self, df, lowerCaseColumns=True):
        ''' add data to database

        df               : DataFrame to store. Its last index value is
                           formatted with dateTimeFormat to build the file name.
        lowerCaseColumns : lower-case the column names before saving. Note
                           that this renames the columns of df itself.
        '''
        if lowerCaseColumns: # this should provide consistency to column names. All lowercase
            df.columns = [c.lower() for c in df.columns]

        s = self.symbol + '_' + df.index[-1].strftime(dateTimeFormat) + '.csv' # file name
        dest = os.path.join(self.dbDir, s) # full path destination
        print('Saving data to: ', dest)
        df.to_csv(dest)

    def __repr__(self):
        # dateRange reads the first and last data file when files exist
        rng = self.dateRange
        return '%s dataset %i files\nrange: %s ... %s' % (self.symbol, len(self.files), rng[0], rng[1])
class HistDatabase(object):
    ''' class working with multiple symbols at once '''

    def __init__(self, dataDir):
        ''' create one HistDataCsv handler per sub-directory of dataDir '''
        # get symbols from directory names
        symbols = [name for name in os.listdir(dataDir)
                   if os.path.isdir(os.path.join(dataDir, name))]

        # build dataset: dict of HistDataCsv handlers keyed by symbol
        self.csv = {symbol: HistDataCsv(symbol, dataDir) for symbol in symbols}

    def loadDates(self, dates=None):
        '''
        get data for all symbols combined in one object
        provide a dates list. If no dates list is provided, common dates are used.

        Returns a pd.WidePanel on pandas versions that still provide it,
        otherwise a DataFrame whose columns are a (symbol, column)
        MultiIndex. In both cases result[symbol] is the data of that symbol.
        '''
        if dates is None:
            dates = self.commonDates

        # NOTE(review): relies on every handler having a loadDates(dates)
        # method - confirm that HistDataCsv provides it
        tmp = {k: v.loadDates(dates) for k, v in self.csv.items()}

        panelCls = getattr(pd, 'WidePanel', None)
        if panelCls is not None:
            return panelCls(tmp)  # legacy pandas: same object as before
        if not tmp:
            return pd.DataFrame()  # pd.concat rejects an empty mapping
        return pd.concat(tmp, axis=1)

    def toHDF(self, dataFile, dates=None):
        ''' write the combined data of all symbols to a hdfstore file, under key 'data' '''
        # loadDates falls back to commonDates itself when dates is None
        wp = self.loadDates(dates)
        store = pd.HDFStore(dataFile)
        try:
            store['data'] = wp
        finally:
            store.close()  # release the file even when the write fails

    @property
    def commonDates(self):
        ''' return sorted dates common for all symbols, [] when there are no symbols '''
        t = [v.dates for v in self.csv.values()]  # get all dates in a list
        if not t:
            return []
        return sorted(set(t[0]).intersection(*t[1:]))

    def __repr__(self):
        lines = ['-----Hist CSV Database-----']
        lines.extend(str(v) for v in self.csv.values())
        return '\n'.join(lines) + '\n'
#--------------------
if __name__ == '__main__':
    # small demo: closing prices of two symbols for one day, side by side
    dbDir = os.path.normpath('D:/data/30sec')

    vxx = HistDataCsv('VXX', dbDir)
    spy = HistDataCsv('SPY', dbDir)

    date = dt.date(2012, 8, 31)
    print(date)

    # NOTE(review): no loadDate method is visible on HistDataCsv in this
    # file - confirm it exists before running the demo
    spyClose = spy.loadDate(date)['close']
    vxxClose = vxx.loadDate(date)['close']
    pair = pd.DataFrame({'SPY': spyClose, 'VXX': vxxClose})
    print(pair.tail())