import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import cross_validation
data=pd.read_csv('000777.csv',encoding='gbk',parse_dates=[0],index_col=0)
data.sort_index(0,ascending=True,inplace=True)
//data.shape[0]-dayfeature意思是因为我们要用150天数据做训练,对于条目为200条的数据,只有50条数 据是有前150天的数据来训练的,所以训练集的大小就是200-150, 对于每一条数据,他的特征是前150 天的所有特征数据,即150*5,+1是将当天的开盘价引入作为一条特征数据
dayfeature=150
featurenum=5*dayfeature
x=np.zeros((data.shape[0]-dayfeature,featurenum+1))
y=np.zeros((data.shape[0]-dayfeature))
for i in range(0,data.shape[0]-dayfeature):
x[i,0:featurenum]=np.array(data[i:i+dayfeature] \
[[u'收盘价',u'最高价',u'最低价',u'开盘价',u'成交量']]).reshape((1,featurenum))
//最后一列记录当日的开盘价
x[i,featurenum]=data.ix[i+dayfeature][u'开盘价']
for i in range(0,data.shape[0]-dayfeature):
if data.ix[i+dayfeature][u'收盘价'] >= data.ix[i+dayfeature][u'开盘价']:
//如果当天收盘价高于开盘价,y[i]=1代表涨,0代表跌
y[i]=1else:
y[i]=0
clf=svm.SVC(kernel='sigmoid')
result = []
for i in range(5):
x_train, x_test, y_train, y_test = \
cross_validation.train_test_split(x, y, test_size = 0.2)
clf.fit(x_train, y_train)
result.append(np.mean(y_test == clf.predict(x_test)))
print("svm classifier accuacy:")
print(result)