news_stock exercise:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from datetime import date
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
'''
1. Load the data
Label 1: the index rose
Label 0: the index fell
Merge the 'Top.*' columns within each row
Split into train/test sets and preprocess
'''
data = pd.read_csv('G:/KNNtest/NLP/Combined_News_DJIA.csv')
# Merge the Top.* columns of each row into a single text blob
data["combined_news"] = data.filter(regex=('Top.*')).apply(lambda x: ' '.join(str(v) for v in x.values), axis=1)
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
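# Sanity-check the split (a quick sketch; the expected counts assume the
# standard Kaggle Combined_News_DJIA.csv, which spans 2008-08-08 to 2016-07-01):
# print(len(train), len(test))                    # roughly 1611 train rows vs 378 test rows
# print(train['Date'].max(), test['Date'].min())  # the two date ranges must not overlap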
# 1. Preprocessing: lowercase the text, strip the '"' and "'" characters, then split on whitespace
X_train = train["combined_news"].str.lower().str.replace('"', '', regex=False).str.replace("'", '', regex=False).str.split()
X_test = test["combined_news"].str.lower().str.replace('"', '', regex=False).str.replace("'", '', regex=False).str.split()
# 2. Remove stop words
stop = stopwords.words('english')
# 3. Drop tokens that contain digits
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))
# 4. Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
# Combine all the filters into one function
def check(word):
    '''
    Returns True if the word should be kept, False otherwise
    '''
    if word in stop:
        return False  # drop stop words
    elif hasNumbers(word):
        return False  # drop tokens containing digits
    else:
        return True
# After this step X_train holds lists of tokens
X_train = X_train.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
X_test = X_test.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
# Libraries such as sklearn expect string input, so join the token lists back into strings
X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))
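# To see what the cleaning pipeline actually does, compare one row before and
# after (a sketch; .iloc[0] is just an arbitrary sample):
# print(train["combined_news"].iloc[0][:80])  # raw merged headlines
# print(X_train.iloc[0][:80])                 # lowercased, filtered, lemmatized text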
'''
2. Feature extraction
'''
feature_extraction = TfidfVectorizer()
# Fit and transform on the training set (note: use the preprocessed X_train, not the raw text)
X_train = feature_extraction.fit_transform(X_train.values)
# Only transform the test set
X_test = feature_extraction.transform(X_test.values)
y_train = train["Label"].values
y_test = test["Label"].values
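# The vectorizer returns sparse matrices; both sets share one vocabulary
# (the column count), which is why only the training set is fit:
# print(X_train.shape, X_test.shape)  # (n_train_docs, vocab_size), (n_test_docs, vocab_size)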
'''
3. Train the model
'''
clf = SVC(probability=True,kernel='rbf')
clf.fit(X_train,y_train)
'''
4. Predict
'''
predictions = clf.predict_proba(X_test)
'''
5. Evaluate with ROC-AUC
'''
print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions[:, 1])))
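# Optionally compare against plain accuracy on hard labels (a sketch, not part
# of the original exercise):
# from sklearn.metrics import accuracy_score
# print('accuracy:', accuracy_score(y_test, clf.predict(X_test)))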
news_stock_advanced:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from datetime import date
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models.word2vec import Word2Vec
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
'''
This script uses word2vec
'''
'''
1. Load the data. Label 1 means the index closed flat or up, 0 means it closed down
'''
data = pd.read_csv('G:/KNNtest/NLP/Combined_News_DJIA.csv')
# 1. Split the data into train and test sets
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
# 2. Treat each news item as its own sentence and collect them all together
X_train = train[train.columns[2:]]  # train.columns[2:] are the headline columns, starting from the third column
# 3. corpus is all the text that is 'visible' to us. Treating each news item as one sentence and calling flatten() gives a list of sentences.
# X_train and X_test, however, must not be flattened: they have to stay aligned with y_train and y_test.
corpus = X_train.values.flatten().astype(str)  # flatten() collapses a nested array into a one-dimensional array
X_train = X_train.values.astype(str)
X_train = np.array([' '.join(x) for x in X_train])
X_test = test[test.columns[2:]]
X_test = X_test.values.astype(str)
X_test = np.array([' '.join(x) for x in X_test])
y_train = train['Label'].values
y_test = test['Label'].values
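# Each joined document must stay aligned with its label (one row per trading day):
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)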
'''
Preprocessing: tokenize each sentence into words; remove stop words, digits, special symbols, and the markers left over from storing bytes as str; lowercase everything and lemmatize
'''
corpus = [word_tokenize(x) for x in corpus]
X_train = [word_tokenize(x) for x in X_train]
X_test = [word_tokenize(x) for x in X_test]
stop = stopwords.words('english')  # stop words
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))  # True when the string contains a digit
def isSymbol(inputString):
    return bool(re.match(r'[^\w]', inputString))  # \w matches word characters [A-Za-z0-9_]; [^\w] negates it, so this is True when the token starts with a special symbol
wordnet_lemmatizer = WordNetLemmatizer()
def check(word):
    word = word.lower()  # lowercase
    if word in stop:  # drop stop words
        return False
    elif hasNumbers(word) or isSymbol(word):  # drop digits and special symbols
        return False
    else:
        return True
def preprocessing(sen):
    res = []
    for word in sen:
        if check(word):
            # strip the markers Python leaves behind when bytes are stored as str
            word = word.lower().replace("b'", '').replace('b"', '').replace('"', '').replace("'", '')
            res.append(wordnet_lemmatizer.lemmatize(word))  # lemmatization (default noun POS; 'went' -> 'go' would need pos='v')
    return res
corpus = [preprocessing(x) for x in corpus]
X_train = [preprocessing(x) for x in X_train]
X_test = [preprocessing(x) for x in X_test]
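# corpus is now a list of token lists, one per headline, which is exactly the
# 'sentences' format Word2Vec expects (a sketch; index 0 is an arbitrary sample):
# print(len(corpus), corpus[0][:10])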
'''
Train the NLP model
1. Word2Vec
'''
model = Word2Vec(corpus, vector_size=128, window=5, min_count=5, workers=4)  # gensim >= 4.0 renamed 'size' to 'vector_size'
# print(model.wv['ok'])
# Build the vocab; the corpus is small, so every document can be represented by averaging its word vectors
vocab = model.wv.key_to_index  # gensim >= 4.0; older releases used model.wv.vocab
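# The trained embeddings can be probed directly; most_similar is a standard
# gensim call (the query word here is an assumption and must have survived min_count=5):
# print(model.wv.most_similar('oil', topn=5))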
def get_vector(word_list):
    res = np.zeros([128])  # start from an all-zero array
    count = 0
    for word in word_list:
        if word in vocab:
            res += model.wv[word]
            count += 1
    return res / count if count else res  # guard against sentences with no in-vocab words
# print(get_vector(['hello', 'from', 'the']))
# Represent the preprocessed data with the NLP model
wordlist_train = X_train
wordlist_test = X_test
print(np.shape(wordlist_train[1]))
X_train = [get_vector(x) for x in X_train]
print(np.shape(X_train[1]))
X_test = [get_vector(x) for x in X_test]
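# Each document is now a single averaged 128-dimensional vector (a sketch to
# confirm the final representation):
# print(np.shape(X_train))  # (n_train_docs, 128)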
'''
Build the ML model. All 128 dimensions of each vector are continuous values, so a model that handles continuous inputs is a natural fit: SVM (SVR here)
'''
params = [0.1, 0.5, 1, 3, 5, 7, 10, 12, 16, 20, 25, 30, 35, 40]
test_scores = []
for param in params:
    clf = SVR(gamma=param)
    test_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
    test_scores.append(np.mean(test_score))
plt.plot(params, test_scores)
plt.title("Param vs CV AUC Score")  # higher AUC is better
plt.show()
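# A minimal follow-up sketch (not in the original script): refit with the gamma
# that scored best in cross-validation and evaluate on the held-out test set.
best_gamma = params[int(np.argmax(test_scores))]
clf = SVR(gamma=best_gamma)
clf.fit(X_train, y_train)
print('test ROC-AUC:', roc_auc_score(y_test, clf.predict(X_test)))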