Stratified k-fold(分层 k 折)按照各类别所占的百分比划分数据,使每一折中的类别比例与整体数据集保持一致。
import pandas as pd
import numpy as np
import jieba
from util import *
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import f1_score, accuracy_score, classification_report
import sklearn.metrics as metrics
from sklearn import svm
from collections import Counter
from sklearn.linear_model import LogisticRegression
def _majority_vote(votes):
    """Return the most frequent value in *votes* (first-seen value wins ties)."""
    return Counter(votes).most_common(1)[0][0]


# pre_process() comes from util; per the original author's note it returns
# training data, test data, sentiment labels, test ids and subject labels.
X, test, y, test_id, y1 = pre_process()
N = 10  # number of stratified folds

# random_state only takes effect together with shuffle=True; recent
# scikit-learn releases raise ValueError when it is set while shuffle=False.
kf = StratifiedKFold(n_splits=N, shuffle=True, random_state=2018).split(X, y)

# Alternative classifier tried by the author: LogisticRegression(C=0.5)
clf = svm.LinearSVC(loss='hinge', tol=1e-4, C=0.6)

# Out-of-fold predictions on the training set (sentiment / subject).
y_train_oofp = np.zeros_like(y, dtype='float64')
y_train_oofp1 = np.zeros_like(y, dtype='float64')
# Per-fold predictions on the test set, one column per fold.
y_test_oofp = np.zeros((test.shape[0], N))
y_test_oofp_1 = np.zeros((test.shape[0], N))

acc = 0  # accumulated sentiment F1 over folds
vcc = 0  # accumulated subject F1 over folds
for i, (train_fold, test_fold) in enumerate(kf):
    X_train, X_validate = X[train_fold, :], X[test_fold, :]
    label_train, label_validate = y[train_fold], y[test_fold]
    label_1_train, label_1_validate = y1[train_fold], y1[test_fold]

    # --- sentiment model ---
    clf.fit(X_train, label_train)
    val_ = clf.predict(X_validate)
    y_train_oofp[test_fold] = val_
    fold_f1 = micro_avg_f1(label_validate, val_)  # score once, reuse below
    print('sentiment_value_f1:%f' % fold_f1)
    acc += fold_f1
    y_test_oofp[:, i] = clf.predict(test)

    # --- subject model (the same estimator object is refitted) ---
    clf.fit(X_train, label_1_train)
    val_1 = clf.predict(X_validate)
    # Store the subject predictions; the original wrote the sentiment
    # predictions (val_) here by mistake.
    y_train_oofp1[test_fold] = val_1
    vcc += micro_avg_f1(label_1_validate, val_1)
    y_test_oofp_1[:, i] = clf.predict(test)

print(acc / N)
print(vcc / N)

# NOTE(review): pickle executes arbitrary code on load; only use a
# label_encoder.sav produced by this project. `pk` is expected to be provided
# by `from util import *` (presumably the pickle module) - confirm.
with open('label_encoder.sav', 'rb') as encoder_file:
    lbl = pk.load(encoder_file)

# Majority vote across folds for the subject, then map the encoded ids back
# to their original labels in one call so each entry is a plain label rather
# than a 1-element array (which would be written to the CSV as "['label']").
# Assumes lbl is a fitted sklearn LabelEncoder - TODO confirm.
subject_votes = [_majority_vote(row.astype(int)) for row in y_test_oofp_1]
res_2 = list(lbl.inverse_transform(subject_votes))

# Majority vote across folds for the sentiment value.
res = [_majority_vote(row) for row in y_test_oofp]
print(len(res))

result = pd.DataFrame()
result['content_id'] = list(test_id)
print(res_2)
result['subject'] = res_2
result['sentiment_value'] = res
result['sentiment_value'] = result['sentiment_value'].astype(int)
result['sentiment_word'] = ''
result.to_csv('submit.csv', index=False)
KFold 在划分 k 折交叉验证时(默认不打乱数据),是按照测试集的顺序依次划分的。举例来说,如果划分 5 折交叉验证,那么测试集依次选取第 [0],[1],[2],[3],[4] 折。
import numpy as np
from sklearn.model_selection import KFold
#Sample=np.random.rand(50,15) # 建立一个50行12列的随机数组
# Demo: split 1000 random samples with plain 5-fold cross-validation and
# report the size of the train/test part of every split.
Sam = np.random.randn(1000)  # 1000 random numbers
New_sam = KFold(n_splits=5)

# split() yields (train indices, test indices) - the first element is the
# training set, the second one the test set.
for train_index, test_index in New_sam.split(Sam):
    Sam_train = Sam[train_index]
    Sam_test = Sam[test_index]
    # Shows how many samples each split assigns to either side.
    print('训练集数量:', Sam_train.shape, '测试集数量:', Sam_test.shape)