K折交叉验证代码示例

 

Stratified k-fold(分层K折)按各类别样本所占的比例划分数据,使每一折中各类别的比例与整个数据集基本保持一致。

import pandas as pd
import numpy as np
import jieba
from util import *
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import f1_score, accuracy_score, classification_report
import sklearn.metrics as metrics
from sklearn import svm
from collections import Counter
from sklearn.linear_model import LogisticRegression


# Load the data. Per the original author's note the tuple is
# (train features, test features, sentiment labels, test ids, subject labels).
# pre_process comes from the project's `util` module -- shapes/dtypes are not
# visible here; TODO confirm against util.
X, test, y, test_id, y1 = pre_process()

N = 10  # number of stratified folds

# Folds are stratified on the sentiment labels `y`.
# shuffle=True is required for random_state to take effect: without it the seed
# is meaningless, and recent scikit-learn releases raise ValueError when
# random_state is set while shuffle is False.
kf = StratifiedKFold(n_splits=N, shuffle=True, random_state=2018).split(X, y)

# Alternative model tried by the original author: LogisticRegression(C=0.5)
clf = svm.LinearSVC(loss='hinge', tol=1e-4, C=0.6)

# Out-of-fold predictions on the training rows (same shape as y, float64):
# one array for sentiment, one for subject.
y_train_oofp = np.zeros_like(y, dtype='float64')
y_train_oofp1 = np.zeros_like(y, dtype='float64')

# Test-set predictions, one column per fold; combined later by majority vote.
y_test_oofp = np.zeros((test.shape[0], N))
y_test_oofp_1 = np.zeros((test.shape[0], N))

acc = 0  # running sum of per-fold sentiment scores
vcc = 0  # running sum of per-fold subject scores

# For every fold: fit the classifier on the training split, score it on the
# validation split, store the out-of-fold predictions, and record the fold's
# predictions for the test set. This is done twice per fold, first for the
# sentiment labels (y) and then for the subject labels (y1), reusing `clf`.
for i, (train_fold, test_fold) in enumerate(kf):
    X_train, X_validate = X[train_fold, :], X[test_fold, :]
    label_train, label_validate = y[train_fold], y[test_fold]
    label_1_train, label_1_validate = y1[train_fold], y1[test_fold]

    # --- sentiment ---
    clf.fit(X_train, label_train)
    val_ = clf.predict(X_validate)
    y_train_oofp[test_fold] = val_
    # Score once and reuse the value for both the log line and the running sum.
    sentiment_f1 = micro_avg_f1(label_validate, val_)
    print('sentiment_value_f1:%f' % sentiment_f1)
    acc += sentiment_f1
    y_test_oofp[:, i] = clf.predict(test)

    # --- subject ---
    clf.fit(X_train, label_1_train)
    val_1 = clf.predict(X_validate)
    # Store the subject predictions (val_1); the original mistakenly stored the
    # sentiment predictions (val_) in the subject out-of-fold array.
    y_train_oofp1[test_fold] = val_1
    vcc += micro_avg_f1(label_1_validate, val_1)
    y_test_oofp_1[:, i] = clf.predict(test)

# Mean validation score over the N folds: sentiment first, then subject.
print(acc / N)
print(vcc / N)

# Majority-vote the per-fold subject predictions for each test row and map the
# winning encoded label back to its original value with the saved encoder.
# `pk` is not imported in this file directly -- presumably the pickle module
# re-exported by `from util import *`; TODO confirm.
# NOTE(review): unpickling executes arbitrary code; only load encoder files
# produced by this project.
with open('label_encoder.sav', 'rb') as encoder_file:  # close the handle deterministically
    lbl = pk.load(encoder_file)

res_2 = []
for i in range(y_test_oofp_1.shape[0]):
    # Cast each fold's prediction to int so it can be fed to the encoder.
    votes = Counter(int(y_test_oofp_1[i][j]) for j in range(N))
    winner = votes.most_common(1)[0][0]
    # Each entry is kept as a 1-element array, matching the original output.
    decoded = np.array(lbl.inverse_transform([winner])).reshape(1)
    res_2.append(decoded)


# Majority vote over the per-fold sentiment predictions: for every test row,
# keep the value that occurs most often across that row's fold columns.
res = [
    max(set(ballots), key=ballots.count)
    for ballots in map(list, y_test_oofp)
]

# Assemble the submission table from the voted predictions and write it to CSV.
print(len(res))  # number of voted sentiment predictions
result = pd.DataFrame()
result['content_id'] = list(test_id)

print(res_2)  # debug output of the decoded subject predictions
result['subject'] = list(res_2)
# NOTE(review): the next line assigns the column to itself and has no effect.
result['subject'] = result['subject']

result['sentiment_value'] = list(res)
# The votes come from a float array; cast them to integers for the output.
result['sentiment_value'] = result['sentiment_value'].astype(int)


# This column is written out as empty strings for every row.
result['sentiment_word'] = ''
result.to_csv('submit.csv', index=False)

KFold 在划分K折交叉验证时,是按测试集(TEST)的顺序依次划分的。举例来说,如果划分5折交叉验证,那么依次被选作测试集的是第[0]、[1]、[2]、[3]、[4]份数据,其余各份作为训练集。

import numpy as np
from sklearn.model_selection import KFold
# Demo: split 1000 random numbers into 5 folds and report the size of each
# training/test split.
# Alternative input left by the original author: a random 50x15 matrix,
# i.e. np.random.rand(50, 15).
Sam = np.random.randn(1000)  # 1000 random samples
New_sam = KFold(n_splits=5)
# split() yields (train indices, test indices) for each fold, in that order.
for fold_indices in New_sam.split(Sam):
    train_index, test_index = fold_indices
    Sam_train = Sam[train_index]
    Sam_test = Sam[test_index]
    # Print the number of samples in each part of this fold.
    print('训练集数量:', Sam_train.shape, '测试集数量:', Sam_test.shape)

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值