官网数据集介绍:
-
数据集地址
-
link
在邀请某人约会或跳伞之前,了解你成功的可能性很重要。向潜在客户提供住房保险报价也是如此。Homesite是一家领先的房主保险提供商,但目前还没有一个动态转化率模型,能让他们确信某个报价会促成购买。
Homesite使用一个匿名的客户和销售活动信息数据库,包括财产和保险信息,让您预测哪些客户将购买给定的报价。准确预测转换将有助于Homesite更好地理解拟议定价变化的影响,并保持理想的客户细分组合。
文章使用模型
因为下述模型训练速度较快且准确率较高,所以选择如下模型:
model | accuracy | time(秒) |
---|---|---|
KNN | 0.7969637200406431 | 9.586912631988525 |
DTCclassifier | 0.8685075608152532 | 15.190108060836792 |
RandomForest | 0.8986910525372064 | 74.82114577293396 |
在正常串行编程中使用时间约为150s
串行代码
# Load the data: train/test features and labels, one table per file.
x_train, y_train, x_test, y_test = (
    pd.read_table(path)
    for path in (
        r'./data/train_x.txt',
        r'./data/train_y.txt',
        r'./data/test_x.txt',
        r'./data/test_y.txt',
    )
)
KNN
# model_start is taken just before the first model runs; it is not read
# again in this serial snippet (presumably the start of the whole run).
model_start = time.time()

# The KNN timer covers fitting, prediction and scoring.
knn_start = time.time()
KNNClassifier = KNeighborsClassifier(n_neighbors=6).fit(
    x_train, np.ravel(y_train)
)
y_pred_KNN = KNNClassifier.predict(x_test)
KNNAcc = accuracy_score(y_pred_KNN, y_test)
knn_end = time.time()
knn_elapsed = knn_end - knn_start

print('KNN准确率为:{}'.format(KNNAcc))
print('KNN共耗时{}秒'.format(knn_elapsed))
决策树
# Decision tree: fit, predict, score, and report the elapsed time.
# The timer must be started here; the report below reads DTC_start and
# DTC_end, which would otherwise be undefined at this point.
DTC_start = time.time()
DTCclassifier = DecisionTreeClassifier()
DTCclassifier.fit(x_train, np.ravel(y_train))
y_pred_DTC = DTCclassifier.predict(x_test)
DTCAcc = accuracy_score(y_pred_DTC, y_test)
DTC_end = time.time()
print('决策树准确率为:{}'.format(DTCAcc))
print('决策树共耗时{}秒'.format(DTC_end - DTC_start))
随机森林
# Random forest: fit, predict, score, and report the elapsed time.
# This section must produce y_pred_RF, which the voting step consumes.
RF_start = time.time()
RFclassifier = RandomForestClassifier()
RFclassifier.fit(x_train, np.ravel(y_train))
y_pred_RF = RFclassifier.predict(x_test)
RFAcc = accuracy_score(y_pred_RF, y_test)
RF_end = time.time()
print('随机森林准确率为:{}'.format(RFAcc))
print('随机森林共耗时{}秒'.format(RF_end - RF_start))
投票器
# Majority vote: for every test sample take the label predicted by most of
# the three classifiers, then score the voted labels against y_test.
total = list(zip(y_pred_KNN, y_pred_DTC, y_pred_RF))
pre = [Counter(votes).most_common(1)[0][0] for votes in total]
vote = accuracy_score(pre, np.array(y_test).flatten())
# Print the value that was actually computed above.
print('投票:{}'.format(vote))
模型并行
模型并行流程图
代码如下:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import time
from collections import Counter
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import multiprocessing
# x_train = np.loadtxt('./data/train_x.txt', dtype=np.float32, delimiter='\t', skiprows=1)
# y_train = np.loadtxt('./data/train_y.txt', dtype=np.float32, delimiter='\t', skiprows=1)
# x_test = np.loadtxt('./data/test_x.txt', dtype=np.float32, delimiter='\t', skiprows=1)
# y_test = np.loadtxt('./data/test_y.txt', dtype=np.float32, delimiter='\t', skiprows=1)
# Module-level load: the model functions below read y_test as a global,
# so these tables must exist before any of them runs.
x_train, y_train, x_test, y_test = map(
    pd.read_table,
    (
        './data/train_x.txt',
        './data/train_y.txt',
        './data/test_x.txt',
        './data/test_y.txt',
    ),
)
def KNN_Model(x_train, y_train, x_test):
    """Fit a 5-nearest-neighbours classifier and persist its test predictions.

    The predictions are written to ``KNN.txt`` (one label per line) and the
    accuracy against the module-level ``y_test`` is printed.

    Args:
        x_train: Training features.
        y_train: Training labels; flattened with ``np.ravel`` before fitting.
        x_test: Features to predict on.

    Returns:
        The array of predicted labels for ``x_test``.
    """
    KNNClassifier = KNeighborsClassifier(n_neighbors=5)
    KNNClassifier.fit(x_train, np.ravel(y_train))
    KNN = KNNClassifier.predict(x_test)
    # y_test is not a parameter: it is read from module scope.
    KNNAcc = accuracy_score(KNN, y_test)
    # Close the file deterministically so the output is flushed before
    # this function returns; utf8 matches the encoding load() reads with.
    with open('KNN.txt', mode='w', encoding='utf8') as data:
        data.writelines(str(i) + '\n' for i in KNN.tolist())
    print('最近邻准确率为:{}'.format(KNNAcc))
    return KNN
def DecisionTree_Model(x_train, y_train, x_test):
    """Fit a decision tree classifier and persist its test predictions.

    The predictions are written to ``DecisionTree.txt`` (one label per line)
    and the accuracy against the module-level ``y_test`` is printed.

    Args:
        x_train: Training features.
        y_train: Training labels; flattened with ``np.ravel`` before fitting.
        x_test: Features to predict on.

    Returns:
        The array of predicted labels for ``x_test``.
    """
    DTCclassifier = DecisionTreeClassifier()
    DTCclassifier.fit(x_train, np.ravel(y_train))
    DTC = DTCclassifier.predict(x_test)
    # y_test is not a parameter: it is read from module scope.
    DTCAcc = accuracy_score(DTC, y_test)
    # Close the file deterministically so the output is flushed before
    # this function returns; utf8 matches the encoding load() reads with.
    with open('DecisionTree.txt', mode='w', encoding='utf8') as data:
        data.writelines(str(i) + '\n' for i in DTC.tolist())
    print('决策树准确率为:{}'.format(DTCAcc))
    return DTC
def RandomForest_Model(x_train, y_train, x_test):
    """Fit a random forest classifier and persist its test predictions.

    The predictions are written to ``RandomForest.txt`` (one label per line)
    and the accuracy against the module-level ``y_test`` is printed.

    Args:
        x_train: Training features.
        y_train: Training labels; flattened with ``np.ravel`` before fitting.
        x_test: Features to predict on.

    Returns:
        The array of predicted labels for ``x_test``.
    """
    RFclassifier = RandomForestClassifier()
    RFclassifier.fit(x_train, np.ravel(y_train))
    randomforest = RFclassifier.predict(x_test)
    # y_test is not a parameter: it is read from module scope.
    RFAcc = accuracy_score(randomforest, y_test)
    # Close the file deterministically so the output is flushed before
    # this function returns; utf8 matches the encoding load() reads with.
    with open('RandomForest.txt', mode='w', encoding='utf8') as data:
        data.writelines(str(i) + '\n' for i in randomforest.tolist())
    print('随机森林准确率为:{}'.format(RFAcc))
    return randomforest
def load(path):
    """Read a text file and return its lines without trailing newlines.

    Args:
        path: Path of a utf8-encoded text file.

    Returns:
        A list with one string per line of the file, in file order. Blank
        lines are kept as empty strings.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, encoding='utf8') as handle:
        return [line.replace("\n", "") for line in handle]
def model(x_train, y_train, x_test):
    """Majority-vote the three saved prediction files and report accuracy.

    Reads ``./KNN.txt``, ``./DecisionTree.txt`` and ``./RandomForest.txt``
    (one label per line, as written by the three ``*_Model`` functions),
    picks the most common label per sample, and scores the result against
    the module-level ``y_test``.

    Args:
        x_train: Unused; kept so the signature matches the model functions.
        y_train: Unused; kept so the signature matches the model functions.
        x_test: Unused; kept so the signature matches the model functions.

    Returns:
        The accuracy of the voted predictions.
    """
    KNN_txt = load("./KNN.txt")
    DecisionTree_txt = load("./DecisionTree.txt")
    RandomForest_txt = load("./RandomForest.txt")
    total = list(zip(KNN_txt, DecisionTree_txt, RandomForest_txt))
    pre = [Counter(votes).most_common(1)[0][0] for votes in total]
    # The files hold labels as text (str(label) per line), so serialise the
    # true labels the same way instead of comparing strings with numbers.
    truth = [str(label) for label in np.array(y_test).flatten().tolist()]
    vote = accuracy_score(pre, truth)
    print('投票准确率为{}'.format(vote))
    return vote


if __name__ == "__main__":
    model_start = time.time()
    # One process per classifier so the three fits run in parallel.
    tasks = [
        multiprocessing.Process(target=target, args=(x_train, y_train, x_test))
        for target in (KNN_Model, DecisionTree_Model, RandomForest_Model)
    ]
    for task in tasks:
        task.start()
    for task in tasks:
        task.join()
    model_end = time.time()
    print('total_time{}'.format(model_end - model_start))

    # A failed worker leaves its prediction file missing or stale, so do
    # not vote on it.
    if any(task.exitcode != 0 for task in tasks):
        raise RuntimeError('a model process exited with an error')
    # Combine the per-model prediction files written by the workers.
    model(x_train, y_train, x_test)
可提升方向
读写数据比较慢,模型加载也比较慢,这些都是可以改进的地方。