# 机器学习中需要导入的库比较多,需要逐个确认每个库是否导入成功
# 导入类库
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.externals.joblib import dump
from sklearn.externals.joblib import load
# Load the iris data set. Column names are supplied explicitly through
# `names`, so read_csv does not take them from the file's first row.
filename = 'D://example//MachineLearning-master//iris.data.csv'
# The measured flower parts are sepals and petals ("separ" was a typo).
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(filename, names=names)

# Textual overview of the data.
print('数据维度: 行 %s,列 %s' % dataset.shape)  # number of rows and columns
print(dataset.head(10))  # first 10 rows
print(dataset.describe())  # descriptive statistics
print(dataset.groupby('class').size())  # number of samples per class

# Visual overview: each pyplot.show() displays the figure drawn just before it.
dataset.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)  # box plots
pyplot.show()
dataset.hist()  # histograms
pyplot.show()
scatter_matrix(dataset)  # scatter-plot matrix
pyplot.show()
# Separate features from labels, then hold out part of the data for validation.
array = dataset.values
X = array[:, :4]  # features: every row, columns 0-3 (the first four columns)
Y = array[:, 4]  # labels: every row, column 4 (the fifth column)
validation_size = 0.2  # fraction of the samples reserved for validation
seed = 7  # random seed handed to the split
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)
# Candidate algorithms to compare, keyed by a short label.
models = {
    'LR': LogisticRegression(),  # logistic regression
    'LDA': LinearDiscriminantAnalysis(),  # linear discriminant analysis
    'KNN': KNeighborsClassifier(),  # k-nearest neighbours
    'CART': DecisionTreeClassifier(),  # classification and regression tree
    'NB': GaussianNB(),  # Gaussian naive Bayes
    'SVM': SVC(),  # support vector machine
}
# Evaluate every candidate with 10-fold cross-validation on the training set.
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn versions raise ValueError when random_state is set without it.
# The splitter does not depend on the model, so it is built once.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = []
for key, model in models.items():
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    # Output format: label: mean accuracy (standard deviation over the folds)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))
# Train an SVM on the training set and evaluate it on the validation set.
svm = SVC().fit(X_train, Y_train)  # fit() returns the fitted estimator itself
# Predict labels for the held-out samples so they can be compared with
# the known answers in Y_validation.
predictions = svm.predict(X_validation)
print(accuracy_score(y_true=Y_validation, y_pred=predictions))
print(confusion_matrix(y_true=Y_validation, y_pred=predictions))
print(classification_report(y_true=Y_validation, y_pred=predictions))
# Persist the trained model to disk, then reload it and verify the restored
# copy by scoring it on the validation set.
model_file = 'finalized_model_joblib.sav'
with open(model_file, 'wb') as model_f:  # save the model
    dump(svm, model_f)
with open(model_file, 'rb') as model_f:  # load the model back
    loaded_model = load(model_f)
# Report the reloaded model's own accuracy on the held-out data, instead of
# re-printing the accuracy of the predictions made before saving.
result = loaded_model.score(X=X_validation, y=Y_validation)
print(result)
# 数据集的下载链接如下:
# https://pan.baidu.com/s/1a7Q2QaB2pIEz6p9uGsUGcA