SVM_Multi_class_classification

最新推荐文章于 2019-11-21 19:51:09 发布

Matrix-yang

最新推荐文章于 2019-11-21 19:51:09 发布

阅读量1.8k

点赞数 1

CC 4.0 BY-SA版权

分类专栏：机器学习

本文链接：https://blog.youkuaiyun.com/qq_21768483/article/details/86504770

机器学习专栏收录该内容

42 篇文章

订阅专栏

本文通过生成并分类三组正态分布数据，演示了如何使用Python的Scikit-Learn库中的SVM分类器进行线性分类。展示了从数据生成、可视化、训练到评估的全过程。

import numpy as np 
# Build three 2-D Gaussian clusters of 100 points each. np.random.randn draws
# from the standard normal distribution (mean 0, standard deviation 1), so each
# cluster starts centred on the origin and is then shifted with a single
# vectorized (broadcast) addition instead of a Python-level loop over rows.

# Class 0: centre moved to (5, 0).
p = np.random.randn(100, 2) + np.array([5.0, 0.0])

# Class 1: left centred on (0, 0).
f = np.random.randn(100, 2)

# Class 2: centre moved to (3.5, 3.5); the scalar is broadcast to both columns.
t = np.random.randn(100, 2) + 3.5

import pandas as pd 

# Wrap each cluster in a DataFrame with coordinate columns 'x' and 'y', and
# attach the class label as column 'z': p -> 0, f -> 1, t -> 2.
frames = []
for label, points in enumerate((p, f, t)):
    frame = pd.DataFrame(points, columns=['x', 'y'])
    frame['z'] = label
    frames.append(frame)
df_p, df_f, df_t = frames

# Stack the three classes row-wise into one DataFrame. Each frame keeps its own
# 0..99 index here, so index values repeat in the combined result.
res = pd.concat(frames, axis=0)
res

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

	x	y	z
0	6.078669	-0.517253	0
1	5.302986	1.129988	0
2	5.009613	1.225857	0
3	3.758161	-0.315786	0
4	5.360210	-0.375185	0
5	5.710155	-0.496786	0
6	4.579099	-0.531689	0
7	2.649697	1.835499	0
8	5.276772	-1.717779	0
9	4.168785	-0.901316	0
10	3.744492	-0.281852	0
11	4.795925	-1.438646	0
12	4.395546	1.470029	0
13	6.504895	0.107717	0
14	4.669277	-1.802486	0
15	5.177591	-0.715611	0
16	6.100084	-0.803093	0
17	4.173074	0.849082	0
18	4.646888	0.050525	0
19	5.725629	-0.345751	0
20	3.898579	0.587148	0
21	6.102218	-2.250628	0
22	4.779940	1.648752	0
23	2.945206	-0.156092	0
24	3.804919	-1.226393	0
25	3.148943	-0.853685	0
26	4.480589	2.014021	0
27	5.560275	2.137762	0
28	6.887921	1.943966	0
29	6.227569	0.028383	0
...	...	...	...
70	4.143435	2.546719	2
71	2.604495	3.292901	2
72	4.121899	2.666432	2
73	4.044238	3.775474	2
74	2.603628	4.173138	2
75	3.475392	3.377459	2
76	2.986226	4.487069	2
77	3.582220	4.475310	2
78	2.436692	4.918058	2
79	4.917040	3.606541	2
80	3.148297	3.048453	2
81	4.473144	4.619293	2
82	5.154484	4.372903	2
83	3.707397	3.668351	2
84	4.442523	2.497338	2
85	2.101259	3.225132	2
86	3.787636	4.148101	2
87	3.319200	3.041185	2
88	3.416234	2.522239	2
89	3.406666	3.070693	2
90	4.142204	2.908948	2
91	4.955018	2.451665	2
92	4.249549	2.185492	2
93	5.728465	3.343337	2
94	3.241553	2.228639	2
95	2.238972	3.639555	2
96	2.075077	3.650759	2
97	3.908256	2.855201	2
98	3.304940	2.721721	2
99	3.352008	3.768798	2

300 rows × 3 columns

import matplotlib.pyplot as plt
# Scatter plot of the whole data set; each point is coloured by its class
# label z using the Paired colormap.
xs, ys, labels = res['x'], res['y'], res['z']
plt.scatter(xs, ys, c=labels, cmap=plt.cm.Paired)

# Axis labels and title, then render the figure.
plt.title('random data')
plt.ylabel('y')
plt.xlabel('x')
plt.show()

在这里插入图片描述

# Replace the index with a fresh 0..n-1 range, because the concatenated frame
# carries repeated index values; drop=True discards the old index.
res.reset_index(inplace=True, drop=True)

# Rows whose index is a multiple of 4 form the test set; every other row goes
# to the training set.
is_test_row = (res.index % 4 == 0)
test = res[is_test_row]
train = res[~is_test_row]

from sklearn import svm
# Support-vector classifier with a linear kernel. C is the regularisation
# parameter: a larger C penalises margin violations more strongly, which
# results in a narrower margin.
clf = svm.SVC(C=1, kernel='linear')

# Training features (the two coordinates) and training labels (the class id).
X, y = train[['x', 'y']], train['z']

# Fit the classifier on the training set.
clf.fit(X, y)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# Draw the training points, coloured by class label.
plt.scatter(X['x'], X['y'], c=y, cmap=plt.cm.Paired)

# Take the current axes and read back the limits chosen for the scatter plot.
ax = plt.gca()
xlim, ylim = ax.get_xlim(), ax.get_ylim()

# 300 evenly spaced values along each axis, spanning the visible range.
xx = np.linspace(*xlim, 300)
yy = np.linspace(*ylim, 300)

# 300 x 300 coordinate grid; with 'ij' indexing XX[i, j] == xx[i] and
# YY[i, j] == yy[j].
XX, YY = np.meshgrid(xx, yy, indexing='ij')

# Flatten the grid into one (x, y) sample per row.
xy = np.column_stack((XX.ravel(), YY.ravel()))

# Predicted class label for every grid point, reshaped back onto the grid.
Z = clf.predict(xy).reshape(XX.shape)

# Contour lines of the label map, drawn in black; they trace where the
# predicted class changes.
ax.contour(XX, YY, Z, colors='k')

# Circle the support vectors with hollow black markers.
support = clf.support_vectors_
ax.scatter(support[:, 0], support[:, 1], s=100, linewidth=1,
           facecolors='none', edgecolors='k')

plt.show()

在这里插入图片描述

# Classify three hand-picked points lying on the x axis.
query_points = [[2, 0], [2.5, 0], [3, 0]]
clf.predict(query_points)

array([1, 0, 0], dtype=int64)

# Mean accuracy of the classifier on the training set.
train_accuracy = clf.score(X, y)
train_accuracy

0.9866666666666667

# Mean accuracy of the classifier on the held-out test set. The labels are
# passed as the 1-D column test['z'] (not a one-column DataFrame), matching
# how the training labels were selected.
clf.score(test[['x', 'y']], test['z'])

0.96