sklearn官方数据集:http://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets
机器学习分类器实例:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the iris dataset: flower measurements and their class labels.
iris = datasets.load_iris()
features = iris.data
labels = iris.target

# Hold out 30% of the samples for testing; the remaining 70% train the model.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3)

# Fit a k-nearest-neighbours classifier on the training split.
classifier = KNeighborsClassifier()
classifier.fit(features_train, labels_train)

# Predict on the held-out samples, then show the true labels for comparison.
print(classifier.predict(features_test))
print(labels_test)
[2 2 2 0 2 2 1 0 0 2 0 0 2 2 2 1 2 1 0 1 0 2 1 1 0 2 2 0 1 1 0 0 2 0 0 1 2
2 1 1 1 2 0 0 1]
[2 2 2 0 2 2 1 0 0 2 0 0 2 2 2 1 2 1 0 1 0 2 1 2 0 2 2 0 1 1 0 0 2 0 0 1 2
2 1 1 1 2 0 0 1]
线性回归预测房价数据(注:boston 数据集已于 scikit-learn 1.2 移除,可改用 california housing 数据集)
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression

# NOTE: datasets.load_boston() was removed in scikit-learn 1.2 (the dataset
# had ethical problems), so this example uses the California housing dataset
# instead. Each sample has 8 numeric attributes; the target is the median
# house value of a district.
housing = datasets.fetch_california_housing()
data_X = housing.data
data_y = housing.target

# Ordinary least-squares linear regression.
model = LinearRegression()
model.fit(data_X, data_y)

# Compare the model's predictions against the known targets for the first
# four samples.
print(model.predict(data_X[:4, :]))
print(data_y[:4])
一些属性:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression

# NOTE: datasets.load_boston() was removed in scikit-learn 1.2, so fit the
# model on the California housing dataset instead.
housing = datasets.fetch_california_housing()
data_X = housing.data
data_y = housing.target

# Ordinary least-squares linear regression.
model = LinearRegression()
model.fit(data_X, data_y)

# For the fitted line y = k*x + b:
# coef_ holds the slope(s) k, one per input feature.
print(model.coef_)
# intercept_ is the bias term b.
print(model.intercept_)
# The hyper-parameters the estimator was configured with.
print(model.get_params())
# R^2 on the training data: the closer to 1, the better the fit.
print(model.score(data_X, data_y))
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Generate a synthetic regression dataset: 100 samples, one input feature,
# one target. noise is the standard deviation of the Gaussian noise added
# to the targets -- the larger it is, the more scattered the points.
# FIX: n_targets must be 1 here. With n_targets=2, y has shape (100, 2)
# while X has 100 values, and plt.scatter raises
# "x and y must be the same size".
X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1,
                                noise=1)
plt.scatter(X, y)
plt.show()
数据的正规化Normalization
正规化(标准化)数据可以减少资料的偏差与跨度,提升机器学习的成效。
from sklearn import preprocessing  # data-standardisation utilities
import numpy as np

# A small sample matrix whose columns have wildly different scales.
sample = np.array([[10, 2.7, 3.6],
                   [-100, 5, -2],
                   [120, 20, 40]], dtype=np.float64)

# scale() standardises each column to zero mean and unit variance; print
# the standardised matrix.
print(preprocessing.scale(sample))
# [[ 0.         -0.85170713 -0.55138018]
#  [-1.22474487 -0.55187146 -0.852133  ]
#  [ 1.22474487  1.40357859  1.40351318]]
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# FIX: sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# make_classification is imported from sklearn.datasets directly.
from sklearn.datasets import make_classification
# Support Vector Classifier from the SVM family.
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Build a synthetic 2-feature, 2-class dataset. scale=100 multiplies the
# features by 100 so the raw data is deliberately badly scaled.
X, y = make_classification(n_samples=300, n_features=2,
                           n_redundant=0, n_informative=2,
                           random_state=22, n_clusters_per_class=1,
                           scale=100)
print(y)
'''
[0 1 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 1 0 0 1 0 1 1 1 0 1 0 0 0 1 1 1
 1 1 0 0 1 0 1 1 0 1 1 1 1 0 0 0 1 1 0 1 1 1 0 1 0 1 0 0 0 1 1 1 1 0 1 1 0
 0 0 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1
 0 0 0 0 1 1 0 1 0 1 0 0 1 0 1 1 0 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 1 0 1 1 1
 1 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1
 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 0 1 0 0 1
 1 0 1 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 1 1 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1
 0 1 1 0 0 0 1 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 1 1 0 0 0 0 1 1
 1 1 0 0]
'''
# Train an SVC on the raw (unscaled) data.
X_train, X_test, y_train, y_test = train_test_split(X, y)
model1 = SVC()
model1.fit(X_train, y_train)
# Accuracy on the held-out split -- poor because of the feature scale.
print(model1.score(X_test, y_test))
# 0.426666666667

# Standardise the features and train again: accuracy improves sharply.
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y)
model2 = SVC()
model2.fit(X_train, y_train)
print(model2.score(X_test, y_test))
# 0.946666666667

# Scatter plot of the (scaled) samples; c=y colours each point by its
# class label.
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
from sklearn import svm
from sklearn import datasets
# FIX: sklearn.externals.joblib was removed in scikit-learn 0.23; use the
# standalone joblib package instead.
import joblib

# Fit an SVC on the full iris dataset.
clf = svm.SVC()
iris = datasets.load_iris()
X = iris.data
y = iris.target
clf.fit(X, y)

# Persist the fitted model to disk ...
joblib.dump(clf, 'clf.pkl')
# ... and load it back into a new estimator object.
clf2 = joblib.load('clf.pkl')