Machine Learning 1: Classification
Preparing Data for Modeling
In practical scenarios, we usually set aside an additional Dev dataset as a development (hold-out) set and use it to tune the model after it has been built. The model is learned from the training set, its metrics such as accuracy are then measured on the Dev data, and, based on those results, the model is tuned further if it needs to improve.
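As a minimal sketch of such a three-way split (the 80/10/10 ratio and the stand-in data are assumptions for illustration, not part of the recipe below), two successive calls to train_test_split can carve out a Dev set:

import numpy as np
from sklearn.model_selection import train_test_split

data = np.arange(100).reshape(50, 2)                  # stand-in dataset for illustration
train, rest = train_test_split(data, train_size=0.8)  # 80% for training
dev, test = train_test_split(rest, train_size=0.5)    # split the remaining 20% into Dev and test
print(train.shape, dev.shape, test.shape)             # (40, 2) (5, 2) (5, 2)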
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import numpy as np

def get_iris():
    # Load the iris data and stack the features and labels into one array
    data = load_iris()
    x = data['data']
    y = data['target']
    input_dataset = np.column_stack([x, y])
    np.random.shuffle(input_dataset)
    return input_dataset
# Split the dataset 80/20
data=get_iris()
train,test=train_test_split(data,train_size=0.8)
print("train size",train.shape)
print("test size ",test.shape)
# Check whether the class labels are reasonably distributed across the training and test sets
def get_class_distribution(y):
    # Count how many records belong to each class label
    distribution = {}
    set_y = set(y)
    for y_label in set_y:
        no_elements = len(np.where(y == y_label)[0])
        distribution[y_label] = no_elements
    return distribution
def print_class_label_split(train, test):
    y_train = train[:, -1]
    train_distribution = get_class_distribution(y_train)
    print("\nTrain data set class label distribution")
    print("======================================\n")
    for k, v in train_distribution.items():
        print("class label=%d, number of records=%d" % (k, v))
    y_test = test[:, -1]
    test_distribution = get_class_distribution(y_test)
    print("\nTest data set class label distribution")
    print("======================================\n")
    for k, v in test_distribution.items():
        print("class label=%d, number of records=%d" % (k, v))
print_class_label_split(train,test)
train size (120, 5)
test size  (30, 5)

Train data set class label distribution
======================================

class label=0, number of records=36
class label=1, number of records=42
class label=2, number of records=42

Test data set class label distribution
======================================

class label=0, number of records=14
class label=1, number of records=8
class label=2, number of records=8
How to split the class labels evenly between the training and test sets
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on the label column so each class keeps the same proportion in both splits
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
for train_index, test_index in stratified_split.split(data[:, :-1], data[:, -1]):
    train = data[train_index]
    test = data[test_index]
print_class_label_split(train, test)
Train data set class label distribution
======================================

class label=0, number of records=40
class label=1, number of records=40
class label=2, number of records=40

Test data set class label distribution
======================================

class label=0, number of records=10
class label=1, number of records=10
class label=2, number of records=10
Finding Nearest Neighbors
First we need the confusion matrix: a matrix that lays out the true class labels against the predicted labels.
|          | Predicted T | Predicted F |
|----------|-------------|-------------|
| Actual T | TP          | FN          |
| Actual F | FP          | TN          |
- TP: True Positive. The number of test instances whose true label is T and whose predicted label is also T.
- FN: False Negative. The number of test instances whose true label is T but whose predicted label is F.
- FP: False Positive. The number of test instances whose true label is F but whose predicted label is T.
- TN: True Negative. The number of test instances whose true label is F and whose predicted label is also F.
Accuracy is the fraction of predictions that are correct out of all predictions made. From the confusion matrix we know that TP + TN is the number of correct predictions, so accuracy = (TP + TN) / (TP + TN + FP + FN).
Training-set accuracy is always optimistic; the test-set accuracy is what we should look at to judge how well the model really performs.
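For instance, here is a hedged sketch of computing the confusion matrix and accuracy with sklearn.metrics (the toy labels are invented for illustration):

import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score

y_true = np.array([1, 1, 0, 1, 0, 0, 1, 0])   # invented true labels
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])   # invented predictions

# Rows are true labels, columns are predictions: [[TN, FP], [FN, TP]]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("accuracy =", (tp + tn) / (tp + tn + fp + fn))   # 0.75
print("accuracy =", accuracy_score(y_true, y_pred))    # same value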
The K-nearest-neighbors algorithm (KNN) loads the entire training set into memory. When it needs to classify a test instance, it measures the distance between that instance and every training instance and, based on those distances, selects the K closest training instances. The predicted class of the test instance is the majority class among these K nearest neighbors.
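Before using scikit-learn's implementation below, here is a hand-rolled sketch of that idea (this NumPy version is illustrative only, not the recipe's code):

import numpy as np
from collections import Counter

def knn_predict(train_x, train_y, query, k=3):
    # Euclidean distance from the query point to every training instance
    distances = np.linalg.norm(train_x - query, axis=1)
    # Indices of the k closest training instances
    nearest = np.argsort(distances)[:k]
    # Majority vote among the labels of those k neighbors
    return Counter(train_y[nearest]).most_common(1)[0][0]

train_x = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [5.2, 4.9]])
train_y = np.array([0, 0, 1, 1])
print(knn_predict(train_x, train_y, np.array([5.0, 5.0])))  # -> 1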
# Prepare the data
from sklearn.datasets import make_classification
import numpy as np
import matplotlib.pyplot as plt
import itertools
from sklearn.neighbors import KNeighborsClassifier

def get_data():
    x, y = make_classification(n_features=4, n_samples=1000)
    return x, y

def plot_data(x, y):
    """
    Plot a scatter plot of all variable combinations
    """
    subplot_start = 321
    col_numbers = range(0, 4)
    col_pairs = itertools.combinations(col_numbers, 2)
    plt.figure(figsize=(16, 9))
    plt.subplots_adjust(wspace=0.36, hspace=0.2)
    for col_pair in col_pairs:
        plt.subplot(subplot_start)
        plt.scatter(x[:, col_pair[0]], x[:, col_pair[1]], c=y)
        title_string = str(col_pair[0]) + "-" + str(col_pair[1])
        plt.title(title_string)
        plt.xlabel(str(col_pair[0]))
        plt.ylabel(str(col_pair[1]))
        subplot_start += 1
x,y=get_data()
plot_data(x,y)
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

def get_train_test(x, y):
    # Stratified 70/30 split that preserves the class proportions
    train_size = 0.7
    input_data = np.column_stack([x, y])
    stratified_split = StratifiedShuffleSplit(n_splits=1, train_size=train_size)
    for train_idx, test_idx in stratified_split.split(input_data[:, :-1], input_data[:, -1]):
        train_x = input_data[train_idx, :-1]
        train_y = input_data[train_idx, -1]
        test_x = input_data[test_idx, :-1]
        test_y = input_data[test_idx, -1]
    return train_x, train_y, test_x, test_y
def build_model(x, y, k=2):
    # Fit a K-nearest-neighbors classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x, y)
    return knn

def test_model(x, y, knn_model):
    y_predicted = knn_model.predict(x)
    print(classification_report(y, y_predicted))
x,y=get_data()
plot_data(x,y)
train_x,train_y,test_x,test_y=get_train_test(x,y)
knn_model=build_model(train_x,train_y)
test_model(test_x, test_y, knn_model)    # report on the held-out test split
test_model(train_x, train_y, knn_model)  # report on the training split
             precision    recall  f1-score   support

        0.0       0.89      0.94      0.91        50
        1.0       0.94      0.88      0.91        50

avg / total       0.91      0.91      0.91       100


             precision    recall  f1-score   support

        0.0       0.90      1.00      0.95       353
        1.0       1.00      0.88      0.94       347

avg / total       0.95      0.94      0.94       700
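The training-set report is noticeably more optimistic than the test-set one, as expected. In the spirit of the Dev-set tuning described at the start of this chapter, one way to choose k is to sweep several values and compare held-out accuracy; here is a sketch reusing build_model from above (the range of k values is an assumption, not from the recipe):

from sklearn.metrics import accuracy_score

# Try a few odd values of k and score each model on the held-out split
for k in range(1, 12, 2):
    model = build_model(train_x, train_y, k=k)
    print("k=%d, accuracy=%0.3f" % (k, accuracy_score(test_y, model.predict(test_x))))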
Classifying Documents with Naive Bayes
The algorithm is driven by Bayes' rule, given by the following formula:

$$P(X \mid Y) = \frac{P(Y \mid X)\,P(X)}{P(Y)}$$
This formula expresses how probable event X is once we know that event Y has occurred. In Bayesian terms, we must first define the conditional probabilities: the probability that a review is positive given the review, and the probability that it is negative given the review, written as the following equations:

$$P(\text{positive} \mid \text{review}) = \frac{P(\text{review} \mid \text{positive})\,P(\text{positive})}{P(\text{review})}$$

$$P(\text{negative} \mid \text{review}) = \frac{P(\text{review} \mid \text{negative})\,P(\text{negative})}{P(\text{review})}$$
For any review, once we have these two probability values we can classify it as positive or negative by comparing them: if the negative conditional probability is greater than the positive one, the review is classified as negative, and vice versa.
To compare the two equations and reach a decision, we can ignore the denominator, since it is just a common scaling factor. The left-hand side of each equation is called the posterior probability. In the numerator on the right-hand side, $P(\text{review} \mid \text{positive}) \cdot P(\text{positive})$, the term $P(\text{positive})$ is the prior probability of a positive review, i.e. our prior belief about the distribution of positive class labels, which we estimate from the training set with the following formula:

$$P(\text{positive}) = \frac{\text{number of positive reviews in the training set}}{\text{total number of reviews in the training set}}$$
$P(\text{review} \mid \text{positive})$ is the likelihood; it answers the question: given that the class is positive, how likely is this review?
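A toy worked example makes the comparison concrete (all numbers here are invented for illustration):

# Priors estimated from invented label counts: 70 positive, 30 negative reviews
p_positive, p_negative = 0.7, 0.3
# Assumed likelihoods of one particular review under each class
lik_pos, lik_neg = 0.01, 0.04

score_pos = lik_pos * p_positive    # unnormalized posterior, positive: 0.007
score_neg = lik_neg * p_negative    # unnormalized posterior, negative: 0.012
print("negative" if score_neg > score_pos else "positive")   # -> negative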
# Load the libraries and the data
from nltk.corpus import movie_reviews, stopwords
from sklearn.model_selection import StratifiedShuffleSplit
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

def get_data():
    # Build (word list, category) pairs from the movie_reviews corpus
    dataset = []
    ylabel = []
    for cat in movie_reviews.categories():
        for fileid in movie_reviews.fileids(cat):
            words = list(movie_reviews.words(fileid))
            dataset.append((words, cat))
            ylabel.append(cat)
    return dataset, ylabel
def get_train_test(input_dataset, ylabel):
    # Stratified 80/20 split over the review categories
    stratified_split = StratifiedShuffleSplit(n_splits=1, train_size=0.8)
    for train_index, test_index in stratified_split.split(input_dataset, ylabel):
        train = [input_dataset[i] for i in train_index]
        train_y = [ylabel[i] for i in train_index]
        test = [input_dataset[i] for i in test_index]
        test_y = [ylabel[i] for i in test_index]
    return train, train_y, test, test_y
# Build the model
def build_word_features(instance):
    # Bag-of-words features: mark every word in the review as present
    feature_set = {}
    words = instance[0]
    for word in words:
        feature_set[word] = 1
    return (feature_set, instance[1])
def build_negate_features(instance):