import numpy as np
import operator
def createDataSet():
    '''
    Build the four-point toy data set used to try out the kNN classifier.

    :return: tuple ``(group, labels)`` -- a 4x2 array of sample points and
             the list of their class labels, in matching row order
    '''
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    class_names = list('AABB')
    return np.array(samples), class_names
def classify0(inX, dataset, labels, k):
    '''
    Classify one input vector with the k-nearest-neighbours rule.

    :param inX: input vector to classify; a 1-D vector or a single-row 2-D
                array whose length matches the number of dataset columns
    :param dataset: 2-D array of training vectors, one sample per row
    :param labels: class labels, ``labels[i]`` belongs to ``dataset[i]``
    :param k: number of nearest neighbours that vote; a value larger than
              the number of training samples is clamped to that number
    :return: the label with the most votes among the k nearest samples;
             on a tie the label that was counted first (i.e. the one met
             earliest when walking the neighbours nearest-first) wins
    :raises ValueError: if ``k`` is smaller than 1 or ``dataset`` is empty
    '''
    samples = np.asarray(dataset)
    size = samples.shape[0]
    if k < 1:
        raise ValueError('k must be at least 1, got %r' % (k,))
    if size == 0:
        raise ValueError('dataset must contain at least one sample')

    # Euclidean distance from inX to every row; broadcasting repeats inX
    # across the rows, so no explicit np.tile copy is needed.
    diff = np.asarray(inX) - samples
    distance = (diff ** 2).sum(axis=1) ** 0.5

    # Indices of the k closest samples, nearest first.
    nearest = distance.argsort()[:min(k, size)]

    class_count = {}
    for idx in nearest:
        label = labels[idx]
        class_count[label] = class_count.get(label, 0) + 1

    # max() returns the first key holding the highest count, which matches
    # a stable descending sort on the counts.
    return max(class_count, key=class_count.get)
numpy的tile函数
tile(x,(row,col))
x:要重复的数据
row:在行方向上重复几次
col:在列方向上重复几次
字典:{},根据键取值
数组横向相加
myarray.sum(axis=1)
数组纵向相加:
myarray.sum(axis=0)
python中的乘方运算:**(对numpy数组是逐元素求幂,不是点乘),例如 d_value**2 表示对每个元素求平方
argsort()函数,是numpy库中的函数,返回的是数组值从小到大的索引值
for example:
One-dimensional array:一维数组
>>> x = np.array([3, 1, 2])
>>> np.argsort(x)
array([1, 2, 0])
___________________________________________________________________________________________________
利用K近邻算法改进约会网站问题:
import numpy as np
import matplotlib.pyplot as plt
import operator
def classify0(inX, dataset, labels, k):
    '''
    kNN classifier: label one input vector by majority vote of its k
    nearest training samples.

    :param inX: input vector to classify; a 1-D vector or a single-row 2-D
                array whose length matches the number of dataset columns
    :param dataset: 2-D array of training vectors, one sample per row
    :param labels: class labels, ``labels[i]`` belongs to ``dataset[i]``
    :param k: number of nearest neighbours that vote; a value larger than
              the number of training samples is clamped to that number
    :return: the label with the most votes among the k nearest samples;
             on a tie the label that was counted first (i.e. the one met
             earliest when walking the neighbours nearest-first) wins
    :raises ValueError: if ``k`` is smaller than 1 or ``dataset`` is empty
    '''
    samples = np.asarray(dataset)
    size = samples.shape[0]
    if k < 1:
        raise ValueError('k must be at least 1, got %r' % (k,))
    if size == 0:
        raise ValueError('dataset must contain at least one sample')

    # Euclidean distance from inX to every row; broadcasting repeats inX
    # across the rows, so no explicit np.tile copy is needed.
    diff = np.asarray(inX) - samples
    distance = (diff ** 2).sum(axis=1) ** 0.5

    # Indices of the k closest samples, nearest first.
    nearest = distance.argsort()[:min(k, size)]

    class_count = {}
    for idx in nearest:
        label = labels[idx]
        class_count[label] = class_count.get(label, 0) + 1

    # max() returns the first key holding the highest count, which matches
    # a stable descending sort on the counts.
    return max(class_count, key=class_count.get)
def get_dating_data(filename):
    '''
    Load the dating data set from a tab-separated text file.

    Every non-blank line holds three numeric feature columns followed by a
    class label in the last column. The label may be an integer, or one of
    the textual names ``didntLike`` / ``smallDoses`` / ``largeDoses``,
    which are mapped to 1 / 2 / 3 so the file no longer has to be edited
    by hand. Blank lines are skipped.

    :param filename: path of the file to read
    :return: tuple ``(dataset, labels)`` -- an N x 3 float array of
             features and a list of N integer labels
    :raises ValueError: if a line has fewer than four columns, or a
                        feature or label cannot be converted to a number
    '''
    label_codes = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}
    rows = []
    labels = []
    # The context manager closes the file even if parsing fails half-way.
    with open(filename) as fh:
        for line_no, raw in enumerate(fh, start=1):
            line = raw.strip()
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) < 4:
                raise ValueError(
                    '%s:%d: expected 3 features and a label, got %d column(s)'
                    % (filename, line_no, len(fields)))
            rows.append([float(value) for value in fields[:3]])
            tag = fields[-1]
            labels.append(label_codes[tag] if tag in label_codes else int(tag))
    # reshape keeps the (0, 3) shape for a file without any data lines.
    dataset = np.array(rows, dtype=float).reshape(-1, 3)
    return dataset, labels
def plot_data(dataset, labels):
    '''
    Show a scatter plot of the first two feature columns.

    Marker size and marker colour are both driven by the class label
    (scaled by 15.0), so each class gets its own size and colour.

    :param dataset: 2-D array; column 0 is plotted on x, column 1 on y
    :param labels: numeric class label for every row of ``dataset``
    '''
    x_values = dataset[:, 0]
    y_values = dataset[:, 1]
    marker_sizes = 15.0 * np.array(labels)
    marker_colors = 15.0 * np.array(labels)
    plt.scatter(x_values, y_values, marker_sizes, marker_colors)
    plt.show()
def auto_norm(dataset):
    '''
    Min-max normalise every column of ``dataset`` into the range [0, 1].

    :param dataset: 2-D array, one sample per row and one feature per column
    :return: tuple ``(norm_dataset, ranges, min_val)`` -- the normalised
             array, the per-column divisor that was used, and the
             per-column minimum. New samples can be normalised the same
             way with ``(x - min_val) / ranges``.

    A column whose values are all equal has a spread of 0; its divisor is
    reported as 1 so the column normalises to 0 instead of NaN and callers
    can divide by ``ranges`` safely.
    '''
    dataset = np.asarray(dataset)
    min_val = dataset.min(0)
    max_val = dataset.max(0)
    # Named ``ranges`` rather than ``range`` so the builtin is not shadowed.
    ranges = max_val - min_val
    ranges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    norm_dataset = (dataset - min_val) / ranges
    return norm_dataset, ranges, min_val
def datingClassTest(filename='datingTestSet.txt', test_ratio=0.1, k=3):
    '''
    Measure the error rate of the kNN classifier on the dating data.

    The first ``test_ratio`` share of the normalised samples is held out
    as the test set; the remaining samples are the training set.

    :param filename: data file passed to ``get_dating_data``
    :param test_ratio: fraction of the samples (taken from the start of the
                       file) used for testing
    :param k: number of neighbours passed to ``classify0``
    :return: the error rate, misclassified test samples / test samples
    :raises ValueError: if the hold-out set would be empty
    '''
    dataset, labels = get_dating_data(filename)
    normset, _ranges, _min_val = auto_norm(dataset)
    m = normset.shape[0]
    test_m = int(m * test_ratio)
    if test_m == 0:
        raise ValueError(
            'no test samples: %d sample(s) with test_ratio=%r' % (m, test_ratio))

    # The training part is the same for every test sample; slice it once.
    train_set = normset[test_m:m, :]
    train_labels = labels[test_m:m]

    error = 0
    for i in range(test_m):
        predict_class = classify0(normset[i, :], train_set, train_labels, k)
        if predict_class != labels[i]:
            error += 1

    error_rate = error / test_m
    # Apply the format with %; passing the value as a second print argument
    # would print the literal "%f" followed by the number.
    print("error_rate: %f" % error_rate)
    return error_rate
def classify_person():
    '''
    Interactively classify one person with the dating data set.

    Prompts on stdin for three numbers (flight distance, time spent playing
    games, ice cream consumed), normalises them with the statistics of
    ``datingTestSet.txt``, runs the 3-nearest-neighbour classifier and
    prints the verdict.
    '''
    ffmiles = float(input('飞机飞行公里:'))
    time_playgames = float(input('玩游戏时间:'))
    ice_cream = float(input('吃冰淇淋毫升:'))
    inX = np.array([ffmiles, time_playgames, ice_cream])

    dataset, labels = get_dating_data('datingTestSet.txt')
    normset, ranges, min_val = auto_norm(dataset)
    # The query must be scaled exactly like the training data.
    predict_class = classify0((inX - min_val) / ranges, normset, labels, 3)

    # Label 1 and 2 have their own message; every other label falls back
    # to the "large doses" message.
    messages = {1: 'not at all ', 2: 'in small doses'}
    print(messages.get(predict_class, 'in large doses'))
# Script entry: start the interactive dating classifier when this file runs.
classify_person()
手写数字识别:先将数字图片转化为txt格式,再用kNN进行分类,最终输出的错误率约为1.16%
# ————————————————————— 手写数字识别 ————————————————————————
def image2vector(filename):
    '''
    Read a 32x32 text image of digits into a 1 x 1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.

    :param filename: path of the text image
    :return: array of shape (1, 1024)
    :raises ValueError: if the file has fewer than 32 lines, a line has
                        fewer than 32 characters, or a character is not a
                        digit
    '''
    side = 32
    vec = np.zeros((1, side * side))
    # The context manager closes the file even when a line is malformed.
    with open(filename) as fh:
        for i in range(side):
            line = fh.readline().rstrip('\r\n')
            if len(line) < side:
                raise ValueError(
                    '%s: line %d has %d character(s), expected at least %d'
                    % (filename, i + 1, len(line), side))
            start = i * side
            vec[0, start:start + side] = [int(ch) for ch in line[:side]]
    return vec
def handwriting_test(train_dir='trainingDigits', test_dir='testDigits', k=3):
    '''
    Measure the kNN error rate on the handwritten-digit text images.

    Every file in ``train_dir`` becomes one training vector; every file in
    ``test_dir`` is classified against them. The true label of a file is
    the integer before the first ``_`` of its name (extension removed).

    :param train_dir: directory holding the training images
    :param test_dir: directory holding the test images
    :param k: number of neighbours passed to ``classify0``
    :return: the error rate, misclassified test files / test files
    :raises ValueError: if ``test_dir`` contains no files
    '''
    # Function-scope import: the file-level imports do not provide listdir.
    import os

    def label_of(file_name):
        # Turn the leading part of the name before '_' into the int label.
        return int(file_name.split('.')[0].split('_')[0])

    train_files = os.listdir(train_dir)
    labels = [label_of(name) for name in train_files]
    vec = np.zeros((len(train_files), 1024))
    for i, name in enumerate(train_files):
        vec[i, :] = image2vector(os.path.join(train_dir, name))

    test_files = os.listdir(test_dir)
    test_num = len(test_files)
    if test_num == 0:
        raise ValueError('no test files found in %r' % (test_dir,))

    error = 0
    for name in test_files:
        test_vec = image2vector(os.path.join(test_dir, name))
        if classify0(test_vec, vec, labels, k) != label_of(name):
            error += 1

    error_rate = error / test_num
    # Apply the format with %; passing the value as a second print argument
    # would print the literal "%s" followed by the number.
    print('error rate: %s' % error_rate)
    return error_rate
# Script entry: run the handwritten-digit evaluation when this file runs.
handwriting_test()