机器学习实战第一节,K近邻算法

本文介绍了K近邻算法的基础知识,并通过numpy库中的函数进行数据处理,如tile函数、数组的相加操作和argsort函数。还讨论了如何使用K近邻算法解决约会网站问题,将数字图片转化为文本识别,最终得到约1.16%的错误率。
import numpy as np
import operator

def createDataSet():
    """Build the four-point demo data set used to try out the classifier.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4x2 array of sample
        points and ``labels`` is the list of class names, one per row.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    class_names = ['A', 'A', 'B', 'B']
    return np.array(samples), class_names

def classify0(inX, dataset, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: Input vector to classify.
        dataset: Array of known sample vectors, one per row.
        labels: Labels of the samples, indexed like the rows of ``dataset``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k nearest samples.
    """
    n_samples = dataset.shape[0]

    # Euclidean distance from inX to every row of the data set.
    offsets = np.tile(inX, (n_samples, 1)) - dataset
    distance = ((offsets ** 2).sum(axis=1)) ** 0.5
    nearest_first = distance.argsort()

    # Tally the labels of the k closest samples.
    votes = {}
    for rank in range(k):
        neighbour_label = labels[nearest_first[rank]]
        if neighbour_label in votes:
            votes[neighbour_label] += 1
        else:
            votes[neighbour_label] = 1

    # Order the (label, count) pairs by count, largest first
    # (reverse=True means descending), then return the winning label.
    ranked = list(votes.items())
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]


numpy 的 tile 函数

tile(x,(row,col))

x:要重复的数据

row:在行方向上重复几次

col:在列方向上重复几次


字典:{},根据键取值

 

数组横向相加

myarray.sum(axis=1)

数组纵向相加:

myarray.sum(axis=0)

 

python中的幂运算:**(对numpy数组是逐元素求幂,例如 d_value**2 是每个元素取平方,并不是点乘)

 

argsort()函数,是numpy库中的函数,返回的是数组值从小到大的索引值

for example:

One-dimensional array:一维数组

>>> x = np.array([3, 1, 2])

>>> np.argsort(x)

array([1, 2, 0])



___________________________________________________________________________________________________


利用K近邻算法改进约会网站问题:

import numpy as np
import matplotlib.pyplot as plt
import operator


def classify0(inX, dataset, labels, k):
    """Run the k-nearest-neighbours vote for a single input vector.

    Args:
        inX: Input vector to classify.
        dataset: Array of known sample vectors, one per row.
        labels: Labels of the samples, indexed like the rows of ``dataset``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k nearest samples.
    """
    row_count = dataset.shape[0]

    # Euclidean distance between inX and each stored sample.
    delta = np.tile(inX, (row_count, 1)) - dataset
    squared = delta ** 2
    distance = squared.sum(axis=1) ** 0.5
    order = distance.argsort()

    # Count how often each label appears among the k closest samples.
    tally = {}
    for position in range(k):
        nearest_label = labels[order[position]]
        tally.setdefault(nearest_label, 0)
        tally[nearest_label] += 1

    # Sort (label, count) pairs by count in descending order and keep the
    # label of the first pair.
    by_count = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner = by_count[0]
    return winner[0]


def get_dating_data(filename):
    """Load the dating data set from a tab-separated text file.

    Every line must hold three numeric feature columns followed by an integer
    class label in the last column.  The label column has to be numeric
    already: in the raw text file replace largeDoses -> 3, smallDoses -> 2
    and didntLike -> 1 before loading.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(dataset, labels)``: ``dataset`` is an ``(n_lines, 3)``
        float array of features and ``labels`` is a list of ``n_lines`` ints.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number,
            or a line has too few columns to fill the three features.
    """
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as data_file:
        lines = data_file.readlines()

    dataset = np.zeros((len(lines), 3))
    labels = []
    for row, line in enumerate(lines):
        fields = line.strip().split('\t')
        # Convert explicitly instead of relying on numpy's string casting.
        dataset[row, :] = [float(value) for value in fields[0:3]]
        labels.append(int(fields[-1]))

    return dataset, labels


def plot_data(dataset, labels):
    """Show a scatter plot of the first two feature columns.

    Marker size and marker colour are both 15 times the numeric label, so
    each class gets its own size and colour.
    """
    first_feature = dataset[:, 0]
    second_feature = dataset[:, 1]
    marker_sizes = 15.0 * np.array(labels)
    marker_colours = 15.0 * np.array(labels)
    plt.scatter(first_feature, second_feature, marker_sizes, marker_colours)
    plt.show()


def auto_norm(dataset):
    """Rescale every feature column of ``dataset`` to the interval [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to 0
    instead of producing NaN from a 0/0 division.

    Args:
        dataset: 2-D numpy array with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(norm_dataset, ranges, min_val)``: the normalised array,
        the per-column ``max - min`` (zero for constant columns) and the
        per-column minimum.
    """
    min_val = dataset.min(0)
    max_val = dataset.max(0)
    # Named ``ranges`` so the builtin ``range`` is not shadowed.
    ranges = max_val - min_val

    # Divide constant columns by 1: their numerator is already 0.
    safe_ranges = np.where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row.
    norm_dataset = (dataset - min_val) / safe_ranges

    return norm_dataset, ranges, min_val


def datingClassTest(filename='datingTestSet.txt', test_ratio=0.1, k=3):
    """Measure the classifier's error rate on a hold-out slice of the data.

    The data is loaded and normalised, the first ``test_ratio`` fraction of
    the rows is used as the test set, and the remaining rows serve as the
    known samples for the k-nearest-neighbours vote.

    Args:
        filename: Path of the dating data file.
        test_ratio: Fraction of the rows (taken from the start) to test on.
        k: Number of neighbours used by the classifier.

    Returns:
        The fraction of test rows that were misclassified; 0.0 when the
        hold-out slice is empty.
    """
    dataset, labels = get_dating_data(filename)
    normset, _, _ = auto_norm(dataset)
    m = normset.shape[0]
    test_m = int(m * test_ratio)

    # Rows from test_m onwards are the reference samples.
    train_x = normset[test_m:m, :]
    train_y = labels[test_m:m]

    error = 0
    for i in range(test_m):
        predict_class = classify0(normset[i, :], train_x, train_y, k)
        if predict_class != labels[i]:
            error += 1

    # Avoid dividing by zero when there are no test rows.
    error_rate = error / test_m if test_m else 0.0
    # Apply the format with %; passing the value as a second print argument
    # would print the literal "%f".
    print("error_rate: %f" % error_rate)
    return error_rate


def classify_person(filename='datingTestSet.txt', k=3):
    """Interactively classify one person from three features typed by the user.

    Prompts for the three feature values, normalises them with the same
    minimum and range as the stored data set, runs the k-nearest-neighbours
    vote and prints the verdict.

    Args:
        filename: Path of the dating data file used as the reference set.
        k: Number of neighbours used by the classifier.

    Raises:
        ValueError: If an entered value cannot be parsed as a float.
    """
    ffmiles = float(input('飞机飞行公里:'))
    time_playgames = float(input('玩游戏时间:'))
    ice_cream = float(input('吃冰淇淋毫升:'))

    inX = np.array([ffmiles, time_playgames, ice_cream])
    dataset, labels = get_dating_data(filename)
    normset, ranges, min_val = auto_norm(dataset)

    # Guard against a zero range (constant feature column) so the query does
    # not turn into NaN/inf.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    predict_class = classify0((inX - min_val) / safe_ranges, normset, labels, k)

    # Labels 1 and 2 have their own text; any other label is reported as
    # 'in large doses'.
    verdicts = {1: 'not at all ', 2: 'in small doses'}
    print(verdicts.get(predict_class, 'in large doses'))

# Module-level call: prompt for one person's features and print the prediction.
classify_person()


将数字图片转化为txt格式的手写数字识别,最后输出错误率大概为1.16%

# —————————————————————    手写数字识别     ————————————————————————

def image2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    Args:
        filename: Path of a text file whose first 32 lines each start with
            at least 32 digit characters.

    Returns:
        A ``(1, 1024)`` float array; element ``i * 32 + j`` holds the digit
        found at line ``i``, column ``j``.

    Raises:
        IndexError: If a line is shorter than 32 characters, or the file has
            fewer than 32 lines.
        ValueError: If a character is not a digit.
    """
    vec = np.zeros((1, 1024))
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as image_file:
        for i in range(32):
            line = image_file.readline()
            for j in range(32):
                vec[0, i * 32 + j] = int(line[j])

    return vec


def _digit_label(file_name):
    """Return the digit encoded before the first '_' of a name like '3_12.txt'."""
    return int(file_name.split('.')[0].split('_')[0])


def handwriting_test(train_dir='trainingDigits', test_dir='testDigits', k=3):
    """Measure the k-NN error rate on the handwritten-digit text images.

    Every file in ``train_dir`` is loaded as a reference sample, then every
    file in ``test_dir`` is classified and compared with the digit encoded
    in its file name.

    Args:
        train_dir: Directory holding the reference digit files.
        test_dir: Directory holding the digit files to classify.
        k: Number of neighbours used by the classifier.

    Returns:
        The fraction of test files that were misclassified; 0.0 when the
        test directory is empty.
    """
    # Imported here because the visible file header never imports listdir,
    # so the bare name would raise NameError.
    from os import listdir

    train_files = listdir(train_dir)
    labels = [_digit_label(file_name) for file_name in train_files]
    vec = np.zeros((len(train_files), 1024))
    for i, file_name in enumerate(train_files):
        vec[i, :] = image2vector('%s/%s' % (train_dir, file_name))

    test_files = listdir(test_dir)
    test_num = len(test_files)
    error = 0
    for test_name in test_files:
        test_vec = image2vector('%s/%s' % (test_dir, test_name))
        if classify0(test_vec, vec, labels, k) != _digit_label(test_name):
            error += 1

    # Avoid dividing by zero when there is nothing to test.
    error_rate = error / test_num if test_num else 0.0
    # Apply the format with %; passing the value as a second print argument
    # would print the literal "%s".
    print('error rate: %s' % error_rate)
    return error_rate


# Module-level call: run the handwritten-digit evaluation.
handwriting_test()



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值