统计学习方法_kNN实现

最新推荐文章于 2019-11-24 16:41:31 发布

Eminem1147

最新推荐文章于 2019-11-24 16:41:31 发布

阅读量243

点赞数 1

CC 4.0 BY-SA版权

分类专栏：统计学习方法

本文链接：https://blog.youkuaiyun.com/qq_33765907/article/details/83054487

统计学习方法专栏收录该内容

9 篇文章

订阅专栏

数据集与上一篇文章不同，本文可以使用完整的 MNIST 数据集了，下载地址：MNIST。下面的代码从 ./data/train.csv 读取数据：第一列是标签，其余 784 列是 28×28 图像的像素值。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import time
import cv2
from sklearn.cross_validation import train_test_split

# Extract HOG features: 784 raw pixel values -> 324 descriptor values per image
def get_hog_features(trainset):
    """Compute a HOG descriptor for every flattened image in ``trainset``.

    Each item is reshaped to 28x28, cast to ``uint8`` and passed to a
    ``cv2.HOGDescriptor`` configured from ``./hog.xml``.

    Args:
        trainset: iterable of arrays, each reshapeable to (28, 28).

    Returns:
        A NumPy array reshaped to ``(-1, 324)``, i.e. one row per image
        (assumes the configuration in ``./hog.xml`` yields 324 values per
        image -- confirm against that file).
    """
    descriptors = []

    hog = cv2.HOGDescriptor('./hog.xml')  # load the descriptor configuration file

    for flat_image in trainset:
        # uint8 spans 0-255, the same range as the pixel values
        pixels = flat_image.reshape(28, 28).astype(np.uint8)
        descriptors.append(hog.compute(pixels))

    # First dimension is inferred; second is the 324-value descriptor length
    return np.array(descriptors).reshape(-1, 324)

def Predict(testset, trainset, train_labels, n_neighbors=None, verbose=True):
    """Classify each test vector by majority vote among its nearest neighbours.

    Distances are Euclidean. Among training points at the same distance the
    one with the lower index is preferred; when several classes tie on votes
    the smallest class label wins.

    Args:
        testset: sequence of test vectors, each of shape (D,).
        trainset: array-like of shape (N, D) holding the training vectors.
        train_labels: array-like of N class labels (any sortable values).
        n_neighbors: number of neighbours that vote. ``None`` (the default)
            uses the module-level ``k``. Values larger than N are clamped
            to N.
        verbose: when true, print the index of every test vector as it is
            processed, with a line break after every 100 indices.

    Returns:
        1-D NumPy array with one predicted label per test vector.

    Raises:
        ValueError: if the training set is empty, if ``trainset`` and
            ``train_labels`` differ in length, or if ``n_neighbors`` < 1.
    """
    if n_neighbors is None:
        n_neighbors = k  # module-level default neighbour count

    # Work in float64 so that integer/unsigned inputs cannot wrap around
    # when the difference of two vectors is taken.
    train = np.asarray(trainset, dtype=np.float64)
    labels = np.asarray(train_labels).ravel()

    if len(train) == 0:
        raise ValueError("trainset must contain at least one sample")
    if len(train) != len(labels):
        raise ValueError("trainset and train_labels must have the same length")
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be a positive integer")

    train = train.reshape(len(train), -1)
    n_neighbors = min(n_neighbors, len(train))

    # Vote over the classes that actually occur instead of assuming that the
    # labels are 0..k-1: `classes` is sorted, `encoded` maps each training
    # label to its position in `classes`.
    classes, encoded = np.unique(labels, return_inverse=True)
    encoded = encoded.ravel()

    predict = []
    for index, test_vec in enumerate(testset):
        if verbose:
            print(index, end=" ")  # progress: index of the current test vector
            if (index + 1) % 100 == 0:
                print()

        query = np.asarray(test_vec, dtype=np.float64).ravel()

        # Euclidean distance from the query to every training vector at once
        dists = np.linalg.norm(train - query, axis=1)

        # A stable sort keeps lower training indices first among equal distances
        nearest = np.argsort(dists, kind="stable")[:n_neighbors]

        # Count the votes per class; argmax returns the first (smallest) class
        # among those with the highest count.
        votes = np.bincount(encoded[nearest], minlength=len(classes))
        predict.append(classes[np.argmax(votes)])

    return np.array(predict, dtype=classes.dtype)

# Number of nearest neighbours that vote; Predict reads this module-level name.
k = 10  # the best k can be found by cross-validation

# Script entry point: load the CSV, extract HOG features, split the data,
# run the kNN prediction and report timings and accuracy.
# NOTE(review): train_test_split is imported at the top of the file from
# sklearn.cross_validation; newer scikit-learn releases provide it in
# sklearn.model_selection instead -- confirm against the installed version.
if __name__ == '__main__':
    print('Start reading data:')

    read_start = time.time()

    # Column 0 of the CSV holds the label, the remaining columns the pixels
    table = pd.read_csv('./data/train.csv').values
    img, labels = table[:, 1:], table[:, 0]

    print(img.shape)
    print(labels.shape)

    features = get_hog_features(img)
    print(features.shape)

    # Hold out 33% of the samples for testing; fixed seed for repeatability
    split = train_test_split(features, labels, test_size=0.33, random_state=11111)
    train_features, test_features, train_labels, test_labels = split

    read_end = time.time()
    print('read data cost %f seconds' % (read_end - read_start))

    # kNN has no training phase; the timing is kept for a uniform report
    print('Starting training:')
    print('knn do not need to train!')
    train_end = time.time()
    print('training cost %f seconds' % (train_end - read_end))

    print('Starting predicting:')
    test_predict = Predict(test_features, train_features, train_labels)
    predict_end = time.time()
    print('predicting cost %f seconds' % (predict_end - train_end))

    # Fraction of test samples whose predicted label equals the true label
    n_test = len(test_labels)
    hits = test_labels == test_predict.reshape(n_test)
    accuracy = np.sum(hits) / n_test
    print('The accuracy is %f!' % accuracy)

'''
Start reading data:
(42000, 784)
(42000,)
(42000, 324)
read data cost 6.009209 seconds
Starting training:
knn do not need to train!
training cost 0.000033 seconds
Starting predicting:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
to be continued
运行时间过长不算出accuracy了
'''