k近邻算法(kNN)

核心思想:在已标注的训练数据中,找出与新数据距离最近(最相似)的前k个样本,将这k个样本中出现次数最多的类别作为新数据的类别。
核心函数:计算距离函数,投票函数

# -*- coding: utf-8 -*-
import random
import sys
from collections import Counter
from operator import itemgetter

# Python 2 only: make UTF-8 the codec used for implicit str/unicode
# conversion. Python 3 has no built-in reload() and no
# sys.setdefaultencoding(), and its default encoding is already UTF-8, so
# the hack is skipped there instead of failing with NameError.
if sys.version_info[0] < 3:
    # reload() brings back sys.setdefaultencoding, which is removed from the
    # sys module during interpreter start-up.
    reload(sys)
    sys.setdefaultencoding("utf-8")

def createTriangle():
    """Draw random integer angles for a triangle and classify the triangle.

    Angles A and B are each drawn from 1..179 (inclusive); the draw is
    repeated until the remaining angle C = 180 - A - B is positive.

    Returns:
        A tuple (angleA, angleB, angleC, label). label is 'R' when the
        largest angle is below 90 (acute), 'Z' when it is exactly 90
        (right) and 'D' otherwise (obtuse).
    """
    while True:
        first = random.randint(1, 179)
        second = random.randint(1, 179)
        third = 180 - first - second
        if third > 0:
            break

    # The largest angle alone decides the class.
    widest = max(first, second, third)
    if widest > 90:
        kind = 'D'
    elif widest == 90:
        kind = 'Z'
    else:
        kind = 'R'
    return first, second, third, kind

def createDataSet(n, filename):
    """Write n randomly generated, labelled triangles to a text file.

    Each output line holds the three angles followed by the class label,
    separated by tab characters and terminated by a newline. The file is
    opened in 'w' mode, so existing content is replaced.

    Args:
        n: number of triangles (lines) to generate.
        filename: path of the file to write.
    """
    # The with-block closes the file even if generation or writing raises.
    with open(filename, 'w') as f:
        for _ in range(n):
            angleA, angleB, angleC, label = createTriangle()
            f.write('%s\t%s\t%s\t%s\n' % (angleA, angleB, angleC, label))


def loadFile2Matrix(filename):
    """Load a tab-separated data set into parallel label and feature lists.

    Every non-blank line is split on tabs; the last column is taken as the
    class label and all preceding columns are converted to int.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple (labelMatrix, dataMatrix): labelMatrix[i] is the label
        string of row i and dataMatrix[i] is its list of int features.

    Raises:
        ValueError: if a feature column cannot be converted to int.
    """
    labelMatrix = []
    dataMatrix = []
    # The with-block guarantees the handle is closed; iterating the file
    # object avoids reading every line into memory first.
    with open(filename, 'r') as f:
        for line in f:
            if not line.strip():
                # A blank line would otherwise yield an empty label with an
                # empty feature row.
                continue
            fields = line.rstrip('\r\n').split('\t')
            labelMatrix.append(fields[-1])  # the last column is the class label
            dataMatrix.append([int(value) for value in fields[:-1]])
    return labelMatrix, dataMatrix

def calcDistance(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    The components of vecA are paired with the components of vecB at the
    same index. Two empty vectors are at distance 0.0.

    Args:
        vecA: sequence of numbers; its length decides how many components
            are compared.
        vecB: sequence of numbers with at least len(vecA) components.

    Returns:
        The distance as a float.

    Raises:
        IndexError: if vecB has fewer components than vecA.
    """
    sumOfSquare = sum((a - vecB[i]) ** 2 for i, a in enumerate(vecA))
    # Take the root once, after the sum; doing it inside the loop left the
    # result undefined for empty input and repeated the work per component.
    return sumOfSquare ** (1.0 / 2)

def vote(candidateList):
    """Return the most frequent item of candidateList, converted to str.

    Args:
        candidateList: iterable of hashable class labels.

    Returns:
        str() of the label with the highest count. On Python 3.7+ a tie is
        resolved in favour of the label that was encountered first.

    Raises:
        ValueError: if candidateList contains no items.
    """
    counts = Counter(candidateList)
    if not counts:
        raise ValueError('candidateList must not be empty')
    # most_common works on both Python 2 and 3, unlike dict.iteritems().
    winner, _ = counts.most_common(1)[0]
    return str(winner)

def kNN(testVec, dataMatrix, labelMatrix, k):
    """Classify testVec by majority vote among its k nearest training rows.

    Args:
        testVec: feature vector to classify.
        dataMatrix: sequence of training feature vectors.
        labelMatrix: labels of the training rows; labelMatrix[i] belongs to
            dataMatrix[i].
        k: number of nearest neighbours that take part in the vote. When k
            exceeds the number of training rows, all rows vote.

    Returns:
        The winning label as returned by vote().

    Raises:
        ValueError: if k is not positive or dataMatrix is empty.
    """
    if k <= 0:
        raise ValueError('k must be a positive integer')
    if len(dataMatrix) == 0:
        raise ValueError('dataMatrix must not be empty')

    # Keep one (distance, label) pair per training row. A dict keyed by
    # distance would let rows at the same distance overwrite one another,
    # silently dropping neighbours.
    neighbours = [(calcDistance(testVec, row), labelMatrix[i])
                  for i, row in enumerate(dataMatrix)]
    # Order by distance only; the sort is stable, so rows at equal distance
    # keep their training-set order.
    neighbours.sort(key=itemgetter(0))

    # Vote once, over the labels of the k closest rows.
    candidateList = [label for _, label in neighbours[:k]]
    return vote(candidateList)

def testClassifier(testDataMatrix, testLabelMatrix, dataMatrix, labelMatrix, k):
    """Measure and print the accuracy of the kNN classifier on a test set.

    Every test vector is classified with kNN() against the training data
    and the prediction is compared with the true label at the same index.

    Args:
        testDataMatrix: sequence of test feature vectors.
        testLabelMatrix: true labels; testLabelMatrix[i] belongs to
            testDataMatrix[i].
        dataMatrix: training feature vectors handed to kNN().
        labelMatrix: training labels handed to kNN().
        k: number of neighbours handed to kNN().

    Returns:
        The fraction of correctly classified test vectors, as a float. The
        same value is also printed.

    Raises:
        ValueError: if testDataMatrix is empty.
    """
    total = len(testDataMatrix)
    if total == 0:
        raise ValueError('testDataMatrix must not be empty')

    errorCount = 0
    for i, vec in enumerate(testDataMatrix):
        classifyLabel = kNN(vec, dataMatrix, labelMatrix, k)
        # The comparison must run for every sample; outside the loop only
        # the last prediction would ever be scored.
        if classifyLabel != testLabelMatrix[i]:
            errorCount += 1

    precision = 1 - float(errorCount) / total
    # Single pre-formatted argument: prints the same text under both the
    # Python 2 print statement and the Python 3 print function. The message
    # wording is kept exactly as before.
    print('The pricision of this kNN classifier is: %s' % precision)
    return precision

if __name__ == '__main__':
    # Demo run: generate 10000 training and 1000 test triangles, load both
    # files back, and report the accuracy of a 5-nearest-neighbour vote.
    trainFile = 'dataSet.txt'
    testFile = 'testDataSet.txt'

    createDataSet(10000, trainFile)
    labelMatrix, dataMatrix = loadFile2Matrix(trainFile)

    createDataSet(1000, testFile)
    testLabelMatrix, testDataMatrix = loadFile2Matrix(testFile)

    testClassifier(testDataMatrix, testLabelMatrix, dataMatrix, labelMatrix, k=5)




评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值