开始学习《机器学习实战》这本书,打算看完后再回头看周志华的《机器学习》。《机器学习实战》的代码都是用 NumPy 写的,有些麻烦,所以考虑用 pandas 来实现代码,同时也能回顾之前学的《利用 Python 进行数据分析》。感觉目前章节的测试方法太简陋,留着以后学了更多再回头写。
# coding: gbk
import pandas as pd
import numpy as np
def getdata(path):
    """Load a tab-separated sample file and min-max normalise its features.

    The file is read without a header row. Every column except the last is
    treated as a feature; the last column is treated as the class label.
    Each feature column is rescaled to ``(value - min) / (max - min)``.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        A ``(normal_chara, label)`` tuple: the normalised feature DataFrame
        and the label Series, both sharing the row index of the file.

    Note:
        The per-column minimum and range are not returned, so a query
        sample must be brought onto the same scale by the caller.
    """
    data = pd.read_csv(path, header=None, sep='\t')
    character = data.iloc[:, :-1]
    label = data.iloc[:, -1]

    chara_min = character.min()
    chara_range = character.max() - chara_min
    # A constant column has range 0; dividing by it would yield 0/0 = NaN and
    # poison every distance computed later. Use 1 instead so such a column
    # simply normalises to 0.
    chara_range = chara_range.replace(0, 1)

    normal_chara = (character - chara_min) / chara_range
    return normal_chara, label
def knn(inX, normal_chara, label, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: Feature values of the sample to classify, one per column of
            ``normal_chara``. It is subtracted from every row as-is, so it
            is expected to be on the same scale as ``normal_chara``.
        normal_chara: DataFrame of training features, one row per sample.
        label: Series of training labels, row-aligned with ``normal_chara``
            by position.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The most frequent label among the ``k`` training rows with the
        smallest Euclidean distance to ``inX``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # A negative k would silently slice from the wrong end, and k == 0
        # would fail later with an opaque IndexError.
        raise ValueError("k must be a positive integer")

    # Euclidean distance from inX to every training row.
    diff = normal_chara - inX
    distances = np.sqrt((diff ** 2).sum(axis=1))

    # Row positions ordered from nearest to farthest; the stable sort keeps
    # equally distant rows in their original order.
    nearest = np.argsort(distances.to_numpy(), kind="stable")[:k]

    # argsort yields positions, not index labels, so select with iloc; this
    # stays correct when label does not carry a default 0..n-1 index.
    votes = label.iloc[nearest].value_counts()
    return votes.index[0]