Background reading:
K-Nearest Neighbors Algorithm (Python implementation)
K-Nearest Neighbors (KNN) Explained, with a Python Implementation
K-Nearest Neighbors (KNN), kd-Trees, and Their Python Implementation
Full code: https://github.com/taifyang/machine-learning
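All of the implementations below measure similarity with the Minkowski distance of order p, dist(a, b) = (Σ|a_i - b_i|^p)^(1/p), which reduces to the familiar Euclidean distance for p = 2; prediction is then a majority vote among the k training samples closest to the query point.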
Python implementation (brute force):
import numpy as np
import operator

class KNN():
    def __init__(self, x, y, k, p):
        self.k = k  # number of neighbors
        self.p = p  # order of the Minkowski distance
        self.x = x  # training features
        self.y = y  # training labels

    def predict(self, x):
        diff = np.tile(x, (self.x.shape[0], 1)) - self.x  # difference between the query point and every training sample
        dist = np.linalg.norm(diff, ord=self.p, axis=1, keepdims=False)  # p-norm of each row
        dist_sorted = dist.argsort()  # indices that sort the distances in ascending order
        # vote among the k nearest neighbors
        count = {}
        for i in range(self.k):
            vote = self.y[dist_sorted[i]]
            count[vote] = count.get(vote, 0) + 1
        # sort the vote counts in descending order and return the winning label
        count_sorted = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
        return count_sorted[0][0]

if __name__ == '__main__':
    x = np.array([[0, 10], [1, 8], [10, 1], [7, 4]])
    y = np.array([0, 0, 1, 1])
    knn = KNN(x, y, 3, 2)
    print("Prediction:", knn.predict(np.array([[6, 2]])))
Python implementation (kd-tree):
import numpy as np
from copy import deepcopy
from collections import Counter

def partition_sort(arr, k, key=lambda x: x):
    """
    Partition the array around the pivot at position k (quickselect), so that no
    element left of the pivot is greater than any element right of it
    :param arr: array to partition
    :param k: number of elements in front of the pivot
    :param key: comparison key
    :return: None
    """
    start, end = 0, len(arr) - 1
    assert 0 <= k <= end
    while True:
        i, j, pivot = start, end, deepcopy(arr[start])
        while i < j:
            # scan from the right for an element smaller than the pivot
            while i < j and key(pivot) <= key(arr[j]):
                j -= 1
            if i == j: break
            arr[i] = arr[j]
            i += 1
            # scan from the left for an element larger than the pivot
            while i < j and key(arr[i]) <= key(pivot):
                i += 1
            if i == j: break
            arr[j] = arr[i]
            j -= 1
        arr[i] = pivot
        if i == k:
            return
        elif i < k:
            start = i + 1
        else:
            end = i - 1
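# Illustrative check of partition_sort (not part of the original code): after
# partitioning on axis 0 with k = 2, arr[2] holds the median of the first
# coordinate and nothing to its left exceeds it:
#   arr = [[9, 6], [4, 7], [7, 2], [5, 4], [8, 1]]
#   partition_sort(arr, 2, key=lambda x: x[0])
#   arr[2] == [7, 2]  # 7 is the median of 9, 4, 7, 5, 8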
def max_heapreplace(heap, new_node, key=lambda x: x[1]):
    """
    Replace the root of a max-heap and sift the new node down
    :param heap: max-heap (list)
    :param new_node: new node
    :return: None
    """
    heap[0] = new_node
    root, child = 0, 1
    end = len(heap) - 1
    while child <= end:
        if child < end and key(heap[child]) < key(heap[child + 1]):
            child += 1
        if key(heap[child]) <= key(new_node):
            break
        heap[root] = heap[child]
        root, child = child, 2 * child + 1
    heap[root] = new_node

def max_heappush(heap, new_node, key=lambda x: x[1]):
    """
    Push an element onto a max-heap and sift it up
    :param heap: max-heap (list)
    :param new_node: new node
    :return: None
    """
    heap.append(new_node)
    pos = len(heap) - 1
    while 0 < pos:
        parent_pos = pos - 1 >> 1
        if key(new_node) <= key(heap[parent_pos]):
            break
        heap[pos] = heap[parent_pos]
        pos = parent_pos
    heap[pos] = new_node
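# Illustrative check of the heap helpers (not part of the original code): the pair
# with the largest distance always sits at heap[0], so the worst of the current k
# candidates can be inspected and replaced in O(log k):
#   heap = []
#   for d in [3.0, 1.0, 2.0]:
#       max_heappush(heap, (None, d))
#   heap[0][1] == 3.0
#   max_heapreplace(heap, (None, 0.5))  # evict the worst candidate
#   heap[0][1] == 2.0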
class KDNode(object):
    """kd-tree node"""
    def __init__(self, data=None, label=None, left=None, right=None, axis=None, parent=None):
        """
        Constructor
        :param data: feature vector
        :param label: sample label
        :param left: left child
        :param right: right child
        :param axis: splitting axis
        :param parent: parent node
        """
        self.data = data
        self.label = label
        self.left = left
        self.right = right
        self.axis = axis
        self.parent = parent

class KDTree(object):
    """kd-tree"""
    def __init__(self, X, y=None):
        """
        Constructor
        :param X: feature set, n_samples * n_features
        :param y: label set, 1 * n_samples
        """
        self.root = None
        self.y_valid = y is not None
        self.create(X, y)

    def create(self, X, y=None):
        """
        Build the kd-tree
        :param X: feature set, n_samples * n_features
        :param y: label set, 1 * n_samples
        :return: None
        """
        def create_(X, axis, parent=None):
            """
            Recursively build the kd-tree
            :param X: input set with labels appended as the last column
            :param axis: splitting axis
            :param parent: parent node
            :return: KDNode
            """
            n_samples = np.shape(X)[0]
            if n_samples == 0:
                return None
            mid = n_samples >> 1
            partition_sort(X, mid, key=lambda x: x[axis])  # place the median on this axis at index mid
            if self.y_valid:
                kd_node = KDNode(X[mid][:-1], X[mid][-1], axis=axis, parent=parent)
            else:
                kd_node = KDNode(X[mid], axis=axis, parent=parent)
            next_axis = (axis + 1) % k_dimensions
            kd_node.left = create_(X[:mid], next_axis, kd_node)
            kd_node.right = create_(X[mid + 1:], next_axis, kd_node)
            return kd_node

        print('building kd-tree...')
        k_dimensions = np.shape(X)[1]
        if y is not None:
            X = np.hstack((np.array(X), np.array([y]).T)).tolist()  # append labels as an extra column
        self.root = create_(X, 0)

    def search_knn(self, point, k, p=2):
        """
        Search the kd-tree for the k nearest neighbors of a sample
        :param point: query point
        :param k: number of neighbors
        :param p: order of the Minkowski distance
        :return: list of (KDNode, distance) pairs in ascending order of distance
        """
        def search_knn_(kd_node):
            """
            Search for the k nearest nodes
            :param kd_node: KDNode
            :return: None
            """
            if kd_node is None:
                return
            data = kd_node.data
            distance = np.linalg.norm(np.array(data) - np.array(point), ord=p, axis=None, keepdims=False)  # p-norm
            if len(heap) < k:
                # fewer than k candidates so far: push onto the max-heap
                max_heappush(heap, (kd_node, distance))
            elif distance < heap[0][1]:
                # closer than the current k-th nearest: replace the heap root
                max_heapreplace(heap, (kd_node, distance))
            axis = kd_node.axis
            if abs(point[axis] - data[axis]) < heap[0][1] or len(heap) < k:
                # the candidate hypersphere intersects the splitting hyperplane,
                # or the heap holds fewer than k elements: search both subtrees
                search_knn_(kd_node.left)
                search_knn_(kd_node.right)
            elif point[axis] < data[axis]:
                search_knn_(kd_node.left)
            else:
                search_knn_(kd_node.right)

        if self.root is None:
            raise Exception('kd-tree must not be null.')
        if k < 1:
            raise ValueError('k must be greater than 0.')
        heap = []
        search_knn_(self.root)
        return sorted(heap, key=lambda x: x[1])

class KNeighborsClassifier(object):
    """k-nearest-neighbor classifier"""
    def __init__(self, k, p=2):
        """Constructor"""
        self.k = k
        self.p = p
        self.kd_tree = None

    def fit(self, X, y):
        """Build the kd-tree"""
        self.kd_tree = KDTree(X, y)

    def predict(self, X):
        """Predict labels by majority vote among the k nearest neighbors"""
        if self.kd_tree is None:
            raise TypeError('Classifier must be fitted before predict!')
        search_knn = lambda x: self.kd_tree.search_knn(point=x, k=self.k, p=self.p)
        y_pre = []
        for x in X:
            y = Counter(r[0].label for r in search_knn(x)).most_common(1)[0][0]
            y_pre.append(y)
        return y_pre

if __name__ == '__main__':
    x = np.array([[0, 10], [1, 8], [10, 1], [7, 4]])
    y = np.array([0, 0, 1, 1])
    # kdtree = KDTree(x)
    # print(kdtree.search_knn([6, 2], 3))
    knn = KNeighborsClassifier(3)
    knn.fit(x, y)
    print("Prediction:", knn.predict(np.array([[6, 2]])))
Python with scikit-learn:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

if __name__ == '__main__':
    x = np.array([[0, 10], [1, 8], [10, 1], [7, 4]])
    y = np.array([0, 0, 1, 1])
    knn = KNeighborsClassifier(n_neighbors=3, p=2)
    knn.fit(x, y)
    print("Prediction:", knn.predict(np.array([[6, 2]])))
C++ implementation (brute force):
#include <iostream>
#include <vector>
#include <map>
#include <cmath>
#include <cstdlib>
#include <algorithm>

// k-nearest-neighbor model
class KNN
{
public:
    KNN(std::vector<std::vector<float>> x, std::vector<float> y, int k, float p) : m_x(x), m_y(y), m_k(k), m_p(p) {};

    float predict(std::vector<std::vector<float>> x)
    {
        // replicate the query point so it lines up with every training sample
        x.resize(m_x.size());
        for (size_t i = 0; i < x.size(); i++)
        {
            x[i] = x[0];
        }
        // difference between the query point and the training data
        std::vector<std::vector<float>> diff = x;
        for (size_t i = 0; i < diff.size(); i++)
        {
            for (size_t j = 0; j < diff[0].size(); j++)
            {
                diff[i][j] -= m_x[i][j];
            }
        }
        // Minkowski norm of each row
        std::vector<float> dist(diff.size(), 0);
        for (size_t i = 0; i < diff.size(); i++)
        {
            for (size_t j = 0; j < diff[0].size(); j++)
            {
                dist[i] += std::pow(std::fabs(diff[i][j]), m_p);
            }
            dist[i] = std::pow(dist[i], 1.0 / m_p);
        }
        // indices that sort the distances in ascending order
        std::vector<int> dist_sorted(dist.size());
        for (size_t i = 0; i != dist_sorted.size(); ++i) dist_sorted[i] = i;
        std::sort(dist_sorted.begin(), dist_sorted.end(), [&dist](size_t i, size_t j) {return dist[i] < dist[j]; });
        // vote among the k nearest neighbors
        std::map<float, int> count;
        for (int i = 0; i < m_k; i++)
        {
            float vote = m_y[dist_sorted[i]];
            count[vote] += 1;
        }
        // return the label with the most votes (std::map orders by key, not by count)
        return std::max_element(count.begin(), count.end(),
            [](const std::pair<float, int>& a, const std::pair<float, int>& b) {return a.second < b.second; })->first;
    }

private:
    std::vector<std::vector<float>> m_x;
    std::vector<float> m_y;
    int m_k;
    float m_p;
};

int main(int argc, char* argv[])
{
    std::vector<std::vector<float>> x = { { 0, 10 },{ 1, 8 },{ 10, 1 },{ 7, 4 } };
    std::vector<float> y = { 0, 0, 1, 1 };
    KNN knn = KNN(x, y, 3, 2);
    std::cout << "Prediction: " << knn.predict({ { 6, 2 } }) << std::endl;
    system("pause");
    return EXIT_SUCCESS;
}
C++ implementation (kd-tree):
#include <iostream>
#include <vector>
#include <map>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <stdexcept>
#include <assert.h>

// Partition arr around the pivot at position k (quickselect), comparing on dimension key,
// so that no element left of the pivot is greater than any element right of it.
void partition_sort(std::vector<std::vector<float>>& arr, int k, int key)
{
    int start = 0, end = arr.size() - 1;
    assert(k >= 0 && k <= end);
    while (true)
    {
        int i = start, j = end;
        std::vector<float> pivot = arr[start];
        while (i < j)
        {
            // scan from the right for an element smaller than the pivot
            while (i < j && pivot[key] <= arr[j][key]) j -= 1;
            if (i == j) break;
            arr[i] = arr[j];
            i += 1;
            // scan from the left for an element larger than the pivot
            while (i < j && arr[i][key] <= pivot[key]) i += 1;
            if (i == j) break;
            arr[j] = arr[i];
            j -= 1;
        }
        arr[i] = pivot;
        if (i == k)
            return;
        else if (i < k)
            start = i + 1;
        else
            end = i - 1;
    }
}

// kd-tree node
class KDNode
{
public:
    KDNode(std::vector<float> data = {}, float label = -1, int axis = -1, KDNode* left = NULL, KDNode* right = NULL, KDNode* parent = NULL) :
        m_data(data), m_label(label), m_axis(axis), m_left(left), m_right(right), m_parent(parent) {};

public:
    std::vector<float> m_data;  // feature vector
    float m_label;              // sample label
    int m_axis;                 // splitting axis
    KDNode* m_left;
    KDNode* m_right;
    KDNode* m_parent;
};

// a (node, distance) pair stored in the max-heap of candidates
struct Tuple
{
    KDNode* kd_node;
    float distance;
};

// replace the root of the max-heap and sift the new node down
void max_heapreplace(std::vector<Tuple>& heap, Tuple new_node)
{
    heap[0] = new_node;
    int root = 0, child = 1;
    int end = heap.size() - 1;
    while (child <= end)
    {
        if (child < end && heap[child].distance < heap[child + 1].distance)
            child += 1;
        if (heap[child].distance <= new_node.distance)
            break;
        heap[root] = heap[child];
        root = child;
        child = 2 * child + 1;
    }
    heap[root] = new_node;
}

// push an element onto the max-heap and sift it up
void max_heappush(std::vector<Tuple>& heap, Tuple new_node)
{
    heap.push_back(new_node);
    int pos = heap.size() - 1;
    while (0 < pos)
    {
        int parent_pos = (pos - 1) >> 1;
        if (new_node.distance <= heap[parent_pos].distance)
            break;
        heap[pos] = heap[parent_pos];
        pos = parent_pos;
    }
    heap[pos] = new_node;
}

// kd-tree
class KDTree
{
public:
    KDTree(std::vector<std::vector<float>> X, std::vector<float> y = {})
    {
        m_root = NULL;
        m_y_valid = y.size() > 0;
        create(X, y);
    }

    // recursively build the kd-tree (nodes are never freed; acceptable for a demo)
    KDNode* create_(std::vector<std::vector<float>> X, int axis, int k_dimensions, KDNode* parent = NULL)
    {
        int n_samples = X.size();
        if (n_samples == 0)
            return NULL;
        int mid = n_samples >> 1;
        partition_sort(X, mid, axis);  // place the median on this axis at index mid
        KDNode* kd_node;
        if (m_y_valid)
        {
            // the label was appended as the last column; split it off again
            std::vector<float> X_data(X[mid].begin(), X[mid].end() - 1);
            kd_node = new KDNode(X_data, X[mid][X[0].size() - 1], axis, NULL, NULL, parent);
        }
        else
        {
            kd_node = new KDNode(X[mid], -1, axis, NULL, NULL, parent);
        }
        int next_axis = (axis + 1) % k_dimensions;
        std::vector<std::vector<float>> Xleft_data(X.begin(), X.begin() + mid);
        kd_node->m_left = create_(Xleft_data, next_axis, k_dimensions, kd_node);
        std::vector<std::vector<float>> Xright_data(X.begin() + mid + 1, X.end());
        kd_node->m_right = create_(Xright_data, next_axis, k_dimensions, kd_node);
        return kd_node;
    }

    // build the kd-tree; labels, if given, are appended to each sample as an extra column
    void create(std::vector<std::vector<float>> X, std::vector<float> y = {})
    {
        std::cout << "building kd-tree..." << std::endl;
        int k_dimensions = X[0].size();
        if (y.size())
        {
            for (size_t i = 0; i < X.size(); i++)
            {
                X[i].push_back(y[i]);
            }
        }
        m_root = create_(X, 0, k_dimensions);
    }

    // Minkowski distance of order p
    float p_dist(std::vector<float> data, std::vector<float> point, float p)
    {
        float dist = 0.0;
        for (size_t i = 0; i < data.size(); i++)
        {
            dist += std::pow(std::fabs(data[i] - point[i]), p);
        }
        return std::pow(dist, 1.0 / p);
    }

    // recursive k-nearest-neighbor search
    void search_knn_(std::vector<Tuple>& heap, KDNode* kd_node, std::vector<float> point, int k, float p)
    {
        if (kd_node == NULL)
            return;
        std::vector<float> data = kd_node->m_data;
        float distance = p_dist(data, point, p);
        Tuple tuple;
        tuple.kd_node = kd_node;
        tuple.distance = distance;
        if (heap.size() < (size_t)k)
            max_heappush(heap, tuple);     // fewer than k candidates so far: push
        else if (distance < heap[0].distance)
            max_heapreplace(heap, tuple);  // closer than the current k-th nearest: replace the heap root
        int axis = kd_node->m_axis;
        if (fabs(point[axis] - data[axis]) < heap[0].distance || heap.size() < (size_t)k)
        {
            // the candidate hypersphere intersects the splitting hyperplane,
            // or the heap holds fewer than k elements: search both subtrees
            search_knn_(heap, kd_node->m_left, point, k, p);
            search_knn_(heap, kd_node->m_right, point, k, p);
        }
        else if (point[axis] < data[axis])
            search_knn_(heap, kd_node->m_left, point, k, p);
        else
            search_knn_(heap, kd_node->m_right, point, k, p);
    }

    // search the kd-tree for the k nearest neighbors of a point
    std::vector<Tuple> search_knn(std::vector<float> point, int k, float p = 2)
    {
        if (m_root == NULL)
            throw std::runtime_error("kd-tree must not be null.");
        if (k < 1)
            throw std::runtime_error("k must be greater than 0.");
        std::vector<Tuple> heap = {};
        search_knn_(heap, m_root, point, k, p);
        std::sort(heap.begin(), heap.end(), [](Tuple tuple1, Tuple tuple2) {return tuple1.distance < tuple2.distance; });
        return heap;
    }

private:
    KDNode* m_root;
    bool m_y_valid;
};

// k-nearest-neighbor classifier
class KNeighborsClassifier
{
public:
    KNeighborsClassifier(int k, float p = 2, KDTree* kd_tree = NULL) : m_k(k), m_p(p), m_kd_tree(kd_tree) {};

    // build the kd-tree
    void fit(std::vector<std::vector<float>> x, std::vector<float> y)
    {
        m_kd_tree = new KDTree(x, y);
    }

    // predict labels by majority vote among the k nearest neighbors
    std::vector<float> predict(std::vector<std::vector<float>> x)
    {
        if (m_kd_tree == NULL)
            throw std::runtime_error("Classifier must be fitted before predict!");
        std::vector<float> y_pre(x.size());
        for (size_t i = 0; i < x.size(); i++)
        {
            std::map<float, int> count;
            std::vector<Tuple> heap = m_kd_tree->search_knn(x[i], m_k, m_p);
            for (size_t j = 0; j < heap.size(); j++)
            {
                ++count[heap[j].kd_node->m_label];
            }
            // pick the label with the most votes (std::map orders by key, not by count)
            y_pre[i] = std::max_element(count.begin(), count.end(),
                [](const std::pair<float, int>& p1, const std::pair<float, int>& p2) {return p1.second < p2.second; })->first;
        }
        return y_pre;
    }

private:
    int m_k;
    float m_p;
    KDTree* m_kd_tree;
};

int main(int argc, char* argv[])
{
    std::vector<std::vector<float>> x = { { 0, 10 },{ 1, 8 },{ 10, 1 },{ 7, 4 } };
    std::vector<float> y = { 0, 0, 1, 1 };
    //KDTree kdtree = KDTree(x);
    //std::vector<Tuple> heap = kdtree.search_knn({ 6, 2 }, 3);
    //for (size_t j = 0; j < heap.size(); j++)
    //{
    //    std::cout << heap[j].distance << std::endl;
    //}
    KNeighborsClassifier knn = KNeighborsClassifier(3);
    knn.fit(x, y);
    std::cout << "Prediction: " << knn.predict({ { 6, 2 } })[0] << std::endl;
    system("pause");
    return EXIT_SUCCESS;
}
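All four programs print the same prediction, label 1, for the query point (6, 2): its three nearest training samples are (7, 4) and (10, 1), both labeled 1, and (1, 8), labeled 0.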