The theory follows Li Hang's Statistical Learning Methods (《统计学习方法》); the code is largely adapted from another author's implementation, transcribed after working through it and then reorganized.
To run it, place the dataset (breast_cancer.csv) in the same directory as the code below.
class Node(object):
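    # A KD-tree node: `feature` is the index of the splitting dimension,
    # `split` holds the (sample, label) pair stored at this node, and
    # father/left/right link the node into the tree.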
def __init__(self):
self.father = None
self.left = None
self.right = None
self.feature = None
self.split = None
def __str__(self):
return "feature: %s, split: %s" % (str(self.feature), str(self.split))
    @property
    def brother(self):
        # The other child of this node's father, or None at the root.
        if not self.father:
            return None
        if self.father.left is self:
            return self.father.right
        return self.father.left
# def testNode():
# n1 = Node()
# n1.split = [(0,1)]
# n1.feature = 0
#
# n2 = Node()
# n2.split = [(1, 2)]
# n2.feature = 1
#
# n3 = Node()
# n3.split = [(2, 3)]
# n3.feature = 2
#
# n1.left = n2
# n2.father = n1
# n1.right = n3
# n3.father = n1
# print(n2.brother)
# testNode()
class KDtree(object):
def __init__(self):
self.root = Node()
def __str__(self):
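        # BFS dump of the tree: each line is "parent_id->node_id" followed by
        # the node itself, where ids are assigned in visit order.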
nd = self.root
level = 0
queue = [(nd, -1)]
res = []
while queue:
nd, preLevel = queue.pop(0)
res.append("%d->%d" % (preLevel, level) + str(nd))
if nd.left is not None:
queue.append((nd.left, level))
if nd.right is not None:
queue.append((nd.right, level))
level += 1
return "\n".join(res)
def get_feature_s2(self, X, feature, idx):
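        # Variance of one feature over the samples indexed by idx, via
        # Var[x] = E[x^2] - (E[x])^2.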
ex = 0
ex2 = 0
n = len(idx)
for i in idx:
ex += X[i][feature]
ex2 += X[i][feature] ** 2
return ex2 / n - (ex / n) ** 2
def get_best_feature(self, X, idx):
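        # Split on the feature with the largest variance.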
best_feature = 0
maxS2 = 0
for f in range(len(X[0])):
curS2 = self.get_feature_s2(X, f, idx)
if curS2 > maxS2:
maxS2 = curS2
best_feature = f
return best_feature
def get_median(self, X, best_feature, idx):
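        # Return the index of a sample whose best_feature value is the
        # (upper) median among the samples in idx.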
midVal = sorted(X[i][best_feature] for i in idx)[len(idx) // 2]
res = idx[0]
for i in idx:
if X[i][best_feature] == midVal:
res = i
break
return res
def split_by_median(self, X, best_feature, mid, idx):
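        # Partition idx (excluding the median sample itself) by best_feature;
        # values equal to the median go to the right subtree.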
div = [[], []]
for i in idx:
if i == mid:
continue
if X[i][best_feature] < X[mid][best_feature]:
div[0].append(i)
else:
div[1].append(i)
return div
def create_KDtree(self, X, y):
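        # Build the tree iteratively (BFS): each node stores the median sample
        # along the highest-variance feature; the remaining samples go to the
        # left or right child depending on that feature's value.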
queue = [(self.root,range(len(X)))]
while queue:
node,idx = queue.pop(0)
n = len(idx)
if n == 1:
node.split = (X[idx[0]],y[idx[0]])
continue
best_feature = self.get_best_feature(X,idx)
median = self.get_median(X,best_feature,idx)
div = self.split_by_median(X,best_feature,median,idx)
node.feature = best_feature
node.split = (X[median],y[median])
            if div[0] != []:  # note: Python's is/is not compare identity; ==/!= compare values
node.left = Node()
node.left.father = node
queue.append((node.left,div[0]))
if div[1] != []:
node.right = Node()
node.right.father = node
queue.append((node.right,div[1]))
def get_eu_dist(self,Xi,node):
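        # Euclidean distance between Xi and the sample stored at `node`.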
dist2 = 0
for i in range(len(Xi)):
dist2 += (node.split[0][i] - Xi[i]) ** 2
return dist2 ** 0.5
def get_hyper_dist(self,Xi,node):
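        # Distance from Xi to the splitting hyperplane of `node`.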
return abs(Xi[node.feature] - node.split[0][node.feature])
def search_leave(self,Xi,node_ori):
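        # Descend from node_ori to a leaf, at each split following the side
        # of the hyperplane that contains Xi.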
node = node_ori
while node.left or node.right:
if not node.left:
node = node.right
elif not node.right:
node = node.left
else:
if Xi[node.feature] < node.split[0][node.feature]:
node = node.left
else:
node = node.right
return node
# def testKDtree():
# t1 = KDtree()
# t1.create_KDtree([[6,2],[3,5],[8,1],[6,3],[4,9],[5,0],[1,2]],[1,2,3,4,5,6,7])
# # print(t1.get_eu_dist([2,8],t1.root.left.left))
# print(t1.root.left.left.brother)  # brother is a property, not a method
# testKDtree()
class MaxHeap(object):
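    # Fixed-capacity max-heap ordered by fn(item). During the KNN search it
    # keeps the k best candidates, with the worst (largest fn) at the root.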
def __init__(self,maxSize,fn):
self.max_size = maxSize
self._items = [None] * maxSize
self.size = 0
self.fn = fn
def insert(self,item):
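        # When full, only accept items smaller than the current maximum,
        # replacing the root and restoring the heap property; otherwise
        # append at the end and sift up.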
if self.size == self.max_size:
if self.fn(item) < self.fn(self._items[0]):
self._items[0] = item
self.shift_down(0)
else:
self.size += 1
self._items[self.size - 1] = item
self.shift_up(self.size - 1)
def shift_down(self,parent):
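        # Sift the item at `parent` down, swapping with its larger child
        # until the heap property holds.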
child = parent * 2 + 1
while child < self.size:
if child + 1 < self.size and self.fn(self._items[child]) < self.fn((self._items[child + 1])):
child += 1
if self.fn(self._items[child]) > self.fn((self._items[parent])):
self._items[child],self._items[parent] = self._items[parent],self._items[child]
parent = child
child = parent * 2 + 1
else:
break
def shift_up(self,child):
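        # Sift the newly appended item up while it is larger than its parent.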
parent = (child - 1) // 2
while child > 0:
            if self.fn(self._items[child]) <= self.fn(self._items[parent]):
                break
else:
self._items[child], self._items[parent] = self._items[parent],self._items[child]
child = parent
parent = (child - 1) // 2
# mp = MaxHeap(20,lambda x:x)
# mp.insert(1)
# mp.insert(3)
# mp.insert(4)
# mp.insert(2)
# for i in range(mp.size):
# print(mp._items[i])
class KNeighbors(object):
def __init__(self):
self.kneighbors = 0
self.tree = KDtree()
def fit(self,X,y,kneighbors):
self.kneighbors = kneighbors
self.tree = KDtree()
self.tree.create_KDtree(X,y)
def knn_search(self,Xi):
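        # Backtracking search: descend to a leaf first, then walk back up;
        # a brother subtree is explored only while the heap is not yet full
        # or the splitting hyperplane is closer than the current worst
        # distance in the heap.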
heap = MaxHeap(self.kneighbors,lambda x:x.dist)
tree = self.tree
leave = tree.search_leave(Xi,tree.root)
queue = [(tree.root,leave)]
while queue:
nd_root,nd_cur = queue.pop(0)
nd_root.dist = tree.get_eu_dist(Xi,nd_root)
heap.insert(nd_root)
while nd_cur is not nd_root:
nd_cur.dist = tree.get_eu_dist(Xi,nd_cur)
heap.insert(nd_cur)
hyper_dist = tree.get_hyper_dist(Xi, nd_cur.father)
if nd_cur.brother and (heap.size < heap.max_size or hyper_dist < heap.fn(heap._items[0])):
_nd = tree.search_leave(Xi,nd_cur.brother)
queue.append((nd_cur.brother,_nd))
nd_cur = nd_cur.father
return heap
    def predict(self,Xi):
        # Majority vote over the k nearest neighbors (labels are 0/1):
        # predict positive when more than half of the neighbors are positive.
        heap = self.knn_search(Xi)
        return sum(heap._items[i].split[1] for i in range(heap.size)) > heap.size // 2
from numpy.random import seed, choice
import numpy as np
def train_test_split(data, label=None, prob=0.7, random_state=None):
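    # Randomly pick ~prob of the rows (without replacement) as the training
    # set; everything else becomes the test set.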
if random_state is not None:
seed(random_state)
n_rows, _ = data.shape
k = int(n_rows * prob)
train_indexes = choice(range(n_rows), size=k, replace=False)
    train_set = set(train_indexes)
    test_indexes = np.array([i for i in range(n_rows) if i not in train_set])
data_train = data[train_indexes]
data_test = data[test_indexes]
if label is not None:
label_train = label[train_indexes]
label_test = label[test_indexes]
ret = (data_train, data_test, label_train, label_test)
else:
ret = (data_train, data_test)
if random_state is not None:
seed(None)
return ret
def normalization_x(X):
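    # Min-max scale each feature to [0, 1]. Note the min/max are taken over
    # the full dataset, i.e. before the train/test split.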
for f in range(len(X[0])):
minVal = min(X[i][f] for i in range(len(X)))
maxVal = max(X[i][f] for i in range(len(X)))
for i in range(len(X)):
X[i][f] = (X[i][f] - minVal) / (maxVal - minVal)
return X
def main():
data = np.loadtxt('breast_cancer.csv',delimiter=',')
X = data[:,:-1]
y = data[:,-1]
X = normalization_x(X)
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=10)
knn = KNeighbors()
knn.fit(X_train, y_train, 21)
y_predict = []
for i in range(len(y_test)):
        y_predict.append(knn.predict(X_test[i]))
acc = sum(y_predict[i] == y_test[i] for i in range(len(y_test))) / (len(y_test))
return acc
acc = main()
print("accuracy: %.2f%%"%(acc * 100))
This post walks through a pure-Python implementation of a KD-tree and its use in the k-nearest-neighbors algorithm. It first covers the node structure and tree construction, including choosing the splitting feature by variance and partitioning at the median; it then introduces a bounded max-heap for keeping the current nearest neighbors; and finally it demonstrates the backtracking KD-tree search, together with the train/test split, min-max normalization, and an accuracy evaluation on the breast-cancer dataset.