# OneR算法:在已有数据中,根据具有相同特征值的个体最可能属于哪个类别进行分类,即选取分类效果最好的那个特征作为分类依据。
#-*- coding=utf-8 -*-
#
import numpy as np
from sklearn.datasets import load_iris
from collections import defaultdict
from operator import itemgetter
#该算法目的是通过这四个特征中的一个来分辨种类,即,如果某一植物的特征feature_index的离散值为value,
#那么该植物最有可能是most_frequent_class,错误率为error
#X为离散后的数据,y_true为每组数据的植株种类,feature_index为以第几个特征为标准,value为特征值
def train_feature_value(X, y_true, feature_index, value):
    """Find the most common class among samples whose feature equals ``value``.

    Walks ``X`` and ``y_true`` in lockstep, counting how often each class
    label occurs among the samples where ``sample[feature_index] == value``.

    Args:
        X: Iterable of samples; each sample must support indexing by
            ``feature_index``.
        y_true: Iterable of class labels, paired positionally with ``X``.
        feature_index: Index of the feature to inspect in each sample.
        value: Feature value that selects which samples are counted.

    Returns:
        A tuple ``(most_frequent_class, error)``. ``most_frequent_class`` is
        the label with the highest count among the selected samples (on a tie,
        the label encountered first wins). ``error`` is the number of selected
        samples that belong to any other class. When no sample matches
        ``value`` the result is ``(None, 0)``.

    The intermediate results are printed to stdout, as before.
    """
    class_counts = defaultdict(int)
    for sample, y in zip(X, y_true):
        if sample[feature_index] == value:
            class_counts[y] += 1

    # Highest count first; the sort is stable, so ties keep encounter order.
    sorted_class_counts = sorted(
        class_counts.items(), key=itemgetter(1), reverse=True
    )
    print(sorted_class_counts)

    # No sample carries this value: there is nothing to predict and nothing
    # to get wrong. Previously this case raised IndexError below.
    if not sorted_class_counts:
        return None, 0

    most_frequent_class = sorted_class_counts[0][0]
    print(most_frequent_class)

    # Every selected sample outside the winning class counts as one error.
    incorrect_predictions = [
        class_count
        for class_value, class_count in class_counts.items()
        if class_value != most_frequent_class
    ]
    print(incorrect_predictions)
    error = sum(incorrect_predictions)
    return most_frequent_class, error
if __name__ == '__main__':
    # Load the Iris plant dataset bundled with scikit-learn.
    dataset = load_iris()
    # x holds the feature values of each plant, y the class label of each plant.
    x, y = dataset.data, dataset.target
    # Per-feature mean (column-wise), used as the discretisation threshold.
    attribute_means = np.mean(x, axis=0)
    # Discretise the raw data: 1 where a value is greater than or equal to
    # its feature's mean, 0 where it is below.
    x_d = (x >= attribute_means).astype('int')
    # Inspect feature 0 for the discretised value 1; the function prints its
    # intermediate results and the returned tuple is not used here.
    train_feature_value(x_d, y, 0, 1)
    # TODO: the containers below are placeholders; nothing in the visible
    # code fills or reads them yet.
    predictors = {}
    errors = []