import os
from collections import Counter, defaultdict
from operator import itemgetter

import numpy as np
def make_Dictionary(root_dir):
    all_words = []
    # Collect the paths of all files under the data directory
    emails = [os.path.join(root_dir, f) for f in os.listdir(root_dir)]
    # Read every mail in turn and gather its words
    for mail in emails:
        with open(mail) as m:
            for line in m:
                words = line.split()
                all_words += words
    dictionary = Counter(all_words)
    # Drop non-alphabetic tokens and single characters
    list_to_remove = list(dictionary)
    for item in list_to_remove:
        if not item.isalpha():
            del dictionary[item]
        elif len(item) == 1:
            del dictionary[item]
    return dictionary
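
# A quick illustration of the cleaning step above on hypothetical toy
# tokens (not part of the exercise): non-alphabetic tokens and single
# characters are removed from the Counter.
toy = Counter(["spam", "spam", "offer", "x", "100%", "offer"])
for item in list(toy):
    if not item.isalpha() or len(item) == 1:
        del toy[item]
print(toy)  # Counter({'spam': 2, 'offer': 2})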
def extract_features(mail_dir, dictionary):
    """
    :param mail_dir: data directory
    :param dictionary: word dictionary built by make_Dictionary
    :return: x, y
    """
    files = [os.path.join(mail_dir, fi) for fi in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files), len(dictionary)))
    train_labels = np.zeros(len(files))
    docID = 0
    for fil in files:
        with open(fil) as fi:
            for i, line in enumerate(fi):
                # The message body sits on the third line of each file
                if i == 2:
                    words = line.split()
                    for word in words:
                        # Iterating a Counter yields the words themselves,
                        # so compare the entry directly (not its first char)
                        for wordID, entry in enumerate(dictionary):
                            if entry == word:
                                features_matrix[docID, wordID] = words.count(word)
        # Spam files are named "spmsg..."; label them 1, ham stays 0
        lastToken = os.path.basename(fil)
        if "spmsg" in lastToken:
            train_labels[docID] = 1
        docID = docID + 1
    return features_matrix, train_labels
# Load the training data set
TRAIN_DIR = "src/train-mails"
# Build the word dictionary
dictionary = make_Dictionary(TRAIN_DIR)
# Split into features and labels
X_train, y_train = extract_features(TRAIN_DIR, dictionary)
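# Optional sanity check: one row per email, one column per dictionary
# word; labels are 1 for spam files and 0 otherwise.
print(X_train.shape, y_train.shape)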
# Find the majority class for value feature_values of feature
# feature_index, together with the number of errors it makes.
def train_feature_class(x, y_true, feature_index, feature_values):
    num_class = defaultdict(int)
    # Count how many samples of each class take this feature value
    for sample, y in zip(x, y_true):
        if sample[feature_index] == feature_values:
            num_class[y] += 1
    # Sort the classes by frequency, largest first
    sorted_num_class = sorted(num_class.items(), key=itemgetter(1), reverse=True)
    # The first entry is the majority class
    most_frequent_class = sorted_num_class[0][0]
    # The error is the number of samples belonging to any other class
    error = sum(value_num for class_num, value_num in sorted_num_class if class_num != most_frequent_class)
    return most_frequent_class, error
# Find all rules for a single feature
def train_feature(x, y_true, feature_index):
    # Number of samples and features
    n_sample, n_feature = x.shape
    assert 0 <= feature_index < n_feature
    # All values this feature takes in the training data
    values = set(x[:, feature_index])
    predictors = {}
    errors = []
    # For every value, record its majority class and accumulate the errors
    for current_value in values:
        most_frequent_class, error = train_feature_class(x, y_true, feature_index, current_value)
        predictors[current_value] = most_frequent_class
        errors.append(error)
    total_error = sum(errors)
    return predictors, total_error
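
# Worked toy example for the two helpers above (hypothetical data, not
# part of the exercise). Feature 0 takes values 0 and 1: value 0 is seen
# only with class 0, value 1 with classes {1, 1, 0}, so for value 1 the
# majority class is 1 with one error.
toy_x = np.array([[0, 5], [0, 3], [1, 2], [1, 7], [1, 1]])
toy_y = [0, 0, 1, 1, 0]
print(train_feature_class(toy_x, toy_y, 0, 1))  # (1, 1): class 1, 1 error
# train_feature collects this over all values of feature 0:
# predictors mapping value 0 -> class 0 and value 1 -> class 1, total error 1
print(train_feature(toy_x, toy_y, 0))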
# Compute the value-to-class mapping for every feature.
# The result looks like: {0: ({0: 0, 1: 2}, 41)}
# i.e. a dict keyed by feature index; each value is a tuple of a
# {feature_value: class} mapping and that feature's total error count.
all_predictors = {feature: train_feature(X_train, y_train, feature) for feature in range(X_train.shape[1])}
# Pull out the total error of each feature
errors = {feature: error for feature, (mapping, error) in all_predictors.items()}
# ********** Begin *********#
# One Rule (OneR) algorithm: keep only the feature with the lowest total error
best_feature, best_error = sorted(errors.items(), key=itemgetter(1))[0]
# Build the model from that feature and its value-to-class mapping
model = {'feature': best_feature, 'predictor': all_predictors[best_feature][0]}
# ********** End *********#
print(model)
Expected output:
{'feature': 0, 'predictor': {0.0: 0.0, 1.0: 0.0, 2.0: 1.0, 3.0: 1.0, 4.0: 0.0, 5.0: 0.0, 6.0: 0.0, 7.0: 0.0, 8.0: 0.0, 9.0: 0.0, 10.0: 0.0, 12.0: 1.0, 14.0: 0.0, 15.0: 0.0, 16.0: 1.0, 17.0: 0.0, 22.0: 1.0, 23.0: 0.0, 26.0: 1.0, 27.0: 1.0, 28.0: 0.0, 31.0: 0.0, 34.0: 1.0, 46.0: 1.0, 63.0: 1.0}}
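
The exercise stops once the model is printed. Below is a minimal sketch of how such a model could be applied to classify data (the predict_OneR helper is an illustration, not part of the original exercise; feature values never seen during training fall back to class 0 here as an assumption):

def predict_OneR(model, x):
    # Look up each sample's value of the chosen feature in the predictor
    # mapping; unseen values default to class 0 (an assumption).
    feature = model['feature']
    predictor = model['predictor']
    return np.array([predictor.get(sample[feature], 0.0) for sample in x])

y_pred = predict_OneR(model, X_train)
print("training accuracy:", np.mean(y_pred == y_train))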