Filtering English email files
Training and test data download: GitHub
Steps
1. Load the labeled email text files
2. Tokenize, strip non-letter characters (URLs excepted), and convert everything to lowercase
3. Use WordNetLemmatizer to reduce each word to its base form
4. Mark words that follow "not" or "no" with a "Not_" prefix (a word that appears after not/no stands for its opposite meaning, so the prefix keeps the two cases distinct)
5. Remove stop words
6. Build bag-of-words features that record which words occur in each document
7. Train a naive Bayes classifier
8. Print the accuracy report

Import the required libraries
import re
import os
import pandas as pd
import numpy as np
import nltk
from sklearn.naive_bayes import GaussianNB
from nltk import sent_tokenize
from nltk import word_tokenize
from sklearn.model_selection import StratifiedShuffleSplit
from nltk.stem import WordNetLemmatizer
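WordNetLemmatizer relies on NLTK's WordNet corpus. If it has not been downloaded before, a one-time download is needed; a minimal sketch (the 'omw-1.4' entry is only used by newer NLTK releases and is harmless otherwise):

import nltk

# One-time download of the WordNet data used by WordNetLemmatizer
nltk.download('wordnet')
# Newer NLTK versions also consult the Open Multilingual WordNet
nltk.download('omw-1.4')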
Read the email texts
def GetText(url):
    '''Read every file under the given directory; return a list of (text, directory) tuples.'''
    text = []
    for lis in os.listdir(url):
        path = os.path.join(url, lis)
        try:
            with open(path) as f:
                text.append((f.read(), url))
        except UnicodeDecodeError:
            # Fall back to a binary read for files that are not valid text
            with open(path, 'rb') as f:
                text.append((f.read(), url))
    return text
Tokenize, strip characters (URLs excepted), lowercase, and lemmatize
def clean_text(text):
    '''text is a (raw_text, label) tuple; return (cleaned word list, label).'''
    lis = text[0].split()
    lis1 = []
    for i in lis:
        # Strip non-letter characters, except inside URLs
        if 'http' not in i:
            lis1.append(re.sub(r'[^a-zA-Z]', '', i))
        else:
            lis1.append(i)
    # Drop empty strings and convert to lowercase
    lis1 = [x.lower() for x in lis1 if x != '']
    # Lemmatize each word back to its base form
    wlem = WordNetLemmatizer()
    lis1 = [wlem.lemmatize(x) for x in lis1]
    return (lis1, text[1])
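A hypothetical example of what clean_text returns for one (text, label) tuple; the sample sentence below is made up for illustration:

sample = ("Visit http://example.com to WIN prizes!!! No strings attached.", 'is-spam')
print(clean_text(sample))
# Roughly: (['visit', 'http://example.com', 'to', 'win', 'prize', 'no', 'string', 'attached'], 'is-spam')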
Add the Not_ prefix after not/no and remove stop words
def build_negate_features(instance):
    '''
    A word that appears after "not" or "no" stands for its
    opposite meaning, so it gets a "Not_" prefix to keep the
    two cases distinct.
    '''
    words = instance[0]
    final_words = []
    negate = False
    negate_words = ['not', 'no']
    for word in words:
        if negate:
            word = 'Not_' + word
            negate = False
        if word not in negate_words:
            final_words.append(word)
        else:
            negate = True
    # Bag-of-words features: record each word that occurs in the document
    feature_set = {}
    for word in final_words:
        feature_set[word] = 1
    return (feature_set, instance[1])
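A small hypothetical example of the negation handling and the resulting bag-of-words dictionary:

words = (['this', 'offer', 'is', 'no', 'scam'], 'is-spam')
print(build_negate_features(words))
# ({'this': 1, 'offer': 1, 'is': 1, 'Not_scam': 1}, 'is-spam')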
Train the model
def build_model(features):
    model = nltk.NaiveBayesClassifier.train(features)
    return model
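nltk.NaiveBayesClassifier.train expects an iterable of (feature_dict, label) pairs, which is exactly the shape build_negate_features produces. A hypothetical toy run (the two feature dicts below are made up):

toy_features = [
    ({'free': 1, 'winner': 1}, 'is-spam'),
    ({'meeting': 1, 'agenda': 1}, 'not-spam'),
]
toy_model = build_model(toy_features)
print(toy_model.classify({'free': 1}))  # likely 'is-spam' on this toy data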
Accuracy check
def probe_model(model, features, dataset_type='Train'):
    accuracy = nltk.classify.accuracy(model, features)
    print('\n' + dataset_type + ' Accuracy = %0.2f' % (accuracy * 100) + '%')

def show_features(model, no_features=5):
    print('\nFeature Importance')
    print('=' * 20, '\n')
    # Show how much each feature contributes to the model
    # (show_most_informative_features prints directly and returns None)
    model.show_most_informative_features(no_features)
Full training cycle
def build_model_cycle_1(train_data, dev_data):
    model = build_model(train_data)
    probe_model(model, train_data)
    probe_model(model, dev_data, 'Dev')
    return model
Split into training and test sets
# Input: the feature data and the label list
# Output: training and test sets
# Each element of the training set is a (features, label) tuple
def get_train_test(data, y_labels):
    test_size = 0.3
    # Stratified split keeps the class proportions the same in both parts
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=77)
    for train_idx, test_idx in splitter.split(data, y_labels):
        train = [data[i] for i in train_idx]
        y_train = [y_labels[i] for i in train_idx]
        test = [data[i] for i in test_idx]
        y_test = [y_labels[i] for i in test_idx]
    return train, test, y_train, y_test
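A hypothetical sanity check of the split, assuming the get_train_test above and a made-up 7:3 toy dataset:

toy_data = [({'w': 1}, 'is-spam')] * 7 + [({'w': 1}, 'not-spam')] * 3
toy_labels = ['is-spam'] * 7 + ['not-spam'] * 3
train, test, y_train, y_test = get_train_test(toy_data, toy_labels)
print(len(train), len(test))   # 7 3 with test_size = 0.3
print(sorted(set(y_test)))     # both classes appear in the test split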
Main routine
if __name__ == "__main__":
    # Load and preprocess the non-spam emails
    not_spam_data = []
    text = GetText('not-spam')
    for t in text:
        try:
            not_spam_data.append(build_negate_features(clean_text(t)))
        except Exception:
            continue
    # Load and preprocess the spam emails
    is_spam_data = []
    text = GetText('is-spam')
    for t in text:
        try:
            is_spam_data.append(build_negate_features(clean_text(t)))
        except Exception:
            continue
    input_datasets = not_spam_data + is_spam_data
    y_labels = ['not-spam'] * len(not_spam_data) + ['is-spam'] * len(is_spam_data)
    # First split off the training set, then split the remainder into dev and test
    train_data, all_test_data, train_y, all_test_y = get_train_test(input_datasets, y_labels)
    dev_data, test_data, dev_y, test_y = get_train_test(all_test_data, all_test_y)
    print('data:', len(input_datasets))
    print('train:', len(train_data))
    print('dev:', len(dev_data))
    print('test:', len(test_data))
    model_cycle_1 = build_model_cycle_1(train_data, dev_data)
    show_features(model_cycle_1)
Printed results