Video: https://www.bilibili.com/video/av50971264/
Input corpora: the 20 newsgroups dataset bundled with sklearn (~20,000 documents) and the reuters corpus bundled with nltk (10,788 documents).
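For orientation, a minimal check of where the two corpora come from, not part of the scripts below (the nltk reuters corpus must already be downloaded, e.g. via nltk.download('reuters')):

from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import reuters

# 20 newsgroups: about 11314 train + 7532 test documents in 20 categories
news = fetch_20newsgroups(subset='all', remove=('headers', 'footers'))
print len(news.data), len(news.target_names)

# reuters: 10788 documents; doc ids starting with 'train' or 'test' mark the split
print len(reuters.fileids())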
import sys
import pdb
import os
import getopt
import time
import gensim
from sklearn.datasets import load_svmlight_file
from scipy.sparse import csr_matrix
from corpusLoader import *
# Map each corpus name to its loader function, looked up below
corpus2loader = { '20news': load_20news, 'reuters': load_reuters }
def usage():
    print """Usage: ldaExp.py corpus_name"""
if len(sys.argv) != 2:
    usage()
    sys.exit(1)
corpusName = sys.argv[1]
# Get the loader function for this corpus
loader = corpus2loader[corpusName]
# 20news has more documents and more categories, so use a larger number of topics
if corpusName == "20news":
topicNum = 100
else:
topicNum = 50
# Both corpora are already split into train and test sets, which are processed separately below
setNames = [ 'train', 'test' ]
basenames = []
subcorpora = []
corpus = []
word2id = {}
id2word = {}
maxWID = 0
for setName in setNames:
print "Process set '%s':" %setName
# Load the train or test subset of the corpus. Words are stored sentence by sentence in orig_docs_words; category labels go into orig_docs_cat
setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, cats_docsWords, \
cats_docNames, category_names = loader(setName)
# Filename prefix
basename = "%s-%s-%d" %( corpusName, setName, setDocNum )
basenames.append(basename)
# The corpus subset handled in this iteration is a list of lists: each outer element corresponds to one document,
# and each inner list is a sequence of (word_id, frequency) pairs.
# This is gensim's standard bag-of-words input format
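# For illustration (hypothetical ids and counts): a document containing the word
# with id 5 twice and the words with ids 2 and 7 once each is stored as
# [(2, 1), (5, 2), (7, 1)]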
subcorpus = []
# Save the original text for manual inspection
orig_filename = "%s.orig.txt" %basename
ORIG = open( orig_filename, "w" )
# Each wordsInSentences corresponds to one document.
# It consists of many sentences, and each sentence is a list of words
for wordsInSentences in orig_docs_words:
# Count the frequency of each word in the current document
doc_wid2freq = {}
# Loop over the sentences of the current document
for sentence in wordsInSentences:
for w in sentence:
w = w.lower()
ORIG.write( "%s " %w )
# If w is already in the word2id mapping, map it to its wid
if w in word2id:
wid = word2id[w]
# Otherwise add w to the mapping and assign it a new wid
else:
wid = maxWID
word2id[w] = maxWID
id2word[maxWID] = w
maxWID += 1
# Update the frequency count of this wid
if wid in doc_wid2freq:
doc_wid2freq[wid] += 1
else:
doc_wid2freq[wid] = 1
ORIG.write("\n")
# Sort the wids appearing in this document by id
sorted_wids = sorted( doc_wid2freq.keys() )
doc_pairs = []
# Append the (wid, frequency) pairs to the current document's list
for wid in sorted_wids:
doc_pairs.append( (wid, doc_wid2freq[wid]) )
# The document's list is now complete; add it to subcorpus, the list for this corpus subset
subcorpus.append(doc_pairs)
ORIG.close()
print "%d original docs saved in '%s'" %( setDocNum, orig_filename )
# Merge this subset into corpus, so that corpus contains all documents from both train and test
corpus += subcorpus
# Keep the train and test subsets separate; the per-document doc-topic proportions of each subset are saved into separate files later
subcorpora.append( (subcorpus, orig_docs_cat) )
print "Training LDA..."
startTime = time.time()
# LDA is trained on train and test together here (a stricter approach would train on the train set only; see the sketch after this script)
lda = gensim.models.ldamodel.LdaModel( corpus=corpus, num_topics=topicNum, passes=20 )
endTime = time.time()
print "Finished in %.1f seconds" %( endTime - startTime )
for i in xrange(2):
lda_filename = "%s.svm-lda.txt" %basenames[i]
LDA = open( lda_filename, "w" )
print "Saving topic proportions into '%s'..." %lda_filename
# Take one corpus subset (train or test)
subcorpus, labels = subcorpora[i]
# Iterate over every document in the subset
for d, doc_pairs in enumerate(subcorpus):
label = labels[d]
# Feed the current document to the trained LDA model to get its doc-topic proportions
topic_props = lda.get_document_topics( doc_pairs, minimum_probability=0.001 )
LDA.write( "%d" %label )
# Save the K proportions as K features in svmlight format
for k, prop in topic_props:
LDA.write(" %d:%.3f" %(k, prop) )
LDA.write("\n")
LDA.close()
print "%d docs saved" %len(subcorpus)
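As noted in the training comment above, a stricter protocol would fit LDA on the train subset only and merely infer topic proportions for the test documents. A minimal sketch of that variant, reusing the variables built above (an assumption, not part of the original script):

# Fit LDA only on the train subset (subcorpora[0]), then infer topics for the test docs
train_subcorpus, _ = subcorpora[0]
test_subcorpus, _ = subcorpora[1]
lda = gensim.models.ldamodel.LdaModel( corpus=train_subcorpus, num_topics=topicNum, passes=20 )
# get_document_topics() accepts any bag-of-words document, including unseen test docs
test_topic_props = [ lda.get_document_topics( doc_pairs, minimum_probability=0.001 )
                     for doc_pairs in test_subcorpus ]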
Use the generated '<corpus>-train-<doc count>.svm-lda.txt' as the feature file for training, and evaluate classification performance on '<corpus>-test-<doc count>.svm-lda.txt', as shown below.
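Assuming the two scripts are saved as ldaExp.py and classEval.py (the names used in their usage strings), a run on 20news looks like this; the third argument of classEval.py is optional and merely restricts the feature dimensions:

python ldaExp.py 20news               # writes 20news-train-11314.svm-lda.txt and 20news-test-7532.svm-lda.txt
python classEval.py 20news lda        # trains a linear SVM on the train file, reports scores on train and test
python classEval.py 20news lda 1-50   # e.g. use only feature dimensions 1-50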
from sklearn import svm, metrics
from sklearn.datasets import load_svmlight_file
import sys
# Returns precision, recall, F1 and accuracy
def getScores( true_classes, pred_classes, average):
precision = metrics.precision_score( true_classes, pred_classes, average=average )
recall = metrics.recall_score( true_classes, pred_classes, average=average )
f1 = metrics.f1_score( true_classes, pred_classes, average=average )
accuracy = metrics.accuracy_score( true_classes, pred_classes )
return precision, recall, f1, accuracy
# Command line: python classEval.py corpus_name file_type (lda, bow, etc.)
corpus = sys.argv[1]
filetype = sys.argv[2]
# (Optional) use only a subset of the features;
# selected feature dimensions can be specified in the last argument as:
# 1-400 (starting from 1)
if len(sys.argv) > 3:
dims = sys.argv[3].split("-")
dims[0] = int(dims[0]) - 1
dims[1] = int(dims[1])
else:
dims = None
# Build the training and test file names from the naming template
if corpus == '20news':
train_file = "20news-train-11314.svm-%s.txt" %filetype
test_file = "20news-test-7532.svm-%s.txt" %filetype
else:
train_file = "reuters-train-5770.svm-%s.txt" %filetype
test_file = "reuters-test-2255.svm-%s.txt" %filetype
# Load the features of the training and test files
train_features_sparse, true_train_classes = load_svmlight_file(train_file)
test_features_sparse, true_test_classes = load_svmlight_file(test_file)
# Loaded as sparse matrices by default; convert to ordinary numpy arrays
train_features = train_features_sparse.toarray()
test_features = test_features_sparse.toarray()
print "Train: %dx%d. Test: %dx%d" %( tuple( train_features.shape + test_features.shape ) )
if dims:
train_features = train_features[ :, dims[0]:dims[1] ]
test_features = test_features[ :, dims[0]:dims[1] ]
print "Choose only features %d-%d" %( dims[0]+1, dims[1] )
else:
train_features = train_features[ :, : ]
test_features = test_features[ :, : ]
# Linear SVM with L1 regularization
model = svm.LinearSVC(penalty='l1', dual=False)
# Train on the training set
print "Training...",
model.fit( train_features, true_train_classes )
print "Done."
# Predict on both the training and the test set
pred_train_classes = model.predict( train_features )
pred_test_classes = model.predict( test_features )
# Report the results
print metrics.classification_report(true_train_classes, pred_train_classes, digits=3)
print metrics.classification_report(true_test_classes, pred_test_classes, digits=3)
for average in ['micro', 'macro']:
train_precision, train_recall, train_f1, train_acc = getScores( true_train_classes, pred_train_classes, average )
print "Train Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" %( average,
train_precision, train_recall, train_f1, train_acc )
test_precision, test_recall, test_f1, test_acc = getScores( true_test_classes, pred_test_classes, average )
print "Test Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" %( average,
test_precision, test_recall, test_f1, test_acc )
corpusLoader.py provides a unified corpus access interface to sklearn's 20newsgroups and nltk's reuters.
# -*- coding=GBK -*-
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import reuters
import HTMLParser
import os
import sys
import unicodedata
import re
import pdb
unicode_punc_tbl = dict.fromkeys( i for i in xrange(128, sys.maxunicode)
if unicodedata.category(unichr(i)).startswith('P') )
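# e.g. curly quotes such as u'\u201c' (Unicode category Pi) map to None in this table,
# while ASCII punctuation (code points below 128) is left untouched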
# Input: one document
# Processing: first split into sentences at punctuation, then split each sentence into words at word boundaries
def extractSentenceWords(doc, remove_url=True, remove_punc="utf-8", min_length=1):
# Remove Unicode punctuation; remove_punc gives the encoding of the input text (utf-8 by default)
if remove_punc:
# ensure doc_u is in unicode
if not isinstance(doc, unicode):
encoding = remove_punc
doc_u = doc.decode(encoding)
else:
doc_u = doc
# remove unicode punctuation marks, keep ascii punctuation marks
doc_u = doc_u.translate(unicode_punc_tbl)
if not isinstance(doc, unicode):
doc = doc_u.encode(encoding)
else:
doc = doc_u
# Remove URLs from the text (optional)
if remove_url:
re_url = r"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
doc = re.sub( re_url, "", doc )
# Split into sentences at sentence-delimiting punctuation
sentences = re.split( r"\s*[,;:`\"()?!{}]\s*|--+|\s*-\s+|''|\.\s|\.$|\.\.+|“|”", doc ) #"
wc = 0
wordsInSentences = []
for sentence in sentences:
if sentence == "":
continue
if not re.search( "[A-Za-z0-9]", sentence ):
continue
# Split the sentence into words at word boundaries
words = re.split( r"\s+\+|^\+|\+?[\-*\/&%=<>\[\]~\|\@\$]+\+?|\'\s+|\'s\s+|\'s$|\s+\'|^\'|\'$|\$|\\|\s+", sentence )
words = filter( lambda w: w, words )
if len(words) >= min_length:
wordsInSentences.append(words)
wc += len(words)
#print "%d words extracted" %wc
return wordsInSentences, wc
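# Rough illustration with a made-up input:
#   extractSentenceWords("Hello, world. See https://example.com now!")
# strips the URL, splits into sentences at the punctuation, then splits each sentence
# into words, returning approximately ([['Hello'], ['world'], ['See', 'now']], 4)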
def load_20news(setName):
newsgroups_subset = fetch_20newsgroups(subset=setName, remove=('headers', 'footers')) #, 'quotes'
totalLineNum = 0
readDocNum = 0
print "Loading 20 newsgroups %s data..." %setName
setDocNum = len(newsgroups_subset.data)
orig_docs_name = []
orig_docs_cat = []
orig_docs_words = []
catNum = len(newsgroups_subset.target_names)
cats_docsWords = [ [] for i in xrange(catNum) ]
cats_docNames = [ [] for i in xrange(catNum) ]
emptyFileNum = 0
for d, text in enumerate(newsgroups_subset.data):
if d % 50 == 49 or d == setDocNum - 1:
print "\r%d %d\r" %( d + 1, totalLineNum ),
text = text.encode("utf-8")
lines = text.split("\n")
if len(text) == 0 or len(lines) == 0:
emptyFileNum += 1
continue
readDocNum += 1
totalLineNum += len(lines)
catID = newsgroups_subset.target[d]
category = newsgroups_subset.target_names[catID]
text = " ".join(lines)
wordsInSentences, wc = extractSentenceWords(text)
filename = newsgroups_subset.filenames[d]
filename = os.path.basename(filename)
orig_docs_words.append( wordsInSentences )
orig_docs_name.append(filename)
orig_docs_cat.append(catID)
cats_docsWords[catID].append(wordsInSentences)
cats_docNames[catID].append(filename)
print "Done. %d docs read, %d empty docs skipped. Totally %d lines" %(readDocNum, emptyFileNum, totalLineNum)
return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
cats_docsWords, cats_docNames, newsgroups_subset.target_names
def load_reuters(setName):
html = HTMLParser.HTMLParser()
doc_ids = reuters.fileids()
cat2all_ids = {}
cat2train_ids = {}
cat2test_ids = {}
cat2all_num = {}
cand_docNum = 0
for doc_id in doc_ids:
# only keep docs that belong to exactly one category
if len( reuters.categories(doc_id) ) == 1:
cat = reuters.categories(doc_id)[0]
cand_docNum += 1
# Doc ids starting with 'train' go into the training set
if doc_id.startswith("train"):
cat2set_ids = cat2train_ids
# Otherwise they go into the test set
else:
cat2set_ids = cat2test_ids
if cat in cat2set_ids:
cat2set_ids[cat].append(doc_id)
else:
cat2set_ids[cat] = [ doc_id ]
# both train and test doc_ids are put in cat2all_ids
if cat in cat2all_ids:
cat2all_ids[cat].append(doc_id)
else:
cat2all_ids[cat] = [ doc_id ]
if cat in cat2all_num:
cat2all_num[cat] += 1
else:
cat2all_num[cat] = 1
print "Totally %d docs, %d single-category docs in %d categories" %( len(doc_ids),
cand_docNum, len(cat2train_ids) )
sorted_cats = sorted( cat2all_num.keys(), key=lambda cat: cat2all_num[cat],
reverse=True )
catNum = 10
cats_docsWords = [ [] for i in xrange(catNum) ]
cats_docNames = [ [] for i in xrange(catNum) ]
topN_cats = sorted_cats[:catNum]
print "Top 10 categories:"
keptAllDocNum = 0
keptTrainDocNum = 0
keptTestDocNum = 0
for cat in topN_cats:
print "%s: %d/%d" %( cat, len(cat2train_ids[cat]), len(cat2test_ids[cat]) )
keptTrainDocNum += len(cat2train_ids[cat])
keptTestDocNum += len(cat2test_ids[cat])
keptAllDocNum += len(cat2train_ids[cat]) + len(cat2test_ids[cat])
print "Totally %d docs kept, %d in train, %d in test" %( keptAllDocNum,
keptTrainDocNum, keptTestDocNum )
if setName == "train":
cat2set_ids = cat2train_ids
setDocNum = keptTrainDocNum
elif setName == "test":
cat2set_ids = cat2test_ids
setDocNum = keptTestDocNum
elif setName == "all":
cat2set_ids = cat2all_ids
setDocNum = keptAllDocNum
else:
raise Exception("Unknown set name %s" %setName)
orig_docs_name = []
orig_docs_cat = []
orig_docs_words = []
readDocNum = 0
totalLineNum = 0
emptyFileNum = 0
for cat_id, cat in enumerate(topN_cats):
for doc_id in cat2set_ids[cat]:
if readDocNum % 50 == 49 or readDocNum == setDocNum - 1:
print "\r%d %d\r" %( readDocNum + 1, totalLineNum ),
text = html.unescape( reuters.raw(doc_id) )
text = text.encode("utf-8")
lines = text.split("\n")
if len(text) == 0 or len(lines) == 0:
emptyFileNum += 1
continue
readDocNum += 1
totalLineNum += len(lines)
text = " ".join(lines)
wordsInSentences, wc = extractSentenceWords(text)
filename = doc_id
orig_docs_words.append( wordsInSentences )
orig_docs_name.append(filename)
orig_docs_cat.append(cat_id)
cats_docsWords[cat_id].append(wordsInSentences)
cats_docNames[cat_id].append(filename)
print "Done. %d docs read, %d empty docs skipped. Totally %d lines" %(readDocNum, emptyFileNum, totalLineNum)
return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
cats_docsWords, cats_docNames, topN_cats
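Besides 'train' and 'test', load_reuters() also accepts setName='all', which returns the combined train and test documents of the top-10 categories. A quick standalone sanity check of the loader (an illustration, not used by the pipeline above):

from corpusLoader import load_reuters
setDocNum, docs_words, docs_name, docs_cat, _, _, cats = load_reuters('all')
print setDocNum, len(cats)   # expect 5770 + 2255 = 8025 docs in 10 categories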