from a_Data_preprocessing import load_xy, vocabs
from numpy import zeros
from b_metrics import metric, Timer
from sklearn.naive_bayes import MultinomialNB
"""数据加载、预处理"""
X_train, X_test, y_train, y_test = load_xy(1)


def ls_of_w2id(ls_of_wids):
    """Turn lists of word ids into a dense bag-of-words count matrix.

    Args:
        ls_of_wids: sequence of sentences, each an iterable of integer
            word ids.

    Returns:
        A ``(len(ls_of_wids), vocabs)`` float array in which entry
        ``[i, wid - 1]`` holds the number of occurrences of ``wid`` in
        sentence ``i``.
    """
    ls_of_wid = zeros((len(ls_of_wids), vocabs))
    for i, wids in enumerate(ls_of_wids):
        for wid in wids:
            # Column index is the id shifted down by one, so ids are
            # presumably 1-based -- TODO confirm against load_xy. An id of 0
            # would wrap around to the last column.
            ls_of_wid[i, wid - 1] += 1
    return ls_of_wid  # sentence vectors
# Vectorise both splits into bag-of-words count matrices.
X_train = ls_of_w2id(X_train)
X_test = ls_of_w2id(X_test)

# Modelling
t = Timer()  # timer from b_metrics; kept bound to `t` as in the original
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Prediction results
metric(y_test, y_pred)
# 2、贝叶斯+TF-IDF
from a_Data_preprocessing import load_xy, vocabs
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from numpy import zeros
from sklearn.naive_bayes import MultinomialNB
from b_metrics import metric, Timer
"""数据加载、预处理"""
X_train, X_test, y_train, y_test = load_xy()
dt = Dictionary(X_train)
# Convert every sentence to gensim bag-of-words form: a list of (id, count).
X_train = [dt.doc2bow(x) for x in X_train]
X_test = [dt.doc2bow(x) for x in X_test]
tfidf = TfidfModel(X_train).idfs  # dict: token id -> IDF weight


def ls_of_w2id(ls_of_wid):
    """Turn bag-of-words sentences into a dense TF-IDF matrix.

    Args:
        ls_of_wid: sequence of sentences, each an iterable of
            ``(word_id, count)`` pairs.

    Returns:
        A ``(len(ls_of_wid), vocabs)`` float array in which entry
        ``[i, wid]`` is ``count * tfidf[wid]`` accumulated over sentence ``i``.
    """
    ls_of_idf = zeros((len(ls_of_wid), vocabs), dtype='float')
    for i, bow in enumerate(ls_of_wid):
        for wid, cnt in bow:
            # NOTE(review): ids come from `dt` while the matrix width is
            # `vocabs`; this assumes vocabs >= len(dt) -- confirm.
            ls_of_idf[i, wid] += cnt * tfidf[wid]
    return ls_of_idf  # TF-IDF sentence vectors
# Vectorise both splits into TF-IDF matrices.
X_train = ls_of_w2id(X_train)
X_test = ls_of_w2id(X_test)

# Modelling
t = Timer()  # timer from b_metrics; kept bound to `t` as in the original
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Prediction results
metric(y_test, y_pred)
# 3、逻辑回归+词向量(FastText)
# Train FastText on the training sentences, each prefixed with its label
# rendered as a string token.
X = [[str(y)] + x for x, y in zip(X_train, y_train)]
# NOTE(review): `size=` and `wv.index2word` are the gensim 3.x names; gensim 4
# renamed them to `vector_size=` and `wv.index_to_key` -- confirm the pinned
# gensim version before upgrading.
model = FastText(X, size=size, window=window)
w2i = {w: i for i, w in enumerate(model.wv.index2word)}
vectors = model.wv.vectors


def w2v(w):
    """Return the trained vector of in-vocabulary word ``w``."""
    return vectors[w2i[w]]


X_train_v = [[w2v(w) for w in x] for x in X_train]
# Test words that are not in the trained vocabulary are dropped, so a test
# sentence can end up as an empty list.
X_test_v = [[w2v(w) for w in x if w in w2i] for x in X_test]

# Cache the vectorised splits together with their labels.
with open(PATH_XY_VEC, 'wb') as f:
    pickle.dump((X_train_v, X_test_v, y_train, y_test), f)
from a_Data_preprocessing import load_xy
from numpy import mean
from sklearn.linear_model import LogisticRegression
from b_metrics import metric, Timer
"""数据加载、预处理"""
X_train, X_test, y_train, y_test = load_xy(2)
# Collapse each sentence to a single vector: the mean over its first axis.
# NOTE(review): an empty sentence would make `mean` return nan instead of a
# vector -- confirm that load_xy(2) never yields empty sentences.
X_train = [mean(x, axis=0) for x in X_train]
X_test = [mean(x, axis=0) for x in X_test]

# Modelling
t = Timer()  # timer from b_metrics; kept bound to `t` as in the original
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Prediction results
y_pred = clf.predict(X_test)
metric(y_test, y_pred)