文章目录
投票模型
import pandas as pd
from sklearn.metrics import classification_report
# Load the data. Missing cells are forward-filled from the previous row.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in
# recent pandas releases.
df = pd.read_csv('train.csv').ffill()
X = df.Word.values
y = df.Tag.values
# Labels to report on: every distinct tag except 'O'. Filtering (rather than
# list.remove) avoids a ValueError when the data contains no 'O' tag.
labels = [tag for tag in df.Tag.unique().tolist() if tag != 'O']
# 投票模型
class Majority_vote:
    """Baseline tagger that gives each word its most frequent training tag.

    Words that were not seen during ``fit`` are predicted as ``'O'``.
    """

    def fit(self, X, y):
        """Learn the most frequent tag for every word.

        Args:
            X: Iterable of words.
            y: Iterable of tags, paired positionally with ``X``.

        Returns:
            The fitted instance, so calls can be chained.
        """
        # counter maps word -> {tag: number of occurrences}.
        counter = {}
        for word, tag in zip(X, y):
            tag_counts = counter.setdefault(word, {})
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
        # max() returns the first maximal key, so a tie goes to the tag that
        # was seen first for that word.
        self.vote = {
            word: max(tag_counts, key=tag_counts.get)
            for word, tag_counts in counter.items()
        }
        return self

    def predict(self, X):
        """Return the learned tag for each word in ``X``.

        Args:
            X: Iterable of words.

        Returns:
            A list of tags, one per word; unknown words get ``'O'``.
        """
        return [self.vote.get(x, 'O') for x in X]
# Fit and predict on the same data, so the report shows training-set scores.
y_pred = Majority_vote().fit(X, y).predict(X)
# `labels` must be passed by keyword: it is a keyword-only parameter of
# classification_report in current scikit-learn, and passing it positionally
# raises a TypeError.
report = classification_report(y, y_pred, labels=labels)
print(report)
条件随机场
以下为链式 CRF(线性链条件随机场)算法的实现。
import pandas as pd
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
# Read and preprocess the data. Missing cells are forward-filled from the
# previous row; DataFrame.ffill() replaces the deprecated
# fillna(method='ffill').
data = pd.read_csv('train.csv').ffill()
# Labels to report on: every distinct tag except 'O'. Filtering (rather than
# list.remove) avoids a ValueError when the data contains no 'O' tag.
labels = [tag for tag in data.Tag.unique().tolist() if tag != 'O']
# 按句子分组
f = lambda s: [(w, p, t) for w, p, t in zip