标注数据较少时【长文本分类任务】的半监督学习Python算法

数据介绍

语料下载地址:https://download.youkuaiyun.com/download/Yellow_python/12862983
新闻10分类样本总数19467,类别分布与文本长度分布如下:

from data10 import X, Y  # 10-class text dataset: X = texts, Y = labels
from pandas import Series
from collections import Counter

# Number of samples per label, most frequent label first.
label_counts = Counter(Y)
print(label_counts.most_common())

# Summary statistics (count/mean/std/quartiles) of the per-text length.
text_lengths = Series(X).str.len()
print(text_lengths.describe())
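| 类别 | 数量 | 类别 | 数量 |
| --- | --- | --- | --- |
| science | 2093 | car | 2066 |
| finance | 2052 | sports | 2017 |
| military | 2007 | medicine | 2000 |
| entertainment | 1906 | politics | 1865 |
| education | 1749 | fashion | 1712 |

| 统计指标 | 文本长度 |
| --- | --- |
| mean | 753.15 |
| std | 538.73 |
| min | 1 |
| 25% | 309 |
| 50% | 645 |
| 75% | 1065 |
| max | 2094 |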

原理

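流程图:

- 少量标注数据 --训练--> 模型1
- 无标注数据 --输入--> 模型1
- 模型1 --预测--> 预测值及其概率
- 预测值及其概率 --阈值--> 高概率数据
- 少量标注数据 + 高概率数据 --> 混合标注数据
- 混合标注数据 --训练--> 模型2
- 模型2 --预测--> 结果2
- 模型1 --预测--> 结果1
- 结果1、结果2 --> 比较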

极简代码

分类模型:逻辑回归;文本编码:TFIDF

import re, numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from jieba import cut

# Seed NumPy's global random generator once, at import time.
np.random.seed(7)
# Upper bound on the TF-IDF vocabulary size (used as ``max_features``).
N = 25000
# Stop words filtered out by the tokenizer.
# Do not use this list for sentiment analysis.
STOP_WORDS = set('''
了 是 在 和 有 他 我 的 也 为 就 这 都 等 着 来 与 要 又 而 一个 之 以 她 去 那 但 把 我们 可 他们 并 自己 或 由 其 给 使 却
这个 它 已经 及 这样 这些 此 们 这种 如果 因为 即 其中 现在 一些 以及 同时 由于 所以 这里 因 曾 呢 但是 该 每 其他 应 吧 虽然
因此 而且 啊 应该 当时 那么 这么 仍 还有 如此 既 或者 然后 有些 那个 关于 于是 不仅 只要 且 另外 而是 还是 此外 这次 如今 就是
并且 从而 其它 尽管 还要 即使 总是 只有 只是 而言 每次 这是 就会 那是'''.split())


def tokenizer(text):
    """Yield the jieba tokens of ``text`` that are not stop words.

    The text is first split on every run of characters that are neither
    ASCII letters nor in the range U+4E00..U+9FA5, so digits, punctuation
    and whitespace are never handed to the segmenter. Each remaining
    fragment is segmented with ``cut`` and tokens found in ``STOP_WORDS``
    are skipped.
    """
    fragments = re.split('[^a-zA-Z\u4e00-\u9fa5]+', text)
    for fragment in fragments:
        yield from (token for token in cut(fragment) if token not in STOP_WORDS)


class Model:
    """Text classifier: TF-IDF features fed into logistic regression."""

    def __init__(self):
        # Vocabulary is capped at N features; tokens come from ``tokenizer``.
        self.vec = TfidfVectorizer(tokenizer=tokenizer, max_features=N)
        self.clf = LogisticRegression(C=2, solver='liblinear')

    def fit(self, x, y):
        """Fit the vectorizer and the classifier on texts ``x`` with labels ``y``.

        Returns ``self`` so that construction and fitting can be chained.
        """
        features = self.vec.fit_transform(x)
        self.clf.fit(features, y)
        return self

    def predict(self, x):
        """Return the predicted label for every text in ``x``."""
        features = self.vec.transform(x)
        return self.clf.predict(features)

    def predict_probability_one(self, text):
        """Classify a single text; return ``(label, probability)`` of the top class."""
        features = self.vec.transform([text])
        probabilities = self.clf.predict_proba(features)[0]
        best = np.argmax(probabilities)
        return self.clf.classes_[best], probabilities[best]

    def classification_report(self, x_test, y_test):
        """Print the accuracy and the per-class report for ``x_test``/``y_test``."""
        y_pred = self.predict(x_test)
        hits = [y_test == y_pred]
        print('accuracy', np.average(hits))
        # Inside the method body this name resolves to the module-level
        # ``classification_report`` function, not to this method.
        report = classification_report(y_test, y_pred)
        print('\033[034m%s\033[0m' % report)


def experiment(x, y, test_size=.9):
    """Compare a supervised baseline with pseudo-label self-training.

    The data is split into a labelled training part and a held-out part.
    Model 1 is trained on the labelled part only. For each confidence
    threshold, every held-out text whose top predicted probability exceeds
    the threshold is added to the training data with its *predicted* label,
    and Model 2 is trained on that mixture. Both models are evaluated on the
    same held-out part, which therefore serves both as the unlabelled pool
    and as the test set. All results are printed; nothing is returned.

    Args:
        x: Texts. The split parts must expose ``.shape`` and ``.tolist()``
            (presumably NumPy arrays -- confirm against ``data10``).
        y: Labels aligned with ``x``; same requirements as ``x``.
        test_size: Share of the data held out, forwarded to
            ``train_test_split``.
    """
    # Split the samples; ``test_size`` is the held-out share.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
    print('\033[033m', test_size, [i.shape for i in (x_train, x_test, y_train, y_test)], '\033[0m', sep='')
    # Model 1: baseline trained on the labelled data only.
    m1 = Model().fit(x_train, y_train)
    m1.classification_report(x_test, y_test)
    # Model 1 does not change between thresholds, so pseudo-label the
    # held-out texts once instead of once per threshold.
    pseudo_labels = [m1.predict_probability_one(text) for text in x_test]
    for threshold in (.9, .8, .7, .6, .5, .4, .3, .2):
        # Merge the confidently predicted samples into the training set.
        x_mix, y_mix = x_train.tolist(), y_train.tolist()
        for text, (y_pred, proba) in zip(x_test, pseudo_labels):
            if proba > threshold:
                x_mix.append(text)
                y_mix.append(y_pred)
        print('\033[035m%.1f\033[0m' % threshold, '合并后训练集数量', len(x_mix), len(y_mix))
        # Model 2: trained on labelled + pseudo-labelled data.
        m2 = Model().fit(x_mix, y_mix)
        m2.classification_report(x_test, y_test)


from data10 import X, Y  # 10-class text classification data
# Run the experiment for each held-out share, from 95% down to 40%.
for held_out_share in (.95, .9, .8, .7, .6, .5, .4):
    experiment(X, Y, test_size=held_out_share)

附两组结果

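| 训练集:测试集 | 测试集占比 | 添加伪标签阈值 | 测试集数量 | 训练集数量 | 混合标注数据量 | 标注增加量 | 模型1准度 | 模型2准度 | 模型2-模型1 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1比19 | 0.95 | 0.9 | 18494 | 973 | 973 | 0 | 0.823131827 | 0.823131827 | 0 |
| 1比19 | 0.95 | 0.8 | 18494 | 973 | 973 | 0 | 0.823131827 | 0.823131827 | 0 |
| 1比19 | 0.95 | 0.7 | 18494 | 973 | 1198 | 225 | 0.823131827 | 0.818806099 | -0.004325727 |
| 1比19 | 0.95 | 0.6 | 18494 | 973 | 3111 | 2138 | 0.823131827 | 0.773710393 | -0.049421434 |
| 1比19 | 0.95 | 0.5 | 18494 | 973 | 5862 | 4889 | 0.823131827 | 0.753325403 | -0.069806424 |
| 1比19 | 0.95 | 0.4 | 18494 | 973 | 8867 | 7894 | 0.823131827 | 0.782956635 | -0.040175192 |
| 1比19 | 0.95 | 0.3 | 18494 | 973 | 12501 | 11528 | 0.823131827 | 0.812750081 | -0.010381745 |
| 1比19 | 0.95 | 0.2 | 18494 | 973 | 16789 | 15816 | 0.823131827 | 0.829133773 | 0.006001947 |
| 1比9 | 0.9 | 0.9 | 17521 | 1946 | 1946 | 0 | 0.846127504 | 0.846127504 | 0 |
| 1比9 | 0.9 | 0.8 | 17521 | 1946 | 2176 | 230 | 0.846127504 | 0.845956281 | -0.000171223 |
| 1比9 | 0.9 | 0.7 | 17521 | 1946 | 4744 | 2798 | 0.846127504 | 0.832258433 | -0.013869071 |
| 1比9 | 0.9 | 0.6 | 17521 | 1946 | 7744 | 5798 | 0.846127504 | 0.824781691 | -0.021345814 |
| 1比9 | 0.9 | 0.5 | 17521 | 1946 | 10339 | 8393 | 0.846127504 | 0.831801838 | -0.014325666 |
| 1比9 | 0.9 | 0.4 | 17521 | 1946 | 12878 | 10932 | 0.846127504 | 0.843559158 | -0.002568347 |
| 1比9 | 0.9 | 0.3 | 17521 | 1946 | 15582 | 13636 | 0.846127504 | 0.851892015 | 0.005764511 |
| 1比9 | 0.9 | 0.2 | 17521 | 1946 | 18401 | 16455 | 0.846127504 | 0.852576908 | 0.006449404 |
| 1比4 | 0.8 | 0.9 | 15574 | 3893 | 3906 | 13 | 0.862013612 | 0.862077822 | 6.42096E-05 |
| 1比4 | 0.8 | 0.8 | 15574 | 3893 | 5838 | 1945 | 0.862013612 | 0.862013612 | 0 |
| 1比4 | 0.8 | 0.7 | 15574 | 3893 | 9302 | 5409 | 0.862013612 | 0.859830487 | -0.002183126 |
| 1比4 | 0.8 | 0.6 | 15574 | 3893 | 11948 | 8055 | 0.862013612 | 0.8592526 | -0.002761012 |
| 1比4 | 0.8 | 0.5 | 15574 | 3893 | 14165 | 10272 | 0.862013612 | 0.862719918 | 0.000706305 |
| 1比4 | 0.8 | 0.4 | 15574 | 3893 | 16156 | 12263 | 0.862013612 | 0.863426223 | 0.001412611 |
| 1比4 | 0.8 | 0.3 | 15574 | 3893 | 17888 | 13995 | 0.862013612 | 0.865545139 | 0.003531527 |
| 1比4 | 0.8 | 0.2 | 15574 | 3893 | 19202 | 15309 | 0.862013612 | 0.862848337 | 0.000834725 |
| 3比7 | 0.7 | 0.9 | 13627 | 5840 | 6017 | 177 | 0.867909298 | 0.867909298 | 0 |
| 3比7 | 0.7 | 0.8 | 13627 | 5840 | 9131 | 3291 | 0.867909298 | 0.867982681 | 7.33837E-05 |
| 3比7 | 0.7 | 0.7 | 13627 | 5840 | 12201 | 6361 | 0.867909298 | 0.866441623 | -0.001467674 |
| 3比7 | 0.7 | 0.6 | 13627 | 5840 | 14207 | 8367 | 0.867909298 | 0.867248844 | -0.000660454 |
| 3比7 | 0.7 | 0.5 | 13627 | 5840 | 15884 | 10044 | 0.867909298 | 0.868569751 | 0.000660454 |
| 3比7 | 0.7 | 0.4 | 13627 | 5840 | 17343 | 11503 | 0.867909298 | 0.8683496 | 0.000440302 |
| 3比7 | 0.7 | 0.3 | 13627 | 5840 | 18535 | 12695 | 0.867909298 | 0.869597123 | 0.001687826 |
| 3比7 | 0.7 | 0.2 | 13627 | 5840 | 19361 | 13521 | 0.867909298 | 0.868056065 | 0.000146767 |
| 2比3 | 0.6 | 0.9 | 11681 | 7786 | 8250 | 464 | 0.874069001 | 0.873726565 | -0.000342436 |
| 2比3 | 0.6 | 0.8 | 11681 | 7786 | 11387 | 3601 | 0.874069001 | 0.873298519 | -0.000770482 |
| 2比3 | 0.6 | 0.7 | 11681 | 7786 | 13884 | 6098 | 0.874069001 | 0.874069001 | 0 |
| 2比3 | 0.6 | 0.6 | 11681 | 7786 | 15528 | 7742 | 0.874069001 | 0.873726565 | -0.000342436 |
| 2比3 | 0.6 | 0.5 | 11681 | 7786 | 16869 | 9083 | 0.874069001 | 0.873726565 | -0.000342436 |
| 2比3 | 0.6 | 0.4 | 11681 | 7786 | 18046 | 10260 | 0.874069001 | 0.874582656 | 0.000513655 |
| 2比3 | 0.6 | 0.3 | 11681 | 7786 | 18925 | 11139 | 0.874069001 | 0.875181919 | 0.001112918 |
| 2比3 | 0.6 | 0.2 | 11681 | 7786 | 19415 | 11629 | 0.874069001 | 0.874497046 | 0.000428046 |
| 1比1 | 0.5 | 0.9 | 9734 | 9733 | 10483 | 750 | 0.873433326 | 0.873844257 | 0.000410931 |
| 1比1 | 0.5 | 0.8 | 9734 | 9733 | 13386 | 3653 | 0.873433326 | 0.872097802 | -0.001335525 |
| 1比1 | 0.5 | 0.7 | 9734 | 9733 | 15254 | 5521 | 0.873433326 | 0.872508732 | -0.000924594 |
| 1比1 | 0.5 | 0.6 | 9734 | 9733 | 16540 | 6807 | 0.873433326 | 0.873536059 | 0.000102733 |
| 1比1 | 0.5 | 0.5 | 9734 | 9733 | 17554 | 7821 | 0.873433326 | 0.873330594 | -0.000102733 |
| 1比1 | 0.5 | 0.4 | 9734 | 9733 | 18471 | 8738 | 0.873433326 | 0.874563386 | 0.00113006 |
| 1比1 | 0.5 | 0.3 | 9734 | 9733 | 19130 | 9397 | 0.873433326 | 0.873844257 | 0.000410931 |
| 1比1 | 0.5 | 0.2 | 9734 | 9733 | 19439 | 9706 | 0.873433326 | 0.873638792 | 0.000205465 |
| 3比2 | 0.4 | 0.9 | 11680 | 7787 | 12539 | 4752 | 0.87915757 | 0.878772313 | -0.000385257 |
| 3比2 | 0.4 | 0.8 | 11680 | 7787 | 14910 | 7123 | 0.87915757 | 0.879414409 | 0.000256838 |
| 3比2 | 0.4 | 0.7 | 11680 | 7787 | 16340 | 8553 | 0.87915757 | 0.879542828 | 0.000385257 |
| 3比2 | 0.4 | 0.6 | 11680 | 7787 | 17305 | 9518 | 0.87915757 | 0.878900732 | -0.000256838 |
| 3比2 | 0.4 | 0.5 | 11680 | 7787 | 18138 | 10351 | 0.87915757 | 0.877102864 | -0.002054707 |
| 3比2 | 0.4 | 0.4 | 11680 | 7787 | 18777 | 10990 | 0.87915757 | 0.879029151 | -0.000128419 |
| 3比2 | 0.4 | 0.3 | 11680 | 7787 | 19217 | 11430 | 0.87915757 | 0.879285989 | 0.000128419 |
| 3比2 | 0.4 | 0.2 | 11680 | 7787 | 19450 | 11663 | 0.87915757 | 0.878772313 | -0.000385257 |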

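| 训练集:测试集 | 测试集占比 | 添加伪标签阈值 | 测试集数量 | 训练集数量 | 混合标注数据量 | 标注增加量 | 模型1准度 | 模型2准度 | 模型2-模型1 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1比19 | 0.95 | 0.9 | 18494 | 973 | 973 | 0 | 0.825186547 | 0.825186547 | 0 |
| 1比19 | 0.95 | 0.8 | 18494 | 973 | 973 | 0 | 0.825186547 | 0.825186547 | 0 |
| 1比19 | 0.95 | 0.7 | 18494 | 973 | 1319 | 346 | 0.825186547 | 0.822050395 | -0.003136152 |
| 1比19 | 0.95 | 0.6 | 18494 | 973 | 3163 | 2190 | 0.825186547 | 0.79366281 | -0.031523737 |
| 1比19 | 0.95 | 0.5 | 18494 | 973 | 6151 | 5178 | 0.825186547 | 0.780198983 | -0.044987564 |
| 1比19 | 0.95 | 0.4 | 18494 | 973 | 9394 | 8421 | 0.825186547 | 0.791391803 | -0.033794744 |
| 1比19 | 0.95 | 0.3 | 18494 | 973 | 12994 | 12021 | 0.825186547 | 0.813453012 | -0.011733535 |
| 1比19 | 0.95 | 0.2 | 18494 | 973 | 17059 | 16086 | 0.825186547 | 0.832161782 | 0.006975235 |
| 1比9 | 0.9 | 0.9 | 17521 | 1946 | 1946 | 0 | 0.850579305 | 0.850579305 | 0 |
| 1比9 | 0.9 | 0.8 | 17521 | 1946 | 2110 | 164 | 0.850579305 | 0.851207123 | 0.000627818 |
| 1比9 | 0.9 | 0.7 | 17521 | 1946 | 4446 | 2500 | 0.850579305 | 0.842531819 | -0.008047486 |
| 1比9 | 0.9 | 0.6 | 17521 | 1946 | 7763 | 5817 | 0.850579305 | 0.830717425 | -0.01986188 |
| 1比9 | 0.9 | 0.5 | 17521 | 1946 | 10629 | 8683 | 0.850579305 | 0.835055077 | -0.015524228 |
| 1比9 | 0.9 | 0.4 | 17521 | 1946 | 13375 | 11429 | 0.850579305 | 0.840990811 | -0.009588494 |
| 1比9 | 0.9 | 0.3 | 17521 | 1946 | 15980 | 14034 | 0.850579305 | 0.849380743 | -0.001198562 |
| 1比9 | 0.9 | 0.2 | 17521 | 1946 | 18519 | 16573 | 0.850579305 | 0.854517436 | 0.003938131 |
| 1比4 | 0.8 | 0.9 | 15574 | 3893 | 3907 | 14 | 0.864196738 | 0.864132529 | -6.42096E-05 |
| 1比4 | 0.8 | 0.8 | 15574 | 3893 | 5709 | 1816 | 0.864196738 | 0.862912547 | -0.001284192 |
| 1比4 | 0.8 | 0.7 | 15574 | 3893 | 9375 | 5482 | 0.864196738 | 0.858674714 | -0.005522024 |
| 1比4 | 0.8 | 0.6 | 15574 | 3893 | 12083 | 8190 | 0.864196738 | 0.858931553 | -0.005265186 |
| 1比4 | 0.8 | 0.5 | 15574 | 3893 | 14279 | 10386 | 0.864196738 | 0.860729421 | -0.003467317 |
| 1比4 | 0.8 | 0.4 | 15574 | 3893 | 16135 | 12242 | 0.864196738 | 0.863362014 | -0.000834725 |
| 1比4 | 0.8 | 0.3 | 15574 | 3893 | 17879 | 13986 | 0.864196738 | 0.865031463 | 0.000834725 |
| 1比4 | 0.8 | 0.2 | 15574 | 3893 | 19222 | 15329 | 0.864196738 | 0.8639399 | -0.000256838 |
| 3比7 | 0.7 | 0.9 | 13627 | 5840 | 6004 | 164 | 0.864166728 | 0.863799809 | -0.000366919 |
| 3比7 | 0.7 | 0.8 | 13627 | 5840 | 9070 | 3230 | 0.864166728 | 0.862699053 | -0.001467674 |
| 3比7 | 0.7 | 0.7 | 13627 | 5840 | 12205 | 6365 | 0.864166728 | 0.862332135 | -0.001834593 |
| 3比7 | 0.7 | 0.6 | 13627 | 5840 | 14342 | 8502 | 0.864166728 | 0.86262567 | -0.001541058 |
| 3比7 | 0.7 | 0.5 | 13627 | 5840 | 15964 | 10124 | 0.864166728 | 0.865120716 | 0.000953988 |
| 3比7 | 0.7 | 0.4 | 13627 | 5840 | 17381 | 11541 | 0.864166728 | 0.865267484 | 0.001100756 |
| 3比7 | 0.7 | 0.3 | 13627 | 5840 | 18549 | 12709 | 0.864166728 | 0.866735158 | 0.00256843 |
| 3比7 | 0.7 | 0.2 | 13627 | 5840 | 19367 | 13527 | 0.864166728 | 0.865047333 | 0.000880605 |
| 2比3 | 0.6 | 0.9 | 11681 | 7786 | 8246 | 460 | 0.872528037 | 0.872528037 | 0 |
| 2比3 | 0.6 | 0.8 | 11681 | 7786 | 11539 | 3753 | 0.872528037 | 0.872356819 | -0.000171218 |
| 2比3 | 0.6 | 0.7 | 11681 | 7786 | 13998 | 6212 | 0.872528037 | 0.872528037 | 0 |
| 2比3 | 0.6 | 0.6 | 11681 | 7786 | 15620 | 7834 | 0.872528037 | 0.872528037 | 0 |
| 2比3 | 0.6 | 0.5 | 11681 | 7786 | 16921 | 9135 | 0.872528037 | 0.872956083 | 0.000428046 |
| 2比3 | 0.6 | 0.4 | 11681 | 7786 | 18052 | 10266 | 0.872528037 | 0.872784864 | 0.000256827 |
| 2比3 | 0.6 | 0.3 | 11681 | 7786 | 18895 | 11109 | 0.872528037 | 0.874240219 | 0.001712182 |
| 2比3 | 0.6 | 0.2 | 11681 | 7786 | 19403 | 11617 | 0.872528037 | 0.872870473 | 0.000342436 |
| 1比1 | 0.5 | 0.9 | 9734 | 9733 | 10396 | 663 | 0.88083008 | 0.880110951 | -0.000719129 |
| 1比1 | 0.5 | 0.8 | 9734 | 9733 | 13376 | 3643 | 0.88083008 | 0.879494555 | -0.001335525 |
| 1比1 | 0.5 | 0.7 | 9734 | 9733 | 15413 | 5680 | 0.88083008 | 0.878878159 | -0.001951921 |
| 1比1 | 0.5 | 0.6 | 9734 | 9733 | 16622 | 6889 | 0.88083008 | 0.879700021 | -0.00113006 |
| 1比1 | 0.5 | 0.5 | 9734 | 9733 | 17622 | 7889 | 0.88083008 | 0.881035546 | 0.000205465 |
| 1比1 | 0.5 | 0.4 | 9734 | 9733 | 18499 | 8766 | 0.88083008 | 0.880419149 | -0.000410931 |
| 1比1 | 0.5 | 0.3 | 9734 | 9733 | 19101 | 9368 | 0.88083008 | 0.881241011 | 0.000410931 |
| 1比1 | 0.5 | 0.2 | 9734 | 9733 | 19432 | 9699 | 0.88083008 | 0.881343744 | 0.000513663 |
| 3比2 | 0.4 | 0.9 | 11680 | 7787 | 12530 | 4743 | 0.879799666 | 0.879928085 | 0.000128419 |
| 3比2 | 0.4 | 0.8 | 11680 | 7787 | 14984 | 7197 | 0.879799666 | 0.879414409 | -0.000385257 |
| 3比2 | 0.4 | 0.7 | 11680 | 7787 | 16434 | 8647 | 0.879799666 | 0.880184924 | 0.000385257 |
| 3比2 | 0.4 | 0.6 | 11680 | 7787 | 17368 | 9581 | 0.879799666 | 0.879799666 | 0 |
| 3比2 | 0.4 | 0.5 | 11680 | 7787 | 18168 | 10381 | 0.879799666 | 0.878772313 | -0.001027353 |
| 3比2 | 0.4 | 0.4 | 11680 | 7787 | 18797 | 11010 | 0.879799666 | 0.879671247 | -0.000128419 |
| 3比2 | 0.4 | 0.3 | 11680 | 7787 | 19241 | 11454 | 0.879799666 | 0.878643894 | -0.001155772 |
| 3比2 | 0.4 | 0.2 | 11680 | 7787 | 19446 | 11659 | 0.879799666 | 0.87915757 | -0.000642096 |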

结论

多数情况不提升,某些情况下有轻微提升,有空再做更多微调

评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小基基o_O

您的鼓励是我创作的巨大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值