fasttext版本
0.9.1。参考官方文本分类教程:https://fasttext.cc/docs/en/supervised-tutorial.html
数据集格式
每行样本形如 " ".join(["__label__classId"]+["我们","中国"]):标签以固定前缀 __label__ 开头,与分词后的各个词语用空格分隔,拼接成一行。官方提供的参考数据下载地址:https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz
代码
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import time
import jieba
import logging
import fasttext
import pandas as pd
import codecs
# Configure logging via basicConfig: each record is formatted as
# "<timestamp> : <level name> : <message>", with the level set to INFO.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import fasttext
#处理训练集:将训练集的文本信息和label信息合并,清洗特殊符号,同时对文本内容进行分词
def merge_feature_label(feature_name,label_name):
feature=pd.read_csv(feature_name,sep=",")
label=pd.read_csv(label_name,sep=",")
data=feature.merge(label,on='id')
data["X"]=data[["title","content"]].apply(lambda x:"".join([str(x[0]),str(x[1])]),axis=1)
dataDropNa=data.dropna(axis=0