import pandas as pd
import jieba
import re
import numpy as np
from langconv import Converter
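Read the GBK-encoded text file and take its first line as the document to clean.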
content = pd.read_table('content.txt', encoding='gbk', sep='\n', header=None)
data = content.iloc[0, 0]
Remove special symbols and whitespace, including digits, punctuation, and letters, keeping only Chinese characters.
pattern = re.compile(u'[^\u4E00-\u9FA5]')
text = pattern.sub('',data)
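As a quick check on the filter (the sample string here is made up), everything outside the basic CJK range is dropped:
pattern.sub('', 'Hello, 世界 123!')   # returns '世界'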
Convert traditional Chinese characters to simplified characters.
text = Converter('zh-hans').convert(text)
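For illustration, the Converter maps each traditional form to its simplified equivalent (made-up input):
Converter('zh-hans').convert('漢字轉換')   # returns '汉字转换'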
Chinese word segmentation with jieba.
text = jieba.lcut(text)
text
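jieba.lcut returns the tokens as a list (jieba.cut would return a generator). A short example in jieba's default accurate mode, using the sample sentence from jieba's documentation:
jieba.lcut('我来到北京清华大学')   # ['我', '来到', '北京', '清华大学']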
Remove stop words.
with open('./stop_words.txt', encoding='utf-8') as f:
    words = f.read()
stops = set(words.splitlines())
text = [i for i in text if i not in stops]
text
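Putting the steps together, a small helper makes the pipeline reusable for other documents; this is only a sketch, and the names clean_text and tokens are chosen here for illustration:
def clean_text(raw, stops):
    # keep only Chinese characters
    han_only = re.sub(u'[^\u4E00-\u9FA5]', '', raw)
    # normalize traditional characters to simplified
    simplified = Converter('zh-hans').convert(han_only)
    # segment with jieba and drop stop words
    return [w for w in jieba.lcut(simplified) if w not in stops]

tokens = clean_text(data, stops)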