#!/usr/bin/env python
# -*- coding: utf-8 -*-
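# Simple jieba word segmentation and word-frequency statistics.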
import re
import jieba
import json
from collections import Counter
filename = "rowss.txt"

# Read the raw text, strip punctuation and whitespace, then segment it with jieba.
with open(filename, encoding="utf-8") as f:
    mytext = f.read()
mytext = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、|~@#¥%……&*()]+", "", mytext)
mytext = " ".join(jieba.cut(mytext))

# Write the space-separated segmentation result to row2.txt.
with open("row2.txt", "w", encoding="utf-8") as f1:
    f1.write(mytext)
# Tally how often each segmented word occurs and write the counts to row4.txt.
word_dict = {}
with open("row2.txt", encoding="utf-8") as f2, open("row4.txt", "w", encoding="utf-8") as f3:
    for line in f2:
        for word in line.strip().split(' '):
            if not word:
                continue
            if word not in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] += 1
    for word, count in sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True):
        f3.write("%s %d\n" % (word, count))
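# Optional: a minimal alternative sketch of the same tally using collections.Counter
# and json (both imported above). It is not part of the original flow; it assumes
# row2.txt has already been written by the code above, and "row4_counts.json" is a
# hypothetical output path chosen only for this example.
with open("row2.txt", encoding="utf-8") as f2:
    counts = Counter(w for w in f2.read().split(' ') if w)
with open("row4_counts.json", "w", encoding="utf-8") as f4:
    # ensure_ascii=False keeps the Chinese words readable instead of \u escapes.
    json.dump(counts, f4, ensure_ascii=False, indent=2)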