# -*- coding: utf-8 -*-
"""
Created on Tue Apr 22 17:25:47 2014
@author: lifeix
"""
import io
import re
import sys
from collections import Counter

# Default locations used when the script is run without overrides.
INPUT_PATH = '/home/lifeix/xiaoshuo1.txt'
OUTPUT_PATH = '/home/lifeix/result.txt'

# Number of "char,count" entries shown on each console row.
ENTRIES_PER_ROW = 10

# Code points whose UTF-8 encoding is exactly three bytes (U+0800..U+FFFF),
# i.e. the characters the script has always counted: CJK ideographs plus
# full-width punctuation.  The surrogate range is excluded so that, on
# narrow Python 2 builds, the two halves of an astral character are not
# counted as if they were characters of their own.
THREE_BYTE_CHAR = re.compile(u'[\u0800-\ud7ff\ue000-\uffff]')


def count_chars(text):
    """Return ``[(char, count), ...]`` for *text*, most frequent first.

    *text* must be unicode text (already decoded).  Counting operates on
    code points rather than on fixed 3-byte windows of the encoded data,
    so a neighbouring 2-byte or 4-byte character can no longer shift the
    window and produce broken byte fragments.
    """
    return Counter(THREE_BYTE_CHAR.findall(text)).most_common()


def write_counts(counts, path=OUTPUT_PATH):
    """Write one ``char,count`` line per entry of *counts* to *path* (UTF-8)."""
    with io.open(path, 'w', encoding='utf-8') as out:
        out.writelines(u'{0},{1}\n'.format(char, n) for char, n in counts)


def render_table(counts, per_row=ENTRIES_PER_ROW):
    """Lay *counts* out as console text, *per_row* entries per row.

    Each row is preceded by a blank line and entries are separated by two
    spaces.  An empty *counts* yields an empty string.
    """
    entries = [u'{0},{1}'.format(char, n) for char, n in counts]
    rows = [
        u'  '.join(entries[start:start + per_row])
        for start in range(0, len(entries), per_row)
    ]
    if not rows:
        return u''
    return u''.join(u'\n\n' + row for row in rows) + u'\n'


def print_counts(counts, per_row=ENTRIES_PER_ROW, stream=None):
    """Print the table for *counts* to *stream* (default: ``sys.stdout``)."""
    out = sys.stdout if stream is None else stream
    text = render_table(counts, per_row)
    if str is bytes:
        # Python 2: stdout is a byte stream; emit UTF-8 explicitly instead
        # of relying on sys.setdefaultencoding().
        text = text.encode('utf-8')
    out.write(text)


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Count characters in *input_path*, save them to *output_path*, print them."""
    with io.open(input_path, 'r', encoding='utf-8') as source:
        text = source.read()
    counts = count_chars(text)
    write_counts(counts, output_path)
    print_counts(counts)


if __name__ == '__main__':
    main()
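# Source page title: "Counting Chinese character frequencies with Python"
# Source page footer: "Latest recommended article published 2023-08-25 10:10:21"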