- 统计给定文件中的单词数量（简单版，用于测试网页编辑功能）
"""
统计给出文件的单词数量,并按照由多到少排序
"""
'''
#######################
1. 读取文本文件
2. 统计单词个数
3. 排序后输出
#########################
'''
import re
import time
import operator
class WordCount(object):
    """Count how often each word occurs in a text file.

    A word is a run of ASCII letters, optionally with hyphens inside it
    (``well-known``); words are lower-cased before counting, so ``The`` and
    ``the`` share one entry.

    Only constructs that are valid on both Python 2 and Python 3 are used:
    every ``print`` call receives a single pre-formatted argument, and the
    dict is accessed through ``get``/``items`` rather than the Python-2-only
    ``has_key``/``iteritems``.
    """

    def __init__(self, filename):
        """Store the input path and start with an empty tally.

        Args:
            filename: Path of the text file that ``readfile`` will scan.
        """
        self.filename = filename
        # Maps lower-cased word -> number of occurrences seen so far.
        self.word_dict = {}
        # Any character that is neither an ASCII letter nor a hyphen
        # separates two words.
        self.split_pattern = re.compile(r"[^A-Za-z-]")

    def readfile(self):
        """Print the file name, then tally the words of every line in it.

        Raises:
            IOError: If the file cannot be opened (``OSError`` on Python 3).
        """
        print(u"文件名: %s" % self.filename)
        with open(self.filename, 'r') as fd:
            # Iterate the file object directly so the whole file is never
            # held in memory at once.
            for line in fd:
                self.split_word(line)

    def split_word(self, line):
        """Split one line into words and add each of them to the tally.

        Hyphens at the edges of a token are removed, so a bare dash such as
        ``-`` or ``--`` is not counted as a word, while an inner hyphen
        (``well-known``) is preserved.

        Args:
            line: One line of text; trailing newlines are ignored.
        """
        for token in self.split_pattern.split(line.strip('\n')):
            word = token.strip('-').lower()
            if word:
                self.dict_count(word)

    def dict_count(self, word):
        """Increment the tally for ``word``, starting from zero if unseen.

        Args:
            word: The (already normalised) word to count.
        """
        self.word_dict[word] = self.word_dict.get(word, 0) + 1

    def get_result_dict(self):
        """Return the live word -> count dict (not a copy)."""
        return self.word_dict

    def sort_count(self, count_const=1000):
        """Print the tally ordered by frequency, then the frequent words.

        Printed, in order: the number of distinct words; the full list of
        ``(word, count)`` pairs sorted by descending count; one
        ``word count`` line per word whose count exceeds ``count_const``;
        and a summary line with how many such words there are.

        Args:
            count_const: Only words occurring more often than this are
                listed individually. Defaults to 1000.
        """
        print("单词类型个数 %d" % len(self.word_dict))

        # Sort by occurrence count, most frequent first.
        by_frequency = sorted(
            self.word_dict.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
        print(by_frequency)

        frequent = [
            (word, count)
            for word, count in by_frequency
            if count > count_const
        ]
        for word, count in frequent:
            print("%s %s" % (word, count))
        print("单词频率大于 %s 词数为 %s" % (count_const, len(frequent)))

    def format_print(self):
        """Print the report using the default frequency threshold."""
        self.sort_count()
if __name__ == '__main__':
    import sys

    # Input file: the first command-line argument when given, otherwise the
    # path the script has always used.
    default_path = "/root/code/python/temTestDir/pythonlib.txt"
    path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # NOTE(review): the purpose of this pause is not evident from this file;
    # it is kept, but the timer now starts after it so the reported elapsed
    # time covers only the reading and counting work.
    time.sleep(5)
    start_time = time.time()

    wordcount = WordCount(path)
    wordcount.readfile()
    wordcount.format_print()

    # Elapsed wall-clock seconds for reading, counting and printing.
    print(time.time() - start_time)
- 测试结果（截取一小部分）
