python wordcount

最新推荐文章于 2024-09-05 21:27:48 发布

neo-zhao

最新推荐文章于 2024-09-05 21:27:48 发布

阅读量428

点赞数 1

CC 4.0 BY-SA版权

分类专栏： python 文章标签： python wordcount

本文链接：https://blog.youkuaiyun.com/tmby1314/article/details/53457994

python 专栏收录该内容

1 篇文章

订阅专栏

本文介绍了一个简单的Python程序，用于统计指定文件中单词的数量，并按出现频率从高到低进行排序。该程序首先读取文件内容，然后使用正则表达式分割单词，并统计每个单词出现的次数。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

统计给出文件的单词数量，简单版，测试下网页编辑功能

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
统计给出文件的单词数量，并按照由多到少排序
"""

'''
#######################
1. 读取文本文件
2. 统计单词个数
3. 排序后输出
#########################
'''
import re
import time
import operator


class WordCount(object):
    """Count how often each word occurs in a text file and report the result.

    A word is a maximal run of ASCII letters and hyphens, compared
    case-insensitively (everything is lower-cased before counting).
    The class is written so that it runs unchanged on Python 2 and Python 3.
    """

    def __init__(self, filename):
        """Store the input path and start with an empty frequency table.

        Args:
            filename: Path of the text file to analyse.
        """
        self.filename = filename
        self.word_dict = {}
        # Every character that is neither an ASCII letter nor a hyphen acts
        # as a separator, so hyphenated words stay in one piece.
        self.split_pattern = re.compile(r"[^A-Za-z-]")

    def readfile(self):
        """Print the file name, then read the file and count its words."""
        print(u"文件名: %s" % self.filename)
        with open(self.filename, 'r') as fd:
            # Iterate the file object directly so that only one line is held
            # in memory at a time.
            for line in fd:
                self.split_word(line)

    def split_word(self, line):
        """Split one line into words and add each of them to the table.

        Args:
            line: A single line of text; a trailing newline is ignored.
        """
        for token in self.split_pattern.split(line.strip('\n')):
            word = token.lower()
            # Skip the empty strings produced by adjacent separators, and
            # tokens made only of hyphens ("-", "--"), which contain no
            # letters and therefore are not words.
            if word.strip('-'):
                self.dict_count(word)

    def dict_count(self, word):
        """Increase the stored count for ``word`` by one.

        Args:
            word: The (already normalised) word to count.
        """
        self.word_dict[word] = self.word_dict.get(word, 0) + 1

    def get_result_dict(self):
        """Return the dict that maps each word to its number of occurrences."""
        return self.word_dict

    def sort_count(self, min_count=1000):
        """Print the frequency table, most frequent words first.

        Prints the number of distinct words, the complete list of
        ``(word, count)`` pairs sorted by descending count, one line for
        every word whose count is strictly greater than ``min_count``, and
        finally how many such words there are.

        Args:
            min_count: Only words occurring more often than this are listed
                individually. Defaults to 1000.
        """
        print("单词类型个数 %d" % len(self.word_dict))

        # Sort by the count (second tuple element), highest first.
        ranked = sorted(self.word_dict.items(),
                        key=operator.itemgetter(1), reverse=True)
        print(ranked)

        frequent = 0
        for word, occurrences in ranked:
            if occurrences > min_count:
                print("%s %s" % (word, occurrences))
                frequent += 1
        print("单词频率大于 %s 词数为 %s" % (min_count, frequent))

    def format_print(self, min_count=1000):
        """Print the sorted report; see ``sort_count`` for the format.

        Args:
            min_count: Forwarded to ``sort_count``. Defaults to 1000.
        """
        self.sort_count(min_count)


if __name__ == '__main__':
    # Local import: only this entry point needs access to the command line.
    import sys

    # An optional first command-line argument selects the file to analyse;
    # without it the original default path is used.
    if len(sys.argv) > 1:
        target = sys.argv[1]
    else:
        target = "/root/code/python/temTestDir/pythonlib.txt"

    # Start the clock immediately before the real work so that the reported
    # duration covers reading and reporting only (no artificial delay).
    start_time = time.time()
    wordcount = WordCount(target)
    wordcount.readfile()
    wordcount.format_print()
    # Elapsed wall-clock time in seconds.
    print(time.time() - start_time)

测试结果，截取一小部分