词频统计python

原创已于 2024-05-06 10:34:18 修改 · 3.4k 阅读

28 ·

CC 4.0 BY-SA版权

文章标签：

#python

于 2022-02-09 21:35:55 首次发布

Python123题目参考答案__总结专栏收录该内容

135 篇文章

订阅专栏

本文介绍如何使用 Python 读取文本文件，过滤中文等非 ASCII 字符，将字母转换为小写并把标点符号替换为空格，然后统计单词总数、不重复单词数以及每个单词的出现频率。核心函数包括读取文件、单词计数、词频统计，以及排除常见词汇后输出高频词。

def read_file(file):
    """Read a UTF-8 text file and return a normalised, ASCII-only string.

    The text is processed in three steps:

    1. every non-ASCII character (Chinese characters, full-width
       punctuation, ...) is dropped;
    2. the remaining text is converted to lower case;
    3. every ASCII punctuation mark or symbol is replaced by one space.

    Line breaks and other whitespace are left untouched.

    Args:
        file: Path of the file to read.

    Returns:
        The normalised text as a single string.
    """
    # The 32 ASCII punctuation characters. The backtick must be the ASCII
    # one: non-ASCII look-alikes are removed in step 1 and could never match.
    punctuation = '!"\'-#$%&()*+,./:;<=>?@[\\]^_`{|}~'
    with open(file, 'r', encoding='utf-8') as data:
        raw_text = data.read()
    # One pass to filter, one pass to translate, instead of rescanning the
    # whole text with str.replace for every single character.
    ascii_text = ''.join(ch for ch in raw_text if ch.isascii())
    to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    return ascii_text.lower().translate(to_space)


def count_of_words(txt):
    """Count the total and the distinct words in punctuation-free text.

    Words are maximal runs of non-whitespace characters, so spaces, tabs,
    line feeds and carriage returns all act as separators. Purely numeric
    tokens are counted as words here.

    Args:
        txt: Text whose punctuation has already been replaced by spaces.

    Returns:
        A ``(total_words, distinct_words)`` tuple of ints.
    """
    # str.split() with no argument splits on any whitespace run and never
    # yields empty strings, so no clean-up pass over the list is needed.
    words = txt.split()
    return (len(words), len(set(words)))


def word_frequency(txt):
    """Count how often each word occurs in punctuation-free text.

    Words are maximal runs of non-whitespace characters (spaces, tabs and
    line breaks all separate words). Tokens made up only of digits are
    ignored.

    Args:
        txt: Text whose punctuation has already been replaced by spaces.

    Returns:
        A dict mapping each word to its number of occurrences, with keys in
        order of first appearance.
    """
    import collections

    # str.split() never yields empty or whitespace-only tokens, so the only
    # filtering left to do is dropping the purely numeric ones.
    words = (token for token in txt.split() if not token.isdigit())
    return dict(collections.Counter(words))


def top_ten_words(frequency, cnt):
    """Print the ``cnt`` most frequent words, one ``word count`` pair per line.

    Args:
        frequency: Dict mapping each word to its number of occurrences.
        cnt: How many of the highest-ranked entries to print.

    The sort is stable, so words with equal counts keep the relative order
    they have in ``frequency``. The dictionary itself is not modified.
    """
    ranked = sorted(frequency.items(), key=lambda pair: pair[1], reverse=True)
    for word, times in ranked[:cnt]:
        print(f'{word} {times}')


def top_ten_words_no_excludes(frequency, cnt):
    """Print the ``cnt`` most frequent words after dropping common words.

    Common articles, pronouns, linking verbs, conjunctions and similar
    filler words (listed in ``excludes_words`` below) are skipped. The
    remaining words are printed one ``word count`` pair per line, most
    frequent first.

    Args:
        frequency: Dict mapping each word to its number of occurrences.
            It is only read; the caller's dictionary is left unchanged.
        cnt: How many of the highest-ranked entries to print.
    """
    # A set gives constant-time membership tests for the filter below.
    excludes_words = {
        'a', 'an', 'the', 'i', 'he', 'she', 'his', 'my', 'we',
        'or', 'is', 'was', 'do', 'and', 'at', 'to', 'of', 'it', 'on', 'that', 'her',
        'c', 'in', 'you', 'had', 's', 'with', 'for', 't', 'but', 'as', 'not', 'they',
        'be', 'were', 'so', 'our', 'all', 'would', 'if', 'him', 'from', 'no', 'me',
        'could', 'when', 'there', 'them', 'about', 'this', 'their', 'up', 'been',
        'by', 'out', 'did', 'have',
    }
    # Filter into a new list rather than deleting keys from the argument,
    # so the caller can keep using the same dictionary afterwards.
    kept = [(word, times) for word, times in frequency.items()
            if word not in excludes_words]
    # Stable sort: words with equal counts keep their original order.
    kept.sort(key=lambda pair: pair[1], reverse=True)
    for word, times in kept[:cnt]:
        print(f'{word} {times}')


if __name__ == '__main__':
    # The text is loaded and its word frequencies computed up front, before
    # any command is read from standard input.
    filename = 'Who Moved My Cheese.txt'
    content = read_file(filename)  # normalised text, as one string
    frequency_result = word_frequency(content)  # word -> occurrence count

    cmd = input()
    if cmd == '2':
        # Command 2 takes no further input: report the word totals.
        total_words, distinct_words = count_of_words(content)
        print('文章共有单词{}个，其中不重复单词{}个'.format(total_words, distinct_words))
    elif cmd in ('1', '3', '4'):
        # Commands 1, 3 and 4 each read one integer argument.
        n = int(input())
        if cmd == '1':
            # Show the first n characters of the normalised text.
            print(content[:n])
        elif cmd == '3':
            top_ten_words(frequency_result, n)
        else:
            top_ten_words_no_excludes(frequency_result, n)
    # Any other command is silently ignored.

扫描下方二维码，关注后了解更多精彩内容！！