# -*- coding: utf-8 -*-
"""
from nltk.book import * 获取所有的语料库
"""
"""
古滕堡语料库
"""
# from nltk.corpus import gutenberg  # load one specific corpus directly
#
# print gutenberg.fileids()  # the texts in the corpus
#
# emma = gutenberg.words('austen-emma.txt')  # pick one of the texts
# print len(emma)  # length of the text
#
# for fileid in gutenberg.fileids():
#     # raw() returns the original text, without any processing
#     num_chars = len(gutenberg.raw(fileid))
#     num_words = len(gutenberg.words(fileid))
#     # sents() splits the text into sentences, each sentence a list of words
#     num_sents = len(gutenberg.sents(fileid))
#     num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
#     # average word length, average sentence length,
#     # and average number of times each vocabulary item appears in the text
#     print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
#
# sentences = gutenberg.sents('shakespeare-macbeth.txt')
# print sentences
"""
网络聊天文本
"""
# from nltk.corpus import webtext
#
# for fileid in webtext.fileids():
#     print fileid, webtext.raw(fileid)[:65], '...'
#
# from nltk.corpus import nps_chat
#
# chatroom = nps_chat.posts('10-19-20s_706posts.xml')
# print chatroom[123]
"""
布朗语料库
"""
# from nltk.corpus import brown
#
# print brown.categories()  # the corpus's category labels
# print brown.words(categories='news')  # words in the 'news' category
# print brown.words(fileids=['cg22'])
# print brown.sents(categories=['news', 'editorial', 'reviews'])
# print '----------------------------------'
#
# import nltk
#
# news_text = brown.words(categories='news')
# fdist = nltk.FreqDist([w.lower() for w in news_text])
# modals = ['can', 'could', 'may', 'might', 'must', 'will']
# for m in modals:
#     print m + ':', fdist[m],
# print
# print '----------------------------------'
#
# cfd = nltk.ConditionalFreqDist(
#     (genre, word)
#     for genre in brown.categories()
#     for word in brown.words(categories=genre)
# )
# genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
# modals = ['can', 'could', 'may', 'might', 'must', 'will']
# cfd.tabulate(conditions=genres, samples=modals)  # tabulate() prints the table itself and returns None
"""
路透社语料库:
10788个新闻文档,90个主题,“训练”和“测试”两组
"""
# from nltk.corpus import reuters
# print reuters.fileids()
# print reuters.categories()  # the category labels
# print reuters.categories('training/9865')
# print reuters.categories(['training/9865', 'training/9880'])
# print reuters.fileids('barley')
# print reuters.fileids(['barley', 'corn'])
# print reuters.categories('training/9880')[:14]
# print reuters.words(['training/9865', 'training/9880'])
# print reuters.words(categories='barley')
# print reuters.words(categories=['barley', 'corn'])
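# A minimal sketch of recovering the training/test split mentioned above:
# Reuters fileids begin with 'training/' or 'test/'.
# train_ids = [f for f in reuters.fileids() if f.startswith('training/')]
# test_ids = [f for f in reuters.fileids() if f.startswith('test/')]
# print len(train_ids), len(test_ids)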
"""
就职演说语料库
"""
from nltk.corpus import inaugural
print inaugural.fileids()
print [fileid[:4] for fileid in inaugural.fileids()]
import nltk
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])  # pair each hit with the year of the address
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
cfd.plot()
"""
文本语料库的函数:
fileids() 语料库中的文本
fileids([categories]) 这些分类对应的语料库中的文本
categories() 语料库中的分类
categories([fileids]) 这些文件对应的语料库中的分类
raw() 语料库的原始内容
raw(fileids=[f1,f2,f3]) 指定文件的原始内容
raw(categories=[c1,c2]) 指定文件的原始内容
words() 整个语料库中的词汇
words(fileids=[f1,f2,f3]) 指定文件中的词汇
words(categories=[c1,c2]) 指定分类中的词汇
sents() 指定分类中的句子
sents(fileids=[f1,f2,f3]) 指定文件中的句子
sents(categories=[c1,c2]) 指定分类中的句子
abspath=(fileid) 指定文件在磁盘中的位置
encoding(fileid) 文件的编码
open(fileid) 打开指定语料库文件的文件流
root() 到本地安装的语料库根目录的路径
"""
"""
载入自己的语料库:
使用NLTK中的PlaintextCorpusReader
"""
# from nltk.corpus import PlaintextCorpusReader
# corpus_root = r'c:/'  # wherever your text files live
# wordlists = PlaintextCorpusReader(corpus_root, '.*')  # '.*' matches every file under the root
# print wordlists.fileids()
# print wordlists.words('connectives')
from nltk.corpus import BracketParseCorpusReader
corpus_root = r'c:\corpora'
# matches parsed Penn Treebank files (wsj_*.mrg); adjust to your own layout
file_pattern = r'.*/wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print ptb.fileids()
print len(ptb.sents())
"""
按文体计数词汇:
FreqDist()以一个简单的链表作为输入
ConditionalFreqDist()以一个配对链表作为输入
"""
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
genre_word = [
    (genre, word)
    for genre in ['news', 'romance']
    for word in brown.words(categories=genre)
]
print len(genre_word)
print genre_word[:4]
print genre_word[-4:]
cfd = nltk.ConditionalFreqDist(genre_word)
print cfd.conditions()
print cfd['news']
print cfd['romance']
print list(cfd['romance'])
print cfd['romance']['could']
"""
绘制分布图和分布表
"""
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
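# The heading above promises a graph and a table, so a minimal sketch of
# both calls (each also accepts optional samples/conditions arguments):
cfd.plot()       # one line per target word, x-axis = year of each address
cfd.tabulate()   # the same counts as a table, one column per year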
"""
条件频率分布:
cfdist = ConditionalFreqDist(pairs) 从配对链表中创建条件频率分布
cfdist.conditions() 将条件按字母排序
cfdist[condition] 此条件下的频率分布
cfdist[condition][sample] 此条件下的给定样本的频率
cfdist.tabulate() 为条件频率分布制表
cfdist.tabulate(samples, conditions) 指定样本和条件限制下制表
cfdist.plot() 为条件频率分布绘图
cfdist.plot(samples, conditions) 指定样本和条件限制下绘图
cfdist1 < cfdist2 测试样本在cfdist1中出现次数是否小于在cfdist2中出现次数
"""
"""
停用词语料库
"""
from nltk.corpus import stopwords
print stopwords.words('english')
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    # float() avoids Python 2 integer division truncating the ratio to 0
    return len(content) / float(len(text))
print content_fraction(nltk.corpus.reuters.words())
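# A minimal sketch of the usual stopword-filtering pattern on one tokenized
# sentence; wrapping the word list in a set() makes membership tests faster.
stop = set(stopwords.words('english'))
sentence = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
print [w for w in sentence if w.lower() not in stop]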