# -*- coding: utf-8 -*-
"""
from nltk.book import * 获取所有的语料库
"""
"""
古滕堡语料库
"""
# from nltk.corpus import gutenberg  # load one specific corpus directly
#
# print gutenberg.fileids()  # the texts in the corpus
#
# emma = gutenberg.words('austen-emma.txt')  # pick one of the texts
# print len(emma)  # length of the text
#
# for fileid in gutenberg.fileids():
#     # raw() returns the original text, without any processing
#     num_chars = len(gutenberg.raw(fileid))
#     num_words = len(gutenberg.words(fileid))
#     # sents() splits the text into sentences, each sentence a list of words
#     num_sents = len(gutenberg.sents(fileid))
#     num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
#     # average word length, average sentence length,
#     # and average number of times each vocabulary item appears in the text
#     print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
#
# sentences = gutenberg.sents('shakespeare-macbeth.txt')
# print sentences
"""
网络聊天文本
"""
# from nltk.corpus import webtext
#
# for fileid in webtext.fileids():
#     print fileid, webtext.raw(fileid)[:65], '...'
#
# from nltk.corpus import nps_chat
#
# chatroom = nps_chat.posts('10-19-20s_706posts.xml')
# print chatroom[123]
"""
布朗语料库
"""
# from nltk.corpus import brown
#
# print brown.categories()  # the corpus's category labels
# print brown.words(categories='news')  # words in the 'news' category
# print brown.words(fileids=['cg22'])
# print brown.sents(categories=['news', 'editorial', 'reviews'])
# print '----------------------------------'
#
# import nltk
#
# news_text = brown.words(categories='news')
# fdist = nltk.FreqDist([w.lower() for w in news_text])
# modals = ['can', 'could', 'may', 'might', 'must', 'will']
# for m in modals:
#     print m + ':', fdist[m],
# print
# print '----------------------------------'
#
# cfd = nltk.ConditionalFreqDist(
#     (genre, word)
#     for genre in brown.categories()
#     for word in brown.words(categories=genre)
# )
# genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
# modals = ['can', 'could', 'may', 'might', 'must', 'will']
# cfd.tabulate(conditions=genres, samples=modals)  # tabulate() prints the table itself and returns None
"""
路透社语料库:
10788个新闻文档,90个主题,“训练”和“测试”两组
"""
# from nltk.corpus import reuters
# print reuters.fileids()
# print reuters.categories()  # the category labels
# print reuters.categories('training/9865')
# print reuters.categories(['training/9865', 'training/9880'])
# print reuters.fileids('barley')
# print reuters.fileids(['barley', 'corn'])
# print reuters.categories('training/9880')[:14]
# print reuters.words(['training/9865', 'training/9880'])
# print reuters.words(categories='barley')
# print reuters.words(categories=['barley', 'corn'])
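# A minimal sketch of recovering the training/test split mentioned above:
# Reuters fileids begin with 'training/' or 'test/'.
# train_ids = [f for f in reuters.fileids() if f.startswith('training/')]
# test_ids = [f for f in reuters.fileids() if f.startswith('test/')]
# print len(train_ids), len(test_ids)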
"""
就职演说语料库
"""
from nltk.corpus import inaugural
print inaugural.fileids()
print [fileid[:4] for fileid in inaugural.fileids()]
import nltk
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])  # pair each hit with the year of the address
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
cfd.plot()
"""
文本语料库的函数:
fileids() 语料库中的文本
fileids([categories]) 这些分类对应的语料库中的文本
categories() 语料库中的分类
categories([fileids]) 这些文件对应的语料库中的分类
raw() 语料库的原始内容
raw(fileids=[f1,f2,f3]) 指定文件的原始内容
raw(categories=[c1,c2]) 指定文件的原始内容
words() 整个语料库中的词汇
words(fileids=[f1,f2,f3]) 指定文件中的词汇
words(categories=[c1,c2]) 指定分类中的词汇
sents() 指定分类中的句子
sents(fileids=[f1,f2,f3]) 指定文件中的句子
sents(categories=[c1,c2]) 指定分类中的句子
abspath=(fileid) 指定文件在磁盘中的位置
encoding(fileid) 文件的编码
open(fileid) 打开指定语料库文件的文件流
root() 到本地安装的语料库根目录的路径
"""
"""
载入自己的语料库:
使用NLTK中的PlaintextCorpusReader
"""
# from nltk.corpus import PlaintextCorpusReader
# corpus_root = r'c:/'  # wherever your text files live
# wordlists = PlaintextCorpusReader(corpus_root, '.*')  # '.*' matches every file under the root
# print wordlists.fileids()
# print wordlists.words('connectives')
from nltk.corpus import BracketParseCorpusReader
corpus_root = r'c:\corpora'
# matches parsed Penn Treebank files (wsj_*.mrg); adjust to your own layout
file_pattern = r'.*/wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print ptb.fileids()
print len(ptb.sents())
"""
按文体计数词汇:
FreqDist()以一个简单的链表作为输入
ConditionalFreqDist()以一个配对链表作为输入
"""
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
genre_word = [
    (genre, word)
    for genre in ['news', 'romance']
    for word in brown.words(categories=genre)
]
print len(genre_word)
print genre_word[:4]
print genre_word[-4:]
cfd = nltk.ConditionalFreqDist(genre_word)
print cfd.conditions()
print cfd['news']
print cfd['romance']
print list(cfd['romance'])
print cfd['romance']['could']
"""
绘制分布图和分布表
"""
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
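# The heading above promises a graph and a table, so a minimal sketch of
# both calls (each also accepts optional samples/conditions arguments):
cfd.plot()       # one line per target word, x-axis = year of each address
cfd.tabulate()   # the same counts as a table, one column per year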
"""
条件频率分布:
cfdist = ConditionalFreqDist(pairs) 从配对链表中创建条件频率分布
cfdist.conditions() 将条件按字母排序
cfdist[condition] 此条件下的频率分布
cfdist[condition][sample] 此条件下的给定样本的频率
cfdist.tabulate() 为条件频率分布制表
cfdist.tabulate(samples, conditions) 指定样本和条件限制下制表
cfdist.plot() 为条件频率分布绘图
cfdist.plot(samples, conditions) 指定样本和条件限制下绘图
cfdist1 < cfdist2 测试样本在cfdist1中出现次数是否小于在cfdist2中出现次数
"""
"""
停用词语料库
"""
from nltk.corpus import stopwords
print stopwords.words('english')
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    # float() avoids Python 2 integer division truncating the ratio to 0
    return len(content) / float(len(text))
print content_fraction(nltk.corpus.reuters.words())
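# A minimal sketch of the usual stopword-filtering pattern on one tokenized
# sentence; wrapping the word list in a set() makes membership tests faster.
stop = set(stopwords.words('english'))
sentence = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
print [w for w in sentence if w.lower() not in stop]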