2.1 获取文本语料库
2.1.1 古腾堡语料库
import nltk
from nltk.corpus import gutenberg

# List the file identifiers available in the Gutenberg corpus,
# using the fully qualified access path.
print(nltk.corpus.gutenberg.fileids())

# Jane Austen's "Emma" (austen-emma.txt): count how many word tokens it has.
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
print(len(emma))

# Show the contexts in which "surprize" occurs in austen-emma.txt.
# Text.concordance() prints its matches itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None".
emma_text = nltk.Text(emma)
emma_text.concordance('surprize')

# With gutenberg imported directly, the corpus can be used without typing
# the full nltk.corpus.gutenberg path every time.
print(gutenberg.fileids())
emma = gutenberg.words('austen-emma.txt')
print(emma)
循环遍历 gutenberg 的每个文件标识符(fileid),并为每个文本计算统计量。其中 raw() 函数返回未经任何语言学处理的文件原始内容。
for fileid in gutenberg.fileids():
num_chars = len(gutenberg.raw(fileid))
num_words = len(gutenberg.words(fileid))
num_sents = len(gutenberg.sents(fileid))
print(int(num_chars/num_words),int(num_words