Using NLTK
import nltk #import the NLTK package
nltk.download() #open the downloader to fetch corpora and other resources
Using the texts from the official tutorial
from nltk.book import *
Find the contexts in which a particular word occurs
text1.concordance("monstrous") #every occurrence of "monstrous" in text1, shown with its context
Find other words that appear in similar contexts
text1.similar("monstrous")
text2.similar("monstrous")
Find contexts shared by two or more words
text2.common_contexts(["monstrous", "very"])
Plot the positions of words across a text (dispersion plot)
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
Counting a text
len(text3) #length of text3 (number of tokens)
sorted(set(text3)) #all distinct tokens of text3, in sorted order
len(set(text3)) #number of distinct tokens in text3
len(set(text3)) / len(text3) #lexical diversity ("richness") of text3
text3.count("smote") #number of occurrences of "smote" in text3
A "text" is essentially a list of words (a Python list)
sent1 = ['Call', 'me', 'Ishmael', '.'] #define a list sent1
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail'] #concatenate two lists
sent4 + sent1 #concatenate two lists
sent1.append("Some") #append an element to sent1
text4[173] #returns 'awaken'
text4.index('awaken') #returns the index 173
text5[16715:16735]
Words themselves are Python strings
name = 'Monty'
name[:4] #'Mont'
name * 2 #'MontyMonty'
name + '!' #'Monty!'
' '.join(['Monty', 'Python']) #'Monty Python'
'Monty Python'.split() #['Monty', 'Python']
Simple statistics over a text
Using a frequency distribution
fdist1 = FreqDist(text1) #build the frequency distribution of text1
fdist1.most_common(50) #the 50 most common words and their counts
fdist1['whale'] #number of occurrences of the specific word 'whale'
Simple word filtering
V = set(text1)
long_words = [w for w in V if len(w) > 15] #keep only the long words
sorted(long_words)
fdist5 = FreqDist(text5)
sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7) #long words that also occur frequently
Bigrams and collocations
from nltk import bigrams
list(bigrams(['more', 'is', 'said', 'than', 'done'])) #[('more', 'is'), ('is', 'said'), ('said', 'than'), ('than', 'done')] -- all adjacent word pairs
text4.collocations() #bigrams that occur more often than the individual word frequencies would predict
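NLTK also exposes the machinery behind collocations() directly; a minimal sketch of the same idea with the collocation-finder API, assuming text4 from nltk.book is in scope:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text4) #count all adjacent word pairs
finder.apply_freq_filter(3) #ignore bigrams seen fewer than 3 times
finder.nbest(bigram_measures.pmi, 10) #the 10 pairs scoring highest by pointwise mutual information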
Using corpora and lexical resources
Built-in corpora
import nltk
nltk.corpus.gutenberg.fileids() #all file ids in the Gutenberg corpus
emma = nltk.corpus.gutenberg.words('austen-emma.txt') #open one Gutenberg text as a list of words
len(emma)
Or, equivalently:
from nltk.corpus import gutenberg
gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')
Extracting summary statistics from each text in a corpus
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid) #avg word length, avg sentence length, lexical diversity
Another example: the Brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.sents(categories=['news', 'editorial', 'reviews'])
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
Basic functions of the built-in corpus readers
#Example Description
fileids() #the files of the corpus
fileids([categories]) #the files of the corpus corresponding to these categories
categories() #the categories of the corpus
categories([fileids]) #the categories of the corpus corresponding to these files
raw() #the raw content of the corpus
raw(fileids=[f1,f2,f3]) #the raw content of the specified files
raw(categories=[c1,c2]) #the raw content of the specified categories
words() #the words of the whole corpus
words(fileids=[f1,f2,f3]) #the words of the specified fileids
words(categories=[c1,c2]) #the words of the specified categories
sents() #the sentences of the whole corpus
sents(fileids=[f1,f2,f3]) #the sentences of the specified fileids
sents(categories=[c1,c2]) #the sentences of the specified categories
abspath(fileid) #the location of the given file on disk
encoding(fileid) #the encoding of the file (if known)
open(fileid) #open a stream for reading the given corpus file
root #the path to the root of the locally installed corpus
readme() #the contents of the README file of the corpus
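A quick illustration of a few of these accessors on the Gutenberg corpus (only an example; any built-in corpus works the same way):
from nltk.corpus import gutenberg
gutenberg.fileids()[:3] #the first three file ids
len(gutenberg.raw('austen-emma.txt')) #number of characters in one file
gutenberg.sents('austen-emma.txt')[1] #one sentence, as a list of words
gutenberg.readme()[:200] #the start of the corpus README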
Loading a local corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')
Conditional frequency distributions
A frequency distribution counts observable events, such as the occurrences of words in a text. A conditional frequency distribution pairs each event with a condition, so instead of processing a single sequence of words we process a sequence of (condition, word) pairs:
genre_word = [(genre, word) for genre in ['news', 'romance'] for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
cfd['romance'].most_common(20)
cfd['romance']['could']
Lexical resources
Using a word list
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)
unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))
Using the stop-word list
from nltk.corpus import stopwords
stopwords.words('english')
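A common use of the stop-word list, sketched here following the NLTK book, is measuring what fraction of a text is made up of content words (the Reuters corpus is just an example):
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)
content_fraction(nltk.corpus.reuters.words()) #roughly 0.74 in the book's example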
Using the English first-name word lists
names = nltk.corpus.names
names.fileids() #['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
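A related sketch from the NLTK book: a conditional frequency distribution of each name's final letter, conditioned on the file (gender) it came from, shows that names ending in a, e and i are mostly female:
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()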
Using the CMU Pronouncing Dictionary
entries = nltk.corpus.cmudict.entries()
for entry in entries[42371:42379]:
    print(entry)
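Each entry is a (word, phoneme-list) pair, so the dictionary form gives direct lookups; a small sketch ('fire' is just an example word with two pronunciations):
prondict = nltk.corpus.cmudict.dict()
prondict['fire'] #[['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]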
Using WordNet
from nltk.corpus import wordnet as wn
wn.synsets('motorcar') #[Synset('car.n.01')]
wn.synset('car.n.01').lemma_names() #['car', 'auto', 'automobile', 'machine', 'motorcar']
wn.synset('car.n.01').definition() #'a motor vehicle with four wheels; usually propelled by an internal combustion engine'
wn.synset('car.n.01').examples() #['he needs a car to get to work']
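WordNet also exposes the hypernym/hyponym hierarchy; a short sketch continuing the car.n.01 example:
motorcar = wn.synset('car.n.01')
motorcar.hypernyms() #[Synset('motor_vehicle.n.01')]
types_of_motorcar = motorcar.hyponyms() #more specific kinds of car
sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas())[:5]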
Processing raw text
Reading raw text from the web
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
type(raw) #<class 'str'>
Reading raw text from a local file
f = open('document.txt')
raw = f.read()
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
raw = open(path).read() #the 'rU' mode used in older examples was removed in Python 3.11
Getting input from the user
from nltk import word_tokenize
s = input("Enter some text: ")
print("You typed", len(word_tokenize(s)), "words.")
The raw text is an ordinary string, so string methods apply to it
raw.find("PART I")
raw = raw[5338:1157743]
Tokenize the raw text and wrap the tokens in an nltk.Text
tokens = word_tokenize(raw)
type(tokens) #<class 'list'>
text = nltk.Text(tokens)
type(text) #<class 'nltk.text.Text'>
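Once wrapped in nltk.Text, the methods from the first section work on this downloaded text too, for example:
text.collocations() #frequent word pairs such as character names tend to surface here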
Pattern matching on text with regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()] #the word list that the searches below run over
[w for w in wordlist if re.search('ed$', w)]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]
Operator Behavior
. #Wildcard, matches any character
^abc #Matches some pattern abc at the start of a string
abc$ #Matches some pattern abc at the end of a string
[abc] #Matches one of a set of characters
[A-Z0-9] #Matches one of a range of characters
ed|ing|s #Matches one of the specified strings (disjunction)
* #Zero or more of previous item, e.g. a*, [a-z]* (also known as *Kleene Closure*)
+ #One or more of previous item, e.g. a+, [a-z]+
? #Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?
{n} #Exactly n repeats where n is a non-negative integer
{n,} #At least n repeats
{,n} #No more than n repeats
{m,n} #At least m and no more than n repeats
a(b|c)+ #Parentheses that indicate the scope of the operators
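Two of the operators from the table, applied to the same wordlist as above (a sketch):
[w for w in wordlist if re.search(r'(ed|ing)$', w)][:10] #disjunction combined with an end anchor
[w for w in wordlist if re.search(r'^[a-z]{15,}$', w)][:10] #at least 15 letters, lowercase only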
Normalizing text: stemming
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens] #the Porter stemmer, generally the better choice of the two
[lancaster.stem(t) for t in tokens]
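When a dictionary-valid form is wanted rather than a raw stem, the WordNet lemmatizer is the usual alternative; a sketch over the same tokens:
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens] #only removes affixes when the result is a word in WordNet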
Segmenting text into sentences
import pprint
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])
Categorizing and tagging words
zip and enumerate
words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']
print(list(zip(words, tags))) #[('I', 'noun'), ('turned', 'verb'), ('off', 'prep'), ('the', 'det'), ('spectroroute', 'noun')]
print(list(enumerate(words))) #[(0, 'I'), (1, 'turned'), (2, 'off'), (3, 'the'), (4, 'spectroroute')]
filter and map
def is_content_word(word):
    return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']
print(list(filter(is_content_word, sent))) #['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']
lengths = list(map(len, nltk.corpus.brown.sents(categories='news')))
print(sum(lengths) / len(lengths))
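The same results can also be written with list comprehensions, without filter and map:
print([w for w in sent if is_content_word(w)]) #equivalent to the filter example
lengths = [len(s) for s in nltk.corpus.brown.sents(categories='news')]
print(sum(lengths) / len(lengths)) #equivalent to the map example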
Part-of-speech tagging
text = word_tokenize("And now for something completely different")
nltk.pos_tag(text) #[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),('completely', 'RB'), ('different', 'JJ')]
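Tags also disambiguate homographs; in the NLTK book's example the first refuse/permit are tagged as verbs and the second as nouns:
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text) #[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]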
Splitting tagged text into word/tag pairs
>>> tagged_token = nltk.tag.str2tuple('fly/NN')
>>> tagged_token
('fly', 'NN')
>>> tagged_token[0]
'fly'
>>> tagged_token[1]
'NN'
>>> sent = '''
... The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
... other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
... Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
... said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
... accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
... interest/NN of/IN both/ABX governments/NNS ''/'' ./.
... '''
>>> [nltk.tag.str2tuple(t) for t in sent.split()]
[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'),('on', 'IN'), ('a', 'AT'), ('number', 'NN'), ... ('.', '.')]
Reading the built-in tagged corpora
>>> nltk.corpus.brown.tagged_words()
[('The', 'AT'), ('Fulton', 'NP-TL'), ...]
>>> nltk.corpus.brown.tagged_words(tagset='universal')
[('The', 'DET'), ('Fulton', 'NOUN'), ...]
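A sketch of what the tagged corpus makes easy: a frequency distribution over the universal tags of the news category shows which word classes dominate:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common() #NOUN, VERB, ADP, ... head the list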
Finding similar words from their contexts
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('bought') #made done put said found had seen given left heard been brought got set was called felt in that told
Dictionaries with default values (collections.defaultdict)
>>> from collections import defaultdict
>>> frequency = defaultdict(int)
>>> frequency['colorless'] = 4
>>> frequency['ideas']
0
>>> pos = defaultdict(list)
>>> pos['sleep'] = ['NOUN', 'VERB']
>>> pos['ideas']
[]
Automatic tagging
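The simplest starting point, following the NLTK book, is a default tagger that assigns the single most likely tag ('NN') to every token; a minimal sketch:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(nltk.word_tokenize('I do not like green eggs and ham')) #every token gets 'NN'
default_tagger.evaluate(brown_tagged_sents) #about 0.13 accuracy on Brown news, a weak baseline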