NLTK Study Notes


Using NLTK

import nltk  # import nltk
nltk.download()  # open the downloader to fetch corpora and other data

Load the example texts from the official NLTK book

from nltk.book import *

Find the contexts of a particular word in a text

text1.concordance("monstrous") # every occurrence of 'monstrous' in text1, shown with its context

Find words used in similar contexts

text1.similar("monstrous")
text2.similar("monstrous")

Find contexts shared by two or more words

text2.common_contexts(["monstrous", "very"])

Plot where words appear in a text (dispersion plot)

text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

Counting words in a text

len(text3)   # number of tokens in text3
sorted(set(text3))  # the distinct words of text3, in sorted order
len(set(text3))  # number of distinct words (types) in text3
len(set(text3)) / len(text3)  # lexical diversity of text3
text3.count("smote")  # number of times 'smote' occurs in text3
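
These counts can be wrapped in small helper functions, in the style of the NLTK book (a sketch; the function names are not part of the original notes):

def lexical_diversity(text):
  return len(set(text)) / len(text)   # proportion of distinct words

def percentage(count, total):
  return 100 * count / total

lexical_diversity(text3)
percentage(text4.count('a'), len(text4))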

A "text" is essentially a Python list of words

sent1 = ['Call', 'me', 'Ishmael', '.']   # define a list of words, sent1
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']  # concatenate two lists
sent4 + sent1  # concatenate two lists
sent1.append("Some")   # append an element to sent1
text4[173]  # returns 'awaken'
text4.index('awaken')  # returns the index 173
text5[16715:16735]  # a slice of 20 tokens from text5

Words themselves are Python strings

name = 'Monty'
name[:4]  #'Mont'
name * 2  #'MontyMonty'
name + '!'  #'Monty!'
' '.join(['Monty', 'Python'])  #'Monty Python'
'Monty Python'.split()  #['Monty', 'Python']

Simple statistics over a text

Using a frequency distribution (FreqDist)

fdist1 = FreqDist(text1)   # build a frequency distribution over text1
fdist1.most_common(50)  # the 50 most common words and their counts
fdist1['whale']  # how many times the word 'whale' occurs
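
A FreqDist offers more than raw counts; for example, a cumulative frequency plot and the list of hapaxes (words occurring only once). A brief sketch:

fdist1.plot(50, cumulative=True)  # cumulative frequency plot of the 50 most common words
fdist1.hapaxes()  # words that occur only once in text1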

Simple word filtering

V = set(text1)
long_words = [w for w in V if len(w) > 15]  # words longer than 15 characters
sorted(long_words)

fdist5 = FreqDist(text5)
sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7) # words longer than 7 characters that occur more than 7 times

Bigrams and collocations

list(bigrams(['more', 'is', 'said', 'than', 'done']))  # [('more', 'is'), ('is', 'said'), ('said', 'than'), ('than', 'done')], all adjacent bigrams

text4.collocations()  # bigrams that occur more often than expected from the frequencies of the individual words


Author: hitsunbo. Source: Jianshu, https://www.jianshu.com/p/b296dbadc40c
Copyright belongs to the author; credit the source for non-commercial reuse, contact the author for commercial reuse.


Using corpora and lexical resources


Built-in corpora

import nltk
nltk.corpus.gutenberg.fileids()  # all file ids in the Gutenberg corpus
emma = nltk.corpus.gutenberg.words('austen-emma.txt')  # load one Gutenberg text as a list of words
len(emma)

Or, equivalently:

from nltk.corpus import gutenberg
gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')

Extracting statistics from a corpus

for fileid in gutenberg.fileids():
  num_chars = len(gutenberg.raw(fileid))
  num_words = len(gutenberg.words(fileid))
  num_sents = len(gutenberg.sents(fileid))
  num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
  print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid)  # avg word length, avg sentence length, lexical diversity score

Another example: the Brown corpus

from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.sents(categories=['news', 'editorial', 'reviews'])

cfd = nltk.ConditionalFreqDist(
  (genre, word)
  for genre in brown.categories()
  for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
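
The same comparison can be drawn as a line plot; a minimal sketch:

cfd.plot(conditions=genres, samples=modals)  # one line per genre, modal verbs on the x-axis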

Basic corpus-reader methods

#Example  Description
fileids()  #the files of the corpus
fileids([categories])  #the files of the corpus corresponding to these categories
categories()  #the categories of the corpus
categories([fileids])  #the categories of the corpus corresponding to these files
raw() #the raw content of the corpus
raw(fileids=[f1,f2,f3]) #the raw content of the specified files
raw(categories=[c1,c2]) #the raw content of the specified categories
words() #the words of the whole corpus
words(fileids=[f1,f2,f3]) #the words of the specified fileids
words(categories=[c1,c2]) #the words of the specified categories
sents() #the sentences of the whole corpus
sents(fileids=[f1,f2,f3]) #the sentences of the specified fileids
sents(categories=[c1,c2]) #the sentences of the specified categories
abspath(fileid) #the location of the given file on disk
encoding(fileid) #the encoding of the file (if known)
open(fileid) #open a stream for reading the given corpus file
root   #the path to the root of the locally installed corpus
readme() #the contents of the README file of the corpus
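
A few of these methods applied to the Gutenberg corpus, as a quick illustration:

gutenberg.raw('austen-emma.txt')[:75]  # first 75 characters of the raw text
gutenberg.abspath('austen-emma.txt')   # location of the file on disk
gutenberg.readme()                     # contents of the corpus README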

Loading a local corpus

from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')

Conditional frequency distributions

A frequency distribution counts observable events, such as the occurrences of words in a text. A conditional frequency distribution pairs each event with a condition, so instead of processing a single sequence of words we process a sequence of (condition, word) pairs:

genre_word = [(genre, word) for genre in ['news', 'romance'] for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
cfd['romance'].most_common(20)
cfd['romance']['could']
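
tabulate() also works on a conditional distribution; for example, comparing how often each day of the week appears in the two genres (a sketch):

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
cfd.tabulate(samples=days)  # counts of each day under the 'news' and 'romance' conditions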

Lexical resources

Using a wordlist

def unusual_words(text):
  text_vocab = set(w.lower() for w in text if w.isalpha())
  english_vocab = set(w.lower() for w in nltk.corpus.words.words())
  unusual = text_vocab - english_vocab 
  return sorted(unusual)

unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))

Using the stopword list

from nltk.corpus import stopwords
stopwords.words('english')
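
For example, the stopword list can be used to measure what fraction of a text consists of content words (a sketch following the NLTK book; the Reuters corpus is used only as an illustration):

def content_fraction(text):
  stopwords = nltk.corpus.stopwords.words('english')
  content = [w for w in text if w.lower() not in stopwords]
  return len(content) / len(text)

content_fraction(nltk.corpus.reuters.words())  # roughly 0.74: about a quarter of the tokens are stopwords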

Using the English name lists

names = nltk.corpus.names
names.fileids()  # ['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]  # names that appear in both lists
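
A conditional frequency distribution over the final letter of each name shows how strongly last letters correlate with gender (a sketch of the book's example):

cfd = nltk.ConditionalFreqDist(
  (fileid, name[-1])
  for fileid in names.fileids()
  for name in names.words(fileid))
cfd.plot()  # one line per file (female.txt, male.txt), final letters on the x-axis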

Using the pronouncing dictionary

entries = nltk.corpus.cmudict.entries()
for entry in entries[42371:42379]:
  print(entry)
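
Each entry pairs a word with its list of phones, so we can, for example, look up words whose pronunciation ends with a given phone sequence (rhymes), as in the book:

syllable = ['N', 'IH0', 'K', 'S']
[word for word, pron in entries if pron[-4:] == syllable]  # words rhyming with 'nicks'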

Using WordNet

from nltk.corpus import wordnet as wn
wn.synsets('motorcar')   #[Synset('car.n.01')]
wn.synset('car.n.01').lemma_names()  #['car', 'auto', 'automobile', 'machine', 'motorcar']
wn.synset('car.n.01').definition()   #'a motor vehicle with four wheels; usually propelled by an internal combustion engine'
wn.synset('car.n.01').examples()  #['he needs a car to get to work']
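
Synsets are also linked by relations such as hyponymy and hypernymy; a brief sketch:

motorcar = wn.synset('car.n.01')
motorcar.hyponyms()        # more specific synsets, e.g. ambulance, limousine, ...
motorcar.hypernyms()       # more general synsets: [Synset('motor_vehicle.n.01')]
motorcar.hypernym_paths()  # full paths from the synset up to the root entity.n.01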


Author: hitsunbo. Source: Jianshu, https://www.jianshu.com/p/60d0c5e265d7
Copyright belongs to the author; credit the source for non-commercial reuse, contact the author for commercial reuse.

Processing raw text


Reading raw text from the web

from urllib import request
url = "http://www.gutenberg.org/files/2554/2554.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
type(raw)  #<class 'str'>

Reading raw text from a local file

f = open('document.txt')
raw = f.read()

path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
raw = open(path).read()  # the old 'rU' mode is unnecessary in Python 3 and was removed in 3.11

Getting user input

from nltk import word_tokenize  # needed for word_tokenize below
s = input("Enter some text: ")
print("You typed", len(word_tokenize(s)), "words.")

Raw text is just a string, so ordinary string methods can be used on it

raw.find("PART I")  # character offset where "PART I" first appears
raw = raw[5338:1157743]  # slice the string down to the body of the book

Tokenize the raw text and wrap the tokens in a Text object

tokens = word_tokenize(raw)
type(tokens)  #<class 'list'>
text = nltk.Text(tokens)
type(text)  #<class 'nltk.text.Text'>

Pattern matching with regular expressions

import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]  # a plain English wordlist
[w for w in wordlist if re.search('ed$', w)]  # words ending in 'ed'
[w for w in wordlist if re.search('^..j..t..$', w)]  # 8-letter words with 'j' as 3rd and 't' as 6th letter
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]  # e.g. 'gold', 'golf', 'hold', 'hole'
#Operator  Behavior
.  #Wildcard, matches any character
^abc  #Matches some pattern abc at the start of a string
abc$  #Matches some pattern abc at the end of a string
[abc]  #Matches one of a set of characters
[A-Z0-9]  #Matches one of a range of characters
ed|ing|s  #Matches one of the specified strings (disjunction)
*   #Zero or more of previous item, e.g. a*, [a-z]* (also known as *Kleene Closure*)
+  #One or more of previous item, e.g. a+, [a-z]+
?  #Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?
{n}   #Exactly n repeats where n is a non-negative integer
{n,}  #At least n repeats
{,n}  #No more than n repeats
{m,n}  #At least m and no more than n repeats
a(b|c)+  #Parentheses that indicate the scope of the operators
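
re.search() only tests whether a pattern occurs; re.findall() extracts every match. For example, listing and counting the vowels in a word:

word = 'supercalifragilisticexpialidocious'
re.findall(r'[aeiou]', word)       # every vowel character, in order
len(re.findall(r'[aeiou]', word))  # 16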

Normalizing text (stemming)

porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens]  # the Porter stemmer is usually the better choice
[lancaster.stem(t) for t in tokens]
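
Stemmers simply strip suffixes; the WordNet lemmatizer instead maps a word to its dictionary form, only removing an affix when the result is in WordNet. A sketch:

wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens]  # e.g. 'women' -> 'woman', while 'lying' is left unchanged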

Sentence segmentation

import pprint
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])


Author: hitsunbo. Source: Jianshu, https://www.jianshu.com/p/8cdbdadd8111
Copyright belongs to the author; credit the source for non-commercial reuse, contact the author for commercial reuse.

Categorizing and tagging words


zip and enumerate

words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']
print(list(zip(words, tags)))  #[('I', 'noun'), ('turned', 'verb'), ('off', 'prep'), ('the', 'det'), ('spectroroute', 'noun')]
print(list(enumerate(words)))   #[(0, 'I'), (1, 'turned'), (2, 'off'), (3, 'the'), (4, 'spectroroute')]

filter and map

def is_content_word(word):
  return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']
print(list(filter(is_content_word, sent)))  #['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']

lengths = list(map(len, nltk.corpus.brown.sents(categories='news')))
print(sum(lengths) / len(lengths))

Part-of-speech tagging

text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)  #[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),('completely', 'RB'), ('different', 'JJ')]
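
NLTK's built-in help explains what each tag means:

nltk.help.upenn_tagset('RB')    # definition and examples for the adverb tag RB
nltk.help.upenn_tagset('NN.*')  # a regular expression over tag names: NN, NNP, NNPS, NNS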

Splitting tagged text into (word, tag) pairs

>>> tagged_token = nltk.tag.str2tuple('fly/NN')
>>> tagged_token
('fly', 'NN')
>>> tagged_token[0]
'fly'
>>> tagged_token[1]
'NN'

>>> sent = '''
... The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
... other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
... Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
... said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
... accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
... interest/NN of/IN both/ABX governments/NNS ''/'' ./.
... '''
>>> [nltk.tag.str2tuple(t) for t in sent.split()]
[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'),('on', 'IN'), ('a', 'AT'), ('number', 'NN'), ... ('.', '.')]

Reading built-in tagged corpora

>>> nltk.corpus.brown.tagged_words()
[('The', 'AT'), ('Fulton', 'NP-TL'), ...]
>>> nltk.corpus.brown.tagged_words(tagset='universal')
[('The', 'DET'), ('Fulton', 'NOUN'), ...]
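
A frequency distribution over the tags shows which parts of speech dominate a category, e.g. the news section of the Brown corpus (a sketch):

brown_news_tagged = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()  # NOUN is the most frequent tag, followed by VERB, ADP, ...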

Finding similar words from context

text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('bought')  #made done put said found had seen given left heard been brought got set was called felt in that told

Dictionaries with default values (defaultdict)

>>> from collections import defaultdict
>>> frequency = defaultdict(int)
>>> frequency['colorless'] = 4
>>> frequency['ideas']
0
>>> pos = defaultdict(list)
>>> pos['sleep'] = ['NOUN', 'VERB']
>>> pos['ideas']
[]
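
The default can also be a function, which is handy for mapping unseen words to a fallback tag (a sketch along the lines of the book's example):

pos = defaultdict(lambda: 'NOUN')
pos['colorless'] = 'ADJ'
pos['blog']        # 'NOUN' -- the default value is created on first access
list(pos.items())  # [('colorless', 'ADJ'), ('blog', 'NOUN')]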

Automatic tagging
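
The simplest automatic tagger assigns the single most likely tag to every token; a minimal sketch following the NLTK book's default-tagger example on the Brown news category:

from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()  # 'NN' is the single most frequent tag in the news category
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(nltk.word_tokenize("I do not like green eggs and ham"))  # every token gets 'NN'
default_tagger.evaluate(brown_tagged_sents)  # roughly 0.13 (use accuracy() in newer NLTK releases)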



Author: hitsunbo. Source: Jianshu, https://www.jianshu.com/p/2f80d8d1eab2
Copyright belongs to the author; credit the source for non-commercial reuse, contact the author for commercial reuse.
