Using NLTK
import nltk #import the NLTK package
nltk.download() #open the downloader to fetch corpora and other resources
Using the texts from the official tutorial
from nltk.book import *
Find the contexts in which a particular word occurs
text1.concordance("monstrous") #every occurrence of "monstrous" in text1, shown with its context
Find other words that appear in similar contexts
text1.similar("monstrous")
text2.similar("monstrous")
Find contexts shared by two or more words
text2.common_contexts(["monstrous", "very"])
Plot the positions of words across a text (dispersion plot)
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
Counting a text
len(text3) #length of text3 (number of tokens)
sorted(set(text3)) #all distinct tokens of text3, in sorted order
len(set(text3)) #number of distinct tokens in text3
len(set(text3)) / len(text3) #lexical diversity ("richness") of text3
text3.count("smote") #number of occurrences of "smote" in text3
A "text" is essentially a list of words (a Python list)
sent1 = ['Call', 'me', 'Ishmael', '.'] #define a list sent1
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail'] #concatenate two lists
sent4 + sent1 #concatenate two lists
sent1.append("Some") #append an element to sent1
text4[173] #returns 'awaken'
text4.index('awaken') #returns the index 173
text5[16715:16735]
Words themselves are Python strings
name = 'Monty'
name[:4] #'Mont'
name * 2 #'MontyMonty'
name + '!' #'Monty!'
' '.join(['Monty', 'Python']) #'Monty Python'
'Monty Python'.split() #['Monty', 'Python']
Simple statistics over a text
Using a frequency distribution
fdist1 = FreqDist(text1) #build the frequency distribution of text1
fdist1.most_common(50) #the 50 most common words and their counts
fdist1['whale'] #number of occurrences of the specific word 'whale'
Simple word filtering
V = set(text1)
long_words = [w for w in V if len(w) > 15] #keep only the long words
sorted(long_words)
fdist5 = FreqDist(text5)
sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7) #long words that also occur frequently
Bigrams and collocations
from nltk import bigrams
list(bigrams(['more', 'is', 'said', 'than', 'done'])) #[('more', 'is'), ('is', 'said'), ('said', 'than'), ('than', 'done')] -- all adjacent word pairs
text4.collocations() #bigrams that occur more often than the individual word frequencies would predict
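NLTK also exposes the machinery behind collocations() directly; a minimal sketch of the same idea with the collocation-finder API, assuming text4 from nltk.book is in scope:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text4) #count all adjacent word pairs
finder.apply_freq_filter(3) #ignore bigrams seen fewer than 3 times
finder.nbest(bigram_measures.pmi, 10) #the 10 pairs scoring highest by pointwise mutual information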
Using corpora and lexical resources
Built-in corpora
import nltk
nltk.corpus.gutenberg.fileids() #all file ids in the Gutenberg corpus
emma = nltk.corpus.gutenberg.words('austen-emma.txt') #open one Gutenberg text as a list of words
len(emma)
Or, equivalently:
from nltk.corpus import gutenberg
gutenberg.fileids()
emma = gutenberg.words('austen-emma.txt')
Extracting summary statistics from each text in a corpus
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid) #avg word length, avg sentence length, lexical diversity
Another example: the Brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.sents(categories=['news', 'editorial', 'reviews'])
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
Basic functions of the built-in corpus readers
#Example Description
fileids() #the files of the corpus
fileids([categories]) #the files of the corpus corresponding to these categories
categories() #the categories of the corpus
categories([fileids]) #the categories of the corpus corresponding to these files
raw() #the raw content of the corpus
raw(fileids=[f1,f2,f3]) #the raw content of the specified files
raw(categories=[c1,c2]) #the raw content of the specified categories
words() #the words of the whole corpus
words(fileids=[f1,f2,f3]) #the words of the specified fileids
words(categories=[c1,c2]) #the words of the specified categories
sents() #the sentences of the whole corpus
sents(fileids=[f1,f2,f3]) #the sentences of the specified fileids
sents(categories=[c1,c2]) #the sentences of the specified categories
abspath(fileid) #the location of the given file on disk
encoding(fileid) #the encoding of the file (if known)
open(fileid) #open a stream for reading the given corpus file
root #the path to the root of the locally installed corpus
readme() #the contents of the README file of the corpus
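A quick illustration of a few of these accessors on the Gutenberg corpus (only an example; any built-in corpus works the same way):
from nltk.corpus import gutenberg
gutenberg.fileids()[:3] #the first three file ids
len(gutenberg.raw('austen-emma.txt')) #number of characters in one file
gutenberg.sents('austen-emma.txt')[1] #one sentence, as a list of words
gutenberg.readme()[:200] #the start of the corpus README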
Loading a local corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/usr/share/dict'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
wordlists.words('connectives')
Conditional frequency distributions
A frequency distribution counts observable events, such as the occurrences of words in a text. A conditional frequency distribution pairs each event with a condition, so instead of processing a single sequence of words we process a sequence of (condition, word) pairs:
genre_word = [(genre, word) for genre in ['news', 'romance'] for word in brown.words(categories=genre)]
cfd = nltk.ConditionalFreqDist(genre_word)
cfd['romance'].most_common(20)
cfd['romance']['could']
Lexical resources
Using a word list
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)
unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))
Using the stop-word list
from nltk.corpus import stopwords
stopwords.words('english')
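A common use of the stop-word list, sketched here following the NLTK book, is measuring what fraction of a text is made up of content words (the Reuters corpus is just an example):
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)
content_fraction(nltk.corpus.reuters.words()) #roughly 0.74 in the book's example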
Using the English first-name word lists
names = nltk.corpus.names
names.fileids() #['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
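A related sketch from the NLTK book: a conditional frequency distribution of each name's final letter, conditioned on the file (gender) it came from, shows that names ending in a, e and i are mostly female:
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()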
Using the CMU Pronouncing Dictionary
entries = nltk.corpus.cmudict.entries()
for entry in entries[42371:42379]:
    print(entry)
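Each entry is a (word, phoneme-list) pair, so the dictionary form gives direct lookups; a small sketch ('fire' is just an example word with two pronunciations):
prondict = nltk.corpus.cmudict.dict()
prondict['fire'] #[['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]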
Using WordNet
from nltk.corpus import wordnet as wn
wn.synsets('motorcar') #[Synset('car.n.01')]
wn.synset('car.n.01').lemma_names() #['car', 'auto', 'automobile', 'machine', 'motorcar']
wn.synset('car.n.01').definition() #'a motor vehicle with four wheels; usually propelled by an internal combustion engine'
wn.synset('car.n.01').examples() #['he needs a car to get to work']
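WordNet also exposes the hypernym/hyponym hierarchy; a short sketch continuing the car.n.01 example:
motorcar = wn.synset('car.n.01')
motorcar.hypernyms() #[Synset('motor_vehicle.n.01')]
types_of_motorcar = motorcar.hyponyms() #more specific kinds of car
sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas())[:5]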
Processing raw text
Reading raw text from the web
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
type(raw) #<class 'str'>
Reading raw text from a local file
f = open('document.txt')
raw = f.read()
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
raw = open(path).read() #the 'rU' mode used in older examples was removed in Python 3.11
Getting input from the user
from nltk import word_tokenize
s = input("Enter some text: ")
print("You typed", len(word_tokenize(s)), "words.")
The raw text is an ordinary string, so string methods apply to it
raw.find("PART I")
raw = raw[5338:1157743]
Tokenize the raw text and wrap the tokens in an nltk.Text
tokens = word_tokenize(raw)
type(tokens) #<class 'list'>
text = nltk.Text(tokens)
type(text) #<class 'nltk.text.Text'>
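Once wrapped in nltk.Text, the methods from the first section work on this downloaded text too, for example:
text.collocations() #frequent word pairs such as character names tend to surface here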
Pattern matching on text with regular expressions
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()] #the word list that the searches below run over
[w for w in wordlist if re.search('ed$', w)]
[w for w in wordlist if re.search('^..j..t..$', w)]
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]
Operator Behavior
. #Wildcard, matches any character
^abc #Matches some pattern abc at the start of a string
abc$ #Matches some pattern abc at the end of a string
[abc] #Matches one of a set of characters
[A-Z0-9] #Matches one of a range of characters
ed|ing|s #Matches one of the specified strings (disjunction)
* #Zero or more of previous item, e.g. a*, [a-z]* (also known as *Kleene Closure*)
+ #One or more of previous item, e.g. a+, [a-z]+
? #Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?
{n} #Exactly n repeats where n is a non-negative integer
{n,} #At least n repeats
{,n} #No more than n repeats
{m,n} #At least m and no more than n repeats
a(b|c)+ #Parentheses that indicate the scope of the operators
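Two of the operators from the table, applied to the same wordlist as above (a sketch):
[w for w in wordlist if re.search(r'(ed|ing)$', w)][:10] #disjunction combined with an end anchor
[w for w in wordlist if re.search(r'^[a-z]{15,}$', w)][:10] #at least 15 letters, lowercase only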
Normalizing text: stemming
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens] #the Porter stemmer, generally the better choice of the two
[lancaster.stem(t) for t in tokens]
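When a dictionary-valid form is wanted rather than a raw stem, the WordNet lemmatizer is the usual alternative; a sketch over the same tokens:
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens] #only removes affixes when the result is a word in WordNet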
Segmenting text into sentences
import pprint
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])
Categorizing and tagging words
zip and enumerate
words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']
print(list(zip(words, tags))) #[('I', 'noun'), ('turned', 'verb'), ('off', 'prep'), ('the', 'det'), ('spectroroute', 'noun')]
print(list(enumerate(words))) #[(0, 'I'), (1, 'turned'), (2, 'off'), (3, 'the'), (4, 'spectroroute')]
filter and map
def is_content_word(word):
    return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the', 'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']
print(list(filter(is_content_word, sent))) #['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']
lengths = list(map(len, nltk.corpus.brown.sents(categories='news')))
print(sum(lengths) / len(lengths))
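The same results can also be written with list comprehensions, without filter and map:
print([w for w in sent if is_content_word(w)]) #equivalent to the filter example
lengths = [len(s) for s in nltk.corpus.brown.sents(categories='news')]
print(sum(lengths) / len(lengths)) #equivalent to the map example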
Part-of-speech tagging
text = word_tokenize("And now for something completely different")
nltk.pos_tag(text) #[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),('completely', 'RB'), ('different', 'JJ')]
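Tags also disambiguate homographs; in the NLTK book's example the first refuse/permit are tagged as verbs and the second as nouns:
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text) #[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]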
Splitting tagged text into word/tag pairs
>>> tagged_token = nltk.tag.str2tuple('fly/NN')
>>> tagged_token
('fly', 'NN')
>>> tagged_token[0]
'fly'
>>> tagged_token[1]
'NN'
>>> sent = '''
... The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
... other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
... Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
... said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
... accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
... interest/NN of/IN both/ABX governments/NNS ''/'' ./.
... '''
>>> [nltk.tag.str2tuple(t) for t in sent.split()]
[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'),('on', 'IN'), ('a', 'AT'), ('number', 'NN'), ... ('.', '.')]
Reading the built-in tagged corpora
>>> nltk.corpus.brown.tagged_words()
[('The', 'AT'), ('Fulton', 'NP-TL'), ...]
>>> nltk.corpus.brown.tagged_words(tagset='universal')
[('The', 'DET'), ('Fulton', 'NOUN'), ...]
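A sketch of what the tagged corpus makes easy: a frequency distribution over the universal tags of the news category shows which word classes dominate:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common() #NOUN, VERB, ADP, ... head the list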
Finding similar words from their contexts
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('bought') #made done put said found had seen given left heard been brought got set was called felt in that told
Dictionaries with default values (collections.defaultdict)
>>> from collections import defaultdict
>>> frequency = defaultdict(int)
>>> frequency['colorless'] = 4
>>> frequency['ideas']
0
>>> pos = defaultdict(list)
>>> pos['sleep'] = ['NOUN', 'VERB']
>>> pos['ideas']
[]
Automatic tagging
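The simplest starting point, following the NLTK book, is a default tagger that assigns the single most likely tag ('NN') to every token; a minimal sketch:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(nltk.word_tokenize('I do not like green eggs and ham')) #every token gets 'NN'
default_tagger.evaluate(brown_tagged_sents) #about 0.13 accuracy on Brown news, a weak baseline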