Text Preprocessing with NLTK

This article walks through a pipeline for processing English text, covering tokenization, stemming, stopword removal, and low-frequency word filtering, and then shows how to use the NLTK library for part-of-speech tagging, plotting frequency distribution and dispersion plots, and building syntax trees.

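Before running the snippets below, the required NLTK corpora and models need to be downloaded once. A minimal setup sketch (resource names assume a recent NLTK release; the 'book' collection backs the "from nltk.book import *" line used later):

import nltk

# One-time downloads; each call is a no-op if the resource already exists locally.
for resource in ['punkt',                       # sentence/word tokenizer models
                 'stopwords',                   # stopword lists
                 'brown',                       # Brown corpus used below
                 'averaged_perceptron_tagger',  # default POS tagger model
                 'book']:                       # corpora behind `from nltk.book import *`
    nltk.download(resource)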

Tokenization

# coding:utf-8
import nltk
import re
import string
from nltk.corpus import brown
from nltk.book import *
from nltk.tokenize import WordPunctTokenizer
# print(brown.words())
# print(len(brown.sents())) # number of sentences
# print(len(brown.words())) # number of words

def filter_punctuation(words):
    """Remove ASCII and common full-width punctuation from a list of tokens."""
    new_words = []
    illegal_char = string.punctuation + '【·!…()—:“”?《》、;】'
    pattern = re.compile('[%s]' % re.escape(illegal_char))
    for word in words:
        new_word = pattern.sub(u'', word)
        if new_word != u'':
            new_words.append(new_word)
    return new_words
'''English text processing'''
'''Part-of-speech tagging'''
text_en = open(u'./data/text_en.txt', encoding='utf-8-sig', errors='ignore').read()
# text_cn = open(u'./data/text_cn.txt', encoding='utf-8', errors='ignore').read()
text = "Don't hesitate to ask questions. Be positive."
# Split the text into sentences
# from nltk.tokenize import sent_tokenize
# print(sent_tokenize(text))
# Sentence splitting for large batches
# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# print(tokenizer.tokenize(text))

# Tokenization, approach 1
words = nltk.word_tokenize(text_en)
print(words[0:20])
# Tokenization, approach 2: split off punctuation
# tokenizer = WordPunctTokenizer()
# words = tokenizer.tokenize(text)
# print(words)

['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Pride', 'and', 'Prejudice', ',', 'by', 'Jane', 'Austen', 'Chapter', '1', 'It', 'is', 'a', 'truth', 'universally', 'acknowledged']
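For comparison, the commented-out approach 2 splits on every punctuation character, so contractions come apart differently than with word_tokenize. A quick sketch on the short sample sentence:

from nltk.tokenize import WordPunctTokenizer

sample = "Don't hesitate to ask questions."
print(nltk.word_tokenize(sample))             # ['Do', "n't", 'hesitate', 'to', 'ask', 'questions', '.']
print(WordPunctTokenizer().tokenize(sample))  # ['Don', "'", 't', 'hesitate', 'to', 'ask', 'questions', '.']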

Stemming

from nltk.stem import LancasterStemmer
stemmerlan = LancasterStemmer()
words = [stemmerlan.stem(x) for x in words]
print(words[0:20])

['the', 'project', 'gutenberg', 'ebook', 'of', 'prid', 'and', 'prejud', ',', 'by', 'jan', 'aust', 'chapt', '1', 'it', 'is', 'a', 'tru', 'univers', 'acknowledg']
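LancasterStemmer is one of the most aggressive stemmers, which is why 'pride' collapses to 'prid' and 'jane' to 'jan'. If gentler normalization is wanted, PorterStemmer or WordNetLemmatizer (the latter needs the 'wordnet' resource) can be swapped in; a brief comparison sketch:

from nltk.stem import PorterStemmer, WordNetLemmatizer

porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()  # requires nltk.download('wordnet')
for w in ['pride', 'prejudice', 'universally', 'acknowledged']:
    print(w, porter.stem(w), lemmatizer.lemmatize(w))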

Removing stopwords and punctuation


from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
words = [word for word in words if word.lower() not in stops]
words = filter_punctuation(words)
print(words[0:20])

['project', 'gutenberg', 'ebook', 'prid', 'prejud', 'jan', 'aust', 'chapt', '1', 'tru', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wif']
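Since the stopword list becomes a plain Python set, it can be extended with corpus-specific noise words before filtering. A sketch with illustrative extra stopwords (the Gutenberg header terms here are a hypothetical choice, not part of NLTK's list):

# Extend the stopword set with corpus-specific terms (illustrative picks).
custom_stops = stops | {'project', 'gutenberg', 'ebook'}
words = [word for word in words if word.lower() not in custom_stops]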

Filtering low-frequency words

from nltk.probability import FreqDist
fdist = FreqDist(words)
words = [word for word in words if fdist[word] > 5]
print(words[0:20])
['prid', 'prejud', 'jan', 'chapt', 'tru', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wif', 'howev', 'littl', 'known', 'feel', 'view']
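FreqDist offers helpers that make the frequency cutoff easier to choose: most_common lists the top entries with their counts, and hapaxes returns the words that occur exactly once:

print(fdist.most_common(10))  # the 10 highest-frequency stems with their counts
print(fdist.hapaxes()[:10])   # a sample of stems that occur only once
print(fdist['man'])           # the count of a single stem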

Dispersion plot


from nltk.text import Text
# Stem the names so they match the stemmed, lowercased tokens in `words`.
Text(words).dispersion_plot([stemmerlan.stem(x) for x in ["Elizabeth", "Darcy", "Wickham", "Bingley", "Jane"]])

[Figure: dispersion plot of the stemmed character names]

Frequency distribution plot

fdist.plot(20)

[Figure: frequency distribution of the top 20 stems]
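fdist.plot also accepts cumulative=True for a cumulative frequency curve, and fdist.tabulate prints the same counts as plain text when no plotting backend is available:

fdist.plot(20, cumulative=True)  # cumulative curve over the top 20 stems
fdist.tabulate(10)               # text table of the top 10 counts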

POS tagging

the lawyer questioned the witness about the revolver

import nltk
sentence = "the lawyer questioned the witness about the revolver"
words = nltk.word_tokenize(sentence)
tags = nltk.pos_tag(words)
print(tags)
[('the', 'DT'), ('lawyer', 'NN'), ('questioned', 'VBD'), ('the', 'DT'), ('witness', 'NN'), ('about', 'IN'), ('the', 'DT'), ('revolver', 'NN')]
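The meaning of each Penn Treebank tag can be looked up inside NLTK itself (this needs the one-time 'tagsets' download):

nltk.download('tagsets')       # one-time download of the tag documentation
nltk.help.upenn_tagset('VBD')  # verb, past tense
nltk.help.upenn_tagset('DT')   # determiner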

Building a syntax tree

sentence = "the boy saw the dog with a rod"
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> VBD NP | VBD NP PP
PP -> IN NP
NP -> DT NN | DT NN PP
DT -> "the" | "a"
NN -> "boy" | "dog" | "rod"
VBD -> "saw"
IN -> "with"
""")
words = nltk.word_tokenize(sentence)
rd_parser = nltk.RecursiveDescentParser(grammar)
for tree in rd_parser.parse(words):
    print(tree)
(S
  (NP (DT the) (NN boy))
  (VP
    (VBD saw)
    (NP (DT the) (NN dog) (PP (IN with) (NP (DT a) (NN rod))))))
(S
  (NP (DT the) (NN boy))
  (VP
    (VBD saw)
    (NP (DT the) (NN dog))
    (PP (IN with) (NP (DT a) (NN rod)))))
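RecursiveDescentParser is simple but slow and cannot handle left-recursive grammars; nltk.ChartParser is a drop-in alternative for the same CFG, and Tree.pretty_print renders each parse as ASCII art:

chart_parser = nltk.ChartParser(grammar)
for tree in chart_parser.parse(words):
    tree.pretty_print()  # draw the parse tree as ASCII art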