先读入数据
import pandas as pd
# Load the scraped Amazon reviews; the 'review_revs' column holds the review text.
data = pd.read_excel(r'D:\python\zxzy\amazon_asin\review.xlsx')
# NOTE(review): the name says "title" but this series holds the review bodies — confirm.
title = data['review_revs']
data.head(1)  # notebook-style peek at the first row; no effect when run as a script

对每条review进行分句
# Sentence segmentation: split each review into a list of sentences.
# (Fixes the de-indented loop body from the notebook export, which was a SyntaxError.)
import nltk
from nltk.tokenize import sent_tokenize

sent = []
for review in title:
    # str() guards against non-string cells (NaN, numbers) coming from Excel
    sent.append(sent_tokenize(str(review)))
sent[0:3]  # notebook-style peek; no effect when run as a plain script

对分句结果sent进行分词
# Tokenization: split every sentence into words, flattened into one list.
# (Fixes the de-indented nested loop from the notebook export, which was a SyntaxError.)
from nltk.tokenize import word_tokenize

words = []
for review_sentences in sent:
    for sentence in review_sentences:
        words.extend(word_tokenize(sentence))
words[0:5]  # notebook-style peek; no effect when run as a plain script

对分词结果words进行小写处理
# Case normalization: lower-case every token so counting is case-insensitive.
words_lower = list(map(str.lower, words))
对小写处理结果words_lower去除标点词和停用词
# Remove punctuation tokens and English stopwords.
# (Fixes the de-indented loop body from the notebook export, which was a SyntaxError.)
from nltk.corpus import stopwords

english_stopwords = stopwords.words("english")
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*', '...', "'s"]
# Build the drop-set once: O(1) membership per token instead of scanning
# two lists for every word.
_drop = set(english_stopwords) | set(english_punctuations)
words_clear = [w for w in words_lower if w not in _drop]
print("/".join(words_clear[0:10]))

对去除停用词和标点号后的结果words_clear进行词干化处理
# Stemming: reduce each cleaned token to its Porter stem.
from nltk.stem.porter import PorterStemmer

st = PorterStemmer()
words_stem = list(map(st.stem, words_clear))
print("/".join(words_stem[:10]))

将词干化结果转化为text格式
# Wrap the stemmed tokens in an nltk Text object to get its
# corpus-analysis helpers (collocations, concordance, ...).
from nltk.text import Text
word_text = Text(words_stem)
识别评论文本中常用固定词组搭配
# Print the 50 most significant fixed collocations (adjacent word pairs,
# window of 2) found in the review text.
word_text.collocations(num = 50,window_size = 2)

利用Counter计数器统计出现次数最多的前20个单词
# Count token frequencies and show the 20 most common words.
from collections import Counter
words_counter = Counter(word_text)
words_counter.most_common(20)
# Optionally persist the counts to disk (kept disabled).
# NOTE(review): if re-enabled, prefer `with open(...)` so the file is
# closed even on error.
#fo = open(r'D:\python\zxzy\amazon_asin\tshirt.txt', "w")
#for i in words_counter:
# fo.write(i + ":" + str(words_counter[i]) + "\n")
#fo.close()

查看高频词dress上下文结果
# Inspect the surrounding context of a high-frequency word to understand
# what the reviews actually say about it.
# To read a full original review instead, index back into the source data.
word_text.concordance("dress",lines=10)

词性标注
# POS tagging: collect adjectives (ADJ) and nouns (NOUN) from the tokens.
# (Fixes the de-indented loop body from the notebook export, which was a SyntaxError.)
# NOTE(review): tagging is applied to *stems*, which often are not dictionary
# words, so tagger accuracy may suffer — consider tagging before stemming. TODO confirm.
from nltk.tag import pos_tag

ADJ = []
NOUN = []
for word, tag in pos_tag(words_stem, tagset = "universal"):
    if tag == "ADJ":
        ADJ.append(word)
    elif tag == "NOUN":
        NOUN.append(word)
