import pandas as pd
# Load the labelled reviews: a tab-separated file in which a backslash
# escapes the character that follows it.
df = pd.read_csv(
    "./labeledTrainData.tsv",
    sep='\t',
    escapechar='\\',
)
# Preview the first ten rows.
df.head(10)
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
|   | id | sentiment | review |
|---|---|---|---|
| 0 | 5814_8 | 1 | With all this stuff going down at the moment w… |
| 1 | 2381_9 | 1 | “The Classic War of the Worlds” by Timothy Hin… |
| 2 | 7759_3 | 0 | The film starts with a manager (Nicholas Bell)… |
| 3 | 3630_4 | 0 | It must be assumed that those who praised this… |
| 4 | 9495_8 | 1 | Superbly trashy and wondrously unpretentious 8… |
| 5 | 8196_8 | 1 | I dont know why people think this is such a ba… |
| 6 | 7166_2 | 0 | This movie could have been very good, but come… |
| 7 | 10633_1 | 0 | I watched this video at a friend’s house. I’m … |
| 8 | 319_1 | 0 | A friend of mine bought this film for £1, and … |
| 9 | 8713_10 | 1 | <br /><br />This movie is full of references. … |
import re

from bs4 import BeautifulSoup as bs

# Build the word list for the first 100 reviews.
commentList = df["review"].tolist()
commentSplit = commentList[0:100]
# Join with a space: with an empty separator the last word of one review and
# the first word of the next would fuse into a single bogus token.
commentStr = " ".join(commentSplit)
# Strip the HTML markup (e.g. <br />). The space separator keeps words that
# were only separated by a tag from running together once the tag is removed.
soup = bs(commentStr, "lxml").get_text(separator=" ")
soup2 = soup.lower()
# Keep runs of ASCII letters only; digits and punctuation act as delimiters,
# so "i've" is split into "i" and "ve".
result = re.findall(r'[a-zA-Z]+', soup2)
result
# Partial output
['with',
'all',
'this',
'stuff',
'going',
'down',
'at',
'the',
'moment',
'with',
'mj',
'i',
've',
'started',
'listening',
'to',
'his',
'music',
'watching',
'the',
'odd',
'documentary',
'here',
'and',
'there',
'watched',
'the',
...]
# Load the stop-word list.
# header=None: by default read_csv treats the first line as column names, so
# the first stop word would be consumed as a header and never reach the list.
# NOTE(review): assumes stopwords.txt is a bare word list with no header row
# (the printed output starts straight at stop words) -- confirm.
# dtype=str / keep_default_na=False: keep entries such as "null" or "nan" as
# text instead of letting pandas parse them as missing values.
data = pd.read_csv(
    'stopwords.txt',
    header=None,
    dtype=str,
    keep_default_na=False,
)
list2 = data.values
# Only the first column of each row is used.
list3 = [row[0] for row in list2]
# Re-tokenise the list's string form so that entries containing an apostrophe
# are split into separate tokens ("a's" -> "a", "s"), the same way the review
# text is split on non-letters.
list4 = str(list3)
result2 = re.findall(r'\w+', list4)
result2
# Partial output
['ll',
'm',
're',
's',
't',
've',
'ZT',
'ZZ',
'a',
'a',
's',
'able',
'about',
'above',
'abst',
'accordance',
'according',
'accordingly',
'across',
'act',
'actually',
'added',
'adj',
'adopted',
'affected',
'affecting',
'affects',
'after',
'afterwards',
'again',
'against',
'ah',
'ain',
't',
'all',
'allow',
'allows',
'almost',
'alone',
'along',
'already',
'also',
'although',
'always',
'am',
'among',
'amongst',
'an',
'and',
'announce',
'another',
'any',
'anybody',
'anyhow',
'anymore',
'anyone',
'anything',
'anyway',
'anyways',
'anywhere',
'apart',
'apparently',
'appear',
'appreciate',
'appropriate',
'approximately',
...]
# Remove stop words from the review tokens, keeping order and duplicates.
# The stop words go into a set first so each membership test is O(1) instead
# of a linear scan of the whole stop-word list for every token.
_stop_words = set(result2)
list1 = [w for w in result if w not in _stop_words]
list1
# Partial output
['stuff',
'moment',
'mj',
'started',
'listening',
'music',
'watching',
'odd',
'documentary',
'watched',
'wiz',
'watched',
'moonwalker',
'insight',
'guy',
'cool',
'eighties',
'mind',
'guilty',
'innocent',
'moonwalker',
'biography',
'feature',
'film',
'remember',
'cinema',
'originally',
'released',
'subtle',
...]
from collections import Counter

import pyecharts as pye

# The 500 most frequent remaining words as (word, count) pairs, most common
# first. Deliberately not named `dict`: rebinding that name would shadow the
# builtin and break any later dict(...) call in the same session.
top_words = Counter(list1).most_common(500)
# Parallel lists for the chart: the words and their matching counts.
list5 = [word for word, _ in top_words]
list6 = [count for _, count in top_words]

# Render the word cloud from the words and counts.
wordcloud = pye.WordCloud(width=1500, height=1000)
wordcloud.add("", list5, list6, word_size_range=[20, 100])
wordcloud
