1. Scraping Oil Price
from bs4 import BeautifulSoup
import time
from urllib.request import urlopen
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Scrape the first page
url = 'https://oilprice.com/Energy/Crude-Oil/'
html = urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
Title = []
Date = []
all_title = soup.find_all('h2', class_='categoryArticle__title')  # locate the headline tags
for x in all_title:
    Title.append(str(x))
all_date = soup.find_all('p', {"class": "categoryArticle__meta"})
for x in all_date:
    Date.append(str(x))
Excerpt = re.findall(r'<p class="categoryArticle__excerpt">(.+?)</p>', html, flags=re.DOTALL)
page = pd.DataFrame({'Title': Title, 'Date and Author': Date, 'Excerpt': Excerpt})
page.to_excel(str(1) + '.xlsx')
# Scrape the following pages
url = 'https://oilprice.com/Energy/Crude-Oil/Page-'

def get_page(url, i):
    Title = []
    Date = []
    html = urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')
    all_title = soup.find_all('h2', class_='categoryArticle__title')  # locate the headline tags
    for x in all_title:
        Title.append(str(x))
    all_date = soup.find_all('p', {"class": "categoryArticle__meta"})
    for x in all_date:
        Date.append(str(x))
    Excerpt = re.findall(r'<p class="categoryArticle__excerpt">(.+?)</p>', html, flags=re.DOTALL)
    page = pd.DataFrame({'Title': Title, 'Date and Author': Date, 'Excerpt': Excerpt})
    page.to_excel(str(i) + '.xlsx')

def get_more_pages(start, end):
    for one in range(start, end):
        get_page(url + str(one) + '.html', one)
        time.sleep(2)

get_more_pages(2, 20)
# Read the per-page files back in and stack them
df = pd.read_excel(str(1) + '.xlsx')
for i in range(2, 20):
    a = pd.read_excel(str(i) + '.xlsx')
    df = pd.concat([df, a], axis=0)
# Save the combined table to Excel
df.to_excel('Info.xlsx')
2. Text Processing
(1) Simple column splitting in Excel
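The same split can also be sketched in pandas instead of Excel. This is only a sketch: it assumes the combined file is Info.xlsx from the step above, that the saved cells still contain the raw HTML tags (they were stored with str(x)), and that the meta text separates date and author with a '|' character, which should be verified against the scraped data.
import pandas as pd

info = pd.read_excel('Info.xlsx')
# strip the HTML tags that str(x) kept in the saved cells
info['Title'] = info['Title'].str.replace(r'<[^>]+>', '', regex=True).str.strip()
meta = info['Date and Author'].str.replace(r'<[^>]+>', '', regex=True).str.strip()
# assumed format of the meta text: "<date> | <author>"
parts = meta.str.split('|', n=1, expand=True)
info['Date'] = parts[0].str.strip()
info['Author'] = parts[1].str.strip()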
(2) Tokenization: split into words and convert to lowercase
# Lowercase + tokenize
# title holds the headline strings and dataraw the table they come from (loaded from the Excel file above)
import nltk

Lc_b = [nltk.word_tokenize(i) for i in nltk.sent_tokenize(title[0].lower())]  # lower(): convert to lowercase
for t in range(1, len(dataraw)):
    Lc = [nltk.word_tokenize(i) for i in nltk.sent_tokenize(title[t].lower())]
    Lc_b = Lc_b + Lc
An alternative way to tokenize:
# Tokenize, keeping one word list per headline
t = 0
tmp = title[t]
tmp1 = nltk.sent_tokenize(tmp)
w_0 = []
for i in tmp1:
    for j in nltk.word_tokenize(i):
        w_0.append(j)
t = 1
tmp = title[t]
tmp1 = nltk.sent_tokenize(tmp)
w_1 = []
for i in tmp1:
    for j in nltk.word_tokenize(i):
        w_1.append(j)
w = [w_0] + [w_1]
for t in range(2, len(title)):
    tmp = title[t]
    tmp1 = nltk.sent_tokenize(tmp)
    w_tmp = []
    for i in tmp1:
        for j in nltk.word_tokenize(i):
            w_tmp.append(j)
    w = w + [w_tmp]
(3) Removing punctuation and stop words
# Remove punctuation
import string

punctuation = set(string.punctuation)  # load the punctuation characters
punctuation.add("’")  # add "’" to the set so it is removed as well
tmp1 = []
tmp2 = []
for i in range(len(Lc_b)):
    tmp = Lc_b[i]
    for j in tmp:
        if j not in punctuation:
            tmp1.append(j)
    tmp2 = tmp2 + [tmp1]
    tmp1 = []
data = tmp2
# Remove stop words
stopwords = nltk.corpus.stopwords.words('english')
tmp1 = []
tmp2 = []
for i in range(len(data)):
    tmp = data[i]
    for j in tmp:
        if j not in stopwords:
            tmp1.append(j)
    tmp2 = tmp2 + [tmp1]
    tmp1 = []
data1 = tmp2
(4) Stemming and lemmatization
1) Fixing the resource error
Running the nltk code raises the following error:
Resource wordnet not found.
Please use the NLTK Downloader to obtain the resource:
import nltk
nltk.download('wordnet')
For more information see: https://www.nltk.org/data.html
Attempted to load corpora/wordnet
Searched in:
- 'C:\\Users\\Nan/nltk_data'
- 'D:\\python\\nltk_data'
- 'D:\\python\\share\\nltk_data'
- 'D:\\python\\lib\\nltk_data'
- 'C:\\Users\\Nan\\AppData\\Roaming\\nltk_data'
- 'C:\\nltk_data'
- 'D:\\nltk_data'
- 'E:\\nltk_data'
**********************************************************************
A once-and-for-all fix:
Download the NLTK data offline (nltk_data), then place the packages folder obtained after unzipping into any of the directories listed under "Searched in" above.
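If the machine does have internet access, the missing resource can also be fetched directly from Python, as the error message itself suggests; a minimal sketch (the download_dir below is just one of the searched paths listed above):
import nltk

# fetch the missing corpus; download_dir can be any of the "Searched in" directories
nltk.download('wordnet', download_dir=r'C:\nltk_data')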
2) Code
# Stemming
# from nltk.stem import SnowballStemmer
# stemmer = SnowballStemmer("english")  # choose a language
# stemmer.stem("countries")
# Lemmatization
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
tmp1 = []
tmp2 = []
for i in range(len(data1)):
    tmp = data1[i]
    for j in tmp:
        tmp1.append(wnl.lemmatize(j))
    tmp2 = tmp2 + [tmp1]
    tmp1 = []
data2 = tmp2
(5) Converting the word lists back into sentences
# Convert word lists back into sentences
List = []
items = ""
for i in range(len(data2)):
    x = data2[i]
    for item in x:
        items = items + " " + item
    List = List + [items]
    items = ""
List1 = pd.DataFrame(List)
data_out = pd.concat([dataraw, List1], axis=1)
data_out1 = data_out.rename(columns={0: 'Content'})
Col = ['Date', 'Content']
data_out2 = data_out1[Col]
data_out2.to_excel(r'E:\BaiduNetdiskWorkspace\N_S\Sentences.xlsx')
3. Computing polarity scores with the R sentimentr package
(1) Fixing missing-package errors
# Install every available package: very slow and very large, use with caution
availablePackages <- available.packages()[,1]
install.packages(availablePackages)
# Install a specific package together with its dependencies
install.packages("gdata", dependencies = TRUE)
install.packages("RODBC", destdir = "E:/r")
If library(qdap) hangs, press Enter once in the console; I am not sure why this works.
(2) Computing polarity scores with sentimentr
References:
https://github.com/trinker/sentimentr#examples
https://www.youtube.com/watch?v=eQU8Zd1B9tM
Code:
library(xlsx)
library(sentimentr)
library(tidyverse)
# Load all the data from Sentences.xlsx into data; data is a data frame, i.e. a table
data <- read.xlsx("E:\\BaiduNetdiskWorkspace\\N_S\\Sentences.xlsx", sheetIndex = 1, encoding = "UTF-8")
View(data)
data %>%
  get_sentences() %>%
  sentiment_by(by = c("Date")) -> deb_senti
# sentiment_by(by = c("Date")) %>% View
write.csv(deb_senti, file = "E:/BaiduNetdiskWorkspace/N_S/senti.csv")
4. TF-IDF
TF-IDF is the product of term frequency (TF) and inverse document frequency (IDF). Intuitively, a term that appears often is considered important; but if it appears in every document of the corpus, it probably carries little distinguishing information.
Term frequency (TF) counts how often a term occurs in a document. Longer documents naturally produce higher raw counts, so the count is normalized by the length of the document:
TF = (term occurrences in a document) / (total unique terms in the document)
Inverse document frequency (IDF):
IDF = log(total documents in the corpus / number of documents containing term t)
TF-IDF = TF * IDF
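A minimal Python sketch of these formulas, applied to the cleaned token lists from step 2 (data2); the function name tf_idf and the example documents are illustrative only:
import math
from collections import Counter

def tf_idf(docs):
    """docs: a list of token lists, e.g. data2 from step 2."""
    n_docs = len(docs)
    # document frequency: in how many documents each term appears
    df = Counter()
    for doc in docs:
        df.update(set(doc))
    scores = []
    for doc in docs:
        counts = Counter(doc)
        n_unique = len(counts)  # "total unique terms in the document"
        scores.append({term: (occ / n_unique) * math.log(n_docs / df[term])
                       for term, occ in counts.items()})
    return scores

# Example: 'oil' appears in both documents, so its IDF is log(2/2) = 0 and its score is 0.
# tf_idf([['oil', 'price', 'rises'], ['oil', 'demand', 'falls']])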