Scraping JD.com with Python
Task steps
- Crawl the product listing page
- Filter out the information we need
Environment/libraries: the Python standard library, BeautifulSoup, Selenium, pyecharts (the code below also uses requests, lxml and jieba)
Tools: Chrome, Jupyter Notebook
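The third-party packages used below can be installed in one step, e.g. with: pip install requests beautifulsoup4 lxml selenium jieba pyecharts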
Code implementation
Task 1: a simple scrape of basic JD product information
import requests
from bs4 import BeautifulSoup

def getDATA(url):
    # User-Agent: the browser identification string, the first layer of identity sent with a request.
    # A single User-Agent that keeps hitting the site is easy to flag as a crawler, so in practice a
    # pool of User-Agent strings is rotated at random instead of reusing one request header for long
    # (see the sketch after this code block).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'}
    source_file = r"C://**"   # local file the raw page source is saved to
    try:
        params = {
            'keyword': 'pro'   # search keyword
        }
        r = requests.get(url=url, headers=headers, params=params)
        # url (request URL, required), headers (request headers, optional), params (query parameters, optional),
        # proxies (proxy IPs, optional), verify (SSL certificate verification, optional)
        r.raise_for_status()   # raise an exception if the HTTP status indicates a failure
        r.encoding = 'utf-8'   # character encoding
        # print(r.text)
        with open(source_file, 'w', encoding='utf-8') as f1:
            f1.write(r.text)   # save the page source
    except Exception as np:
        print(np)
    # r.raise_for_status() is what checks the connection: if the request failed it raises an exception,
    # the try/except catches it, and the exception is printed as np.
def extractDATA():
    source_file = r"C://**/*"     # the page source saved by getDATA()
    compile_file = r"C://***/**"  # output file for the extracted fields
    soup = BeautifulSoup(open(source_file, 'r', encoding='utf-8'), 'lxml')
    # print(soup.title)
    test = soup.select("ul[class='gl-warp clearfix'] li")   # one <li> per product in the listing
    # print(test)
    with open(compile_file, 'w', encoding='utf-8') as f2:
        for x in test:
            w = x.select("div[class='p-price']")[0].get_text().strip()               # price
            a = x.select("div[class='p-name p-name-type-2']")[0].get_text().strip()  # product name
            y = x.select("div[class='p-shop']")[0].get_text().strip()                # shop
            k = x.a.attrs['href'].strip()                                            # product link
            s = '名称:' + a + '\n价格:' + w + '\n店铺:' + y + '\n链接:' + 'http:' + k + '\n----------------------------'
            print(s, '\n')
            f2.write(s)
def main():
    url = ''
    getDATA(url)       # fetch the page and save it locally
    extractDATA()      # parse the saved page and extract the fields
if __name__ == '__main__':
    main()
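The comment in getDATA() mentions rotating several User-Agent strings instead of reusing one. A minimal sketch of that idea follows; the UA_POOL values and the pickUA helper are illustrative, not part of the original code:

import random

# hypothetical pool of User-Agent strings; fill in real browser UA values as needed
UA_POOL = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15',
]

def pickUA():
    # pick a different User-Agent for each request so one header is not reused for long
    return {'User-Agent': random.choice(UA_POOL)}

# usage inside getDATA():
# r = requests.get(url=url, headers=pickUA(), params=params)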
Regular-expression version (same page and fields as above, matched on the CSS class markup)
import re

def getData():
    pass   # the page source is already saved by Task 1

def extractData(html_path, result_list):
    with open(html_path, 'r', encoding='UTF-8') as f:
        html = f.read()
    # print(html)
    # price: the text inside <i> within the p-price div
    r_price = r'<div class="p-price">.*?<i.*?>(.*?)</i>.*?</div>'
    price_list = re.findall(r_price, html, re.S)
    print(price_list)
    # product link: the href of the <a> inside the p-name div
    r_url = r'<div class="p-name.*?"><a.*?href="(.*?)">.*?</div>'
    url_list = re.findall(r_url, html, re.S)
    url_list = ["https:" + url for url in url_list]
    print(url_list)
    # product description: the <em> text inside the p-name div, with any remaining tags stripped
    r_desc = r'<div class="p-name.*?">.*?<em>(.*?)</em>.*?</div>'
    desc_list = re.findall(r_desc, html, re.S)
    r_sub = r'<.*?>'
    desc_list = [re.sub(r_sub, '', desc, flags=re.S) for desc in desc_list]   # flags must be a keyword here; see the note after this block
    print(desc_list)
    dengxian = open("C://**", 'w', encoding="utf-8")
    for item in zip(price_list, url_list, desc_list):
        print(item)
        print(item, file=dengxian)
    dengxian.close()

def saveResults():
    pass

def main():
    html_path = r'source_file'   # path of the page source saved in Task 1
    r_list = list()
    # getData
    extractData(html_path, r_list)
if __name__ == '__main__':
    main()
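One detail worth calling out: re.sub's fourth positional parameter is count, not flags, so the re.S flag has to be passed by keyword. A quick illustration:

import re

text = "<p>first</p>\n<p>second</p>"
# correct: pass the flag by keyword so it is not treated as a substitution count
print(re.sub(r'<.*?>', '', text, flags=re.S))
# re.sub(r'<.*?>', '', text, re.S) would instead be read as count=16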
Task 2: scraping JD comments with browser automation
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def scrollDown(driver, step=10):
    # scroll down in small steps so lazily loaded content gets rendered
    total_height = int(driver.execute_script("return document.body.scrollHeight"))
    for i in range(1, total_height - 1200, step):
        driver.execute_script("window.scrollTo(0, {});".format(i))
        time.sleep(0.005)

def getComments(url):
    driver = webdriver.Chrome()   # use Chrome
    driver.get(url)               # request the product page
    driver.maximize_window()      # maximise the window
    time.sleep(3)                 # wait for the page to load (an explicit-wait alternative is sketched after this block)
    comment_btn = driver.find_element(By.XPATH, '//li[@data-anchor="#comment"]')
    # the tab we need to click (the comment tab)
    comment_btn.click()           # switch to the comment tab
    time.sleep(3)
    scrollDown(driver)
    with open(r'./***', 'a+', encoding='utf-8') as f:
        for i in range(1, 5):     # number of comment pages to walk through
            comments = driver.find_elements(By.CLASS_NAME, 'comment-item')
            for comment in comments:
                user_info = comment.find_element(By.CLASS_NAME, 'user-info').text
                text_info = comment.find_element(By.CLASS_NAME, 'comment-con').text
                star_info = comment.find_element(By.XPATH, './/div[@class="comment-column J-comment-column"]/div').get_attribute('class').strip()[-5:]
                days_info = comment.find_element(By.XPATH, './/div[@class="order-info"]/span[last()]').text
                xian = comment.find_element(By.CLASS_NAME, 'user-level').text
                if xian == "PLUS会员":
                    print('\n' "---------------" '\n名称', user_info, '\n会员状态:', xian, '\n评论', text_info, '\n星级', star_info, '\n评论时间', days_info)
                    f.write('\n' "---------------" '\n名称:' + user_info + '\n会员状态:' + xian + '\n评论:' + text_info + '\n星级:' + star_info + '\n评论时间:' + days_info)
                else:
                    print('\n' "---------------" '\n名称', user_info, '\n会员状态:', "普通会员", '\n评论', text_info, '\n星级', star_info, '\n评论时间', days_info)
                    f.write('\n' "---------------" '\n名称:' + user_info + '\n会员状态:' + "普通会员" + '\n评论:' + text_info + '\n星级:' + star_info + '\n评论时间:' + days_info)
            next_btn = driver.find_element(By.XPATH, './/a[@class="ui-pager-next"]')
            # the "next page" button
            next_btn.click()
            time.sleep(3)
            scrollDown(driver)

def main():
    url = r'https://item.jd.com/100021707422.html'
    getComments(url)
if __name__ == '__main__':
    main()
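The fixed time.sleep(3) pauses work, but they waste time when the page loads quickly and can fail when it loads slowly. If you prefer, Selenium's explicit waits can replace them; a minimal sketch, assuming the same comment-tab XPath as above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def waitAndClick(driver, xpath, timeout=10):
    # wait (up to `timeout` seconds) until the element is clickable, then click it
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )
    element.click()

# usage inside getComments(), instead of time.sleep(3) followed by click():
# waitAndClick(driver, '//li[@data-anchor="#comment"]')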
Task 3: generating a word cloud
# jieba is used to segment the data; the data source is the comment file saved in Task 2, and pyecharts renders the cloud
from pyecharts.charts import WordCloud
import jieba
import pyecharts.options as opts
# from wordcloud import WordCloud              # only needed for the commented-out image version further down
# from selenium.webdriver.common.by import By  # not needed in this task
def getWord(word_list):   # word_list: the raw comment text read in main()
    txt = word_list
    re_move = [",", "。", " ", '\n', '\xa0', ':', '*', '/d']   # tokens treated as noise
    for i in re_move:
        txt = txt.replace(i, " ")   # if the noise token is present, replace it with a space
    word_lists = jieba.lcut(txt)    # segment the text with jieba's precise mode
    getWordFreq(word_lists, word_count={})
    # word_count stores each word and its number of occurrences as key/value pairs
def getWordFreq(word_list, word_count):
    # readlines() reads every line of the stopword file (until EOF); strip() removes surrounding whitespace
    stopwords = [line.strip() for line in open('A:\\1\评论1.txt', encoding='utf-8').readlines()]
    # sjstopwords = [line.strip() for line in open('B:\\3206574001\FileRecv\hit_stopwords.txt', encoding='utf-8').readlines()]
    for word in word_list:
        if word in stopwords:
            continue   # skip stopwords
        if len(word) == 1:
            continue   # single-character tokens are not counted
        word_count[word] = word_count.get(word, 0) + 1   # each occurrence adds 1 to the word's count
    # (a Counter-based version of this loop is sketched after this task)
    items = list(word_count.items())
    items.sort(key=lambda x: x[1], reverse=True)   # sort by frequency, largest first
    print(items)
    drawWordCLoud(word_count)
def drawWordCLoud(word_count):
    wCloud = WordCloud()
    # --- alternative: the `wordcloud` package can render the cloud as an image instead ---
    # color_mask = imread("bg.png")              # bg.png: background image used as the mask
    # d = path.dirname(__file__)                 # directory of the current file
    # cloud = WordCloud(
    #     font_path="STXINGKA.ttf",              # 华文行楷 font
    #     font_path=path.join(d, 'simsun.ttc'),  # or 宋体 (SimSun)
    #     background_color='white',              # background colour
    #     mask=color_mask,                       # shape of the cloud
    #     max_words=2000,                        # maximum number of words
    #     max_font_size=400,                     # largest font size; defaults to the image height
    #     # width=..., height=...,               # canvas width and height; ignored when a mask is set
    #     prefer_horizontal=0.8                  # fraction of words laid out horizontally (default 0.9)
    # )
    # word_cloud = cloud.generate(cut_text)      # build the cloud from the segmented text
    wCloud.set_global_opts(title_opts=opts.TitleOpts(title="高频评论词"))   # chart title
    wCloud.add(
        series_name='评论',
        data_pair=list(word_count.items()),
        shape='pentagon'
    )
    wCloud.render(r'wordCloud.html')   # render the chart to an HTML file
    print("已成功构建 wordCloud.html 文件,请访问它以查看最终效果。")
def main():
    # open the comment file saved in Task 2 and read all of it
    # (read() with no argument, or a negative one, reads the whole file)
    with open('C:/**', encoding="utf-8") as word_file:
        word_list = word_file.read()
    # print(word_list)
    # word_count = {}
    getWord(word_list)   # segment, count and draw
    # getWordFreq(word_list, word_count)
if __name__ == '__main__':
    main()
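The manual counting loop in getWordFreq() can also be written with collections.Counter from the standard library; a minimal sketch under the same assumptions (a list of segmented words plus a stopword list):

from collections import Counter

def countWords(word_list, stopwords):
    # keep only multi-character words that are not stopwords, then count them
    kept = [w for w in word_list if len(w) > 1 and w not in stopwords]
    word_count = Counter(kept)
    # most_common() already returns (word, count) pairs sorted by frequency
    print(word_count.most_common())
    return dict(word_count)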
If anything here infringes your rights, please contact me and it will be removed.