什么是 selenium
- selenium 是一个用于 web应用程序测试的工具
- selenium 可以直接操作浏览器,就像真正的用户在操作一样
- selenium 支持多种编程语言 (java python javascript c# ruby),进行测试脚本的编写
- selenium 是一个工具集,有了这个工具集,我们可以顺利地开展自动化测试
selenium支持的平台和浏览器
- Google Chrome
- Firefox
- Internet Explorer 7,8,9,10 and 11
- Safari
- Opera
- HtmlUnit
- phantomjs
- Android(with Selendroid or appium)
- ios (with ios-driver or appium)
selenium安装
- pip3 install selenium
selenium 使用
- 安装好之后,引用 from selenium import webdriver
- 需要下载与本机浏览器版本号相对应的浏览器驱动(chromedriver),下载地址:http://npm.taobao.org/mirrors/chromedriver/
- 获取驱动文件的本地路径:mac 下,直接把文件拖到 iTerm(终端)中,即可获得。
- 如果 mac 下提示“无法打开“chromedriver”,因为无法验证开发者”,需要先双击打开该文件,然后在“安全性与隐私”设置中允许即可。
- print(sel.xpath("//span[@class='price J-p-11410079235']/text()").extract_first()):之前用 .extract()[0],如果没有值会抛出异常,而 extract_first() 则不会(没有值时返回 None)。
- browser.close() 关闭当前浏览器窗口;browser.quit() 则关闭所有窗口并结束驱动(chromedriver)进程
import time
from selenium import webdriver
from scrapy import Selector
# Drive a real Chrome instance through webdriver. executable_path points at
# the locally downloaded chromedriver binary.
chrome_browser = webdriver.Chrome(executable_path="/Users/zhangqiang/Desktop/05-python/03-spider-learn/spider/csdn_spider/chromedriver")
# Other browsers are started the same way, e.g. webdriver.Firefox() or
# webdriver.Ie().
try:
    chrome_browser.get("https://item.jd.com/11410079235.html#comment")
    # page_source is the HTML produced after the page's JavaScript has run,
    # so it can be parsed with a scrapy Selector.
    sel = Selector(text=chrome_browser.page_source)
    # extract_first() is used instead of extract()[0] so that a missing node
    # does not raise.
    print(sel.xpath("//span[@class='price J-p-11410079235']/text()").extract_first())
finally:
    # Release the browser even when loading or parsing fails. quit() ends
    # the whole driver session, whereas close() only closes the current
    # window.
    chrome_browser.quit()
selenium点击元素,可能会抛异常,需要做异常处理
- .find_element_by_xpath
- .click()
import time
from selenium import webdriver
from scrapy import Selector
from selenium.common.exceptions import NoSuchElementException
# Drive a real Chrome instance through webdriver.
chrome_browser = webdriver.Chrome(
    executable_path="/Users/zhangqiang/Desktop/05-python/03-spider-learn/spider/csdn_spider/chromedriver")
try:
    chrome_browser.get("https://item.jd.com/11410079235.html#comment")
    try:
        # Locating the element raises NoSuchElementException when it is not
        # on the page, so the click is wrapped in exception handling.
        click_ele = chrome_browser.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_2']")
        click_ele.click()
    except NoSuchElementException:
        # Best effort: carry on with the page as it is when the tab is absent.
        pass
    sel = Selector(text=chrome_browser.page_source)
    print(sel.xpath("//span[@class='price J-p-11410079235']/text()").extract_first())
finally:
    # Release the browser even when loading or parsing fails. quit() ends
    # the whole driver session, whereas close() only closes the current
    # window.
    chrome_browser.quit()
9-7 通过 selenium 解析jd商品详情页 https://item.jd.com/11410079235.html#comment
# +--------------------------
# | User: zq -
# | Version: python3.7 -
# | Time: 2020-03-14 14:08
# +--------------------------
import re
import time
import json
from datetime import datetime
from selenium import webdriver
from scrapy import Selector
from jd_spider.models import *
from selenium.common.exceptions import NoSuchElementException
# Module-level Chrome driver that parse_good() uses to load and click through
# product pages. The path is the locally downloaded chromedriver binary.
_CHROMEDRIVER_PATH = "/Users/zhangqiang/Desktop/05-python/03-spider-learn/spider/csdn_spider/chromedriver"
brower = webdriver.Chrome(executable_path=_CHROMEDRIVER_PATH)
def process_value(nums_str):
    """Convert a count string such as "300+" or "1.2万+" into an integer.

    :param nums_str: text containing a number; it may carry the "万"
        (ten thousand) unit and a fractional part, e.g. "1.2万+"
    :return: the parsed count, or 0 when the text contains no digits
    """
    re_match = re.search(r"(\d+)(?:\.(\d+))?", nums_str)
    if not re_match:
        return 0
    nums = int(re_match.group(1))
    if "万" in nums_str:
        # Scale with integer arithmetic so "1.2万" becomes exactly 12000:
        # the fractional digits are padded/truncated to four places, which
        # avoids float rounding. Without the unit the fraction is ignored.
        fraction = re_match.group(2) or ""
        nums = nums * 10000 + int((fraction + "0000")[:4])
    return nums
# Maps the label of each comment filter link to the Good attribute that
# stores its count.
# NOTE(review): "视频晒单" previously wrote to has_add_comment_nums, the same
# attribute as "追评", so one count clobbered the other. It now targets
# has_video_comment_nums -- confirm the Good model declares that field.
_SUMMARY_FIELDS = {
    "晒图": "has_image_comment_nums",
    "视频晒单": "has_video_comment_nums",
    "追评": "has_add_comment_nums",
    "好评": "well_comment_nums",
    "中评": "middle_comment_nums",
    "差评": "bad_comment_nums",
    "全部评价": "comments_nums",
}


def _save_row(instance, existing_rows):
    """Save *instance*, forcing an INSERT when *existing_rows* is empty."""
    # The primary key is assigned by hand, so a new row has to be saved with
    # force_insert=True; presumably a plain save() would issue an UPDATE --
    # confirm against the ORM's documentation.
    if existing_rows:
        instance.save()
    else:
        instance.save(force_insert=True)


def _build_evaluate(item, good):
    """Build a GoodEvaluate from one comment node; None if it has no id."""
    evaluate_id = item.xpath("./@data-guid").extract_first()
    if evaluate_id is None:
        # Without data-guid there is no primary key to store the row under.
        return None
    good_evaluate = GoodEvaluate(good=good)
    good_evaluate.id = evaluate_id
    good_evaluate.user_head_url = item.xpath(".//div[@class='user-info']//img/@src").extract_first("")
    good_evaluate.user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()
    # The star rating is the last character of the class attribute.
    star_class = item.xpath("./div[2]/div[1]/@class").extract_first("")
    good_evaluate.star = int(star_class[-1]) if star_class[-1:].isdigit() else 0
    good_evaluate.content = "".join(item.xpath("./div[2]/p[1]/text()").extract()).strip()
    image_list = item.xpath("./div[2]//div[@class='pic-list J-pic-list']//img/@src").extract()
    video_list = item.xpath("./div[2]//div[@class='J-video-view-wrap clearfix']//video/@src").extract()
    good_evaluate.image_list = json.dumps(image_list)
    good_evaluate.video_list = json.dumps(video_list)
    # process_value() yields 0 for missing or non-numeric text instead of
    # raising the way int() would.
    good_evaluate.praised_nums = process_value(
        item.xpath(".//div[@class='comment-op']/a[2]/text()").extract_first(""))
    good_evaluate.comment_nums = process_value(
        item.xpath(".//div[@class='comment-op']/a[3]/text()").extract_first(""))
    # The last order-info span is the comment time; the spans before it are
    # stored as the order information.
    comment_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
    good_evaluate.good_info = json.dumps(comment_info[:-1])
    good_evaluate.evaluate_time = datetime.strptime(comment_info[-1], '%Y-%m-%d %H:%M')
    return good_evaluate


def parse_good(good_id):
    """Load one JD product page and persist the product, its tags and comments.

    The page is driven through the module-level ``brower`` webdriver: the
    product details are read first, then the spec tab and the comment tab are
    clicked, and finally the comment pages are walked until there is no
    "next page" link.

    :param good_id: JD product id, used in the page URL and as Good.id
    :return: None; results are written through the Good, GoodEvaluateSummary
        and GoodEvaluate models
    """
    brower.get('https://item.jd.com/{}.html'.format(good_id))
    sel = Selector(text=brower.page_source)

    # Basic product information.
    good = Good(id=good_id)
    good.name = "".join(sel.xpath("//div[@class='sku-name']/text()").extract()).strip()
    price_text = "".join(sel.xpath("//span[@class='price J-p-{}']/text()".format(good_id)).extract()).strip()
    try:
        good.price = float(price_text)
    except ValueError:
        # The price node was missing or not numeric; store 0.0 rather than
        # aborting the whole crawl.
        good.price = 0.0
    good.content = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract())
    good_images = sel.xpath("//div[@id='spec-list']//img/@src").extract()
    good.image_list = json.dumps(good_images)  # store the list as a JSON string

    # Click the spec tab, located by its label text, then re-read the page.
    try:
        ggbz_ele = brower.find_element_by_xpath("//div[@class='tab-main large']//li[contains(text(), '规格与包装')]")
        ggbz_ele.click()
    except NoSuchElementException:
        # No such tab on this page: leave good.ggbz unset.
        pass
    else:
        time.sleep(3)
        sel = Selector(text=brower.page_source)
        good.ggbz = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract())

    # Click the comment tab so the comment section is part of the page source.
    try:
        sppj_ele = brower.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_2']")
        sppj_ele.click()
        time.sleep(3)
    except NoSuchElementException:
        pass
    sel = Selector(text=brower.page_source)

    tag_list = sel.xpath("//div[@class='tag-list tag-available']/span/text()").extract()
    good_rate = process_value(sel.xpath("//div[@class='percent-con']/text()").extract_first(""))
    # NOTE(review): "good_rage" looks like a typo for "good_rate"; kept as in
    # the original because the Good model is not visible here -- confirm.
    good.good_rage = good_rate

    for summary in sel.xpath("//ul[@class='filter-list']/li/a"):
        name = summary.xpath("./text()").extract_first("")
        nums = process_value(summary.xpath("./em/text()").extract_first(""))
        field = _SUMMARY_FIELDS.get(name)
        if field is not None:
            setattr(good, field, nums)

    # Save the product.
    _save_row(good, Good.select().where(Good.id == good_id))

    # Save the comment tags; each tag text looks like "name(count)".
    for tag in tag_list:
        re_match = re.match(r"(.*?)\((\d+)\)", tag)
        if not re_match:
            continue
        tag_name = re_match.group(1)
        nums = int(re_match.group(2))
        existed_summarys = GoodEvaluateSummary.select().where(GoodEvaluateSummary.good == good,
                                                              GoodEvaluateSummary.tag == tag_name)
        # Reuse the row for this product/tag pair when there is one. The
        # check must be on the summary query, not on the product query.
        if existed_summarys:
            summary = existed_summarys[0]
        else:
            summary = GoodEvaluateSummary(good=good)
        summary.tag = tag_name
        summary.num = nums
        summary.save()

    # Walk the comment pages until there is no "next page" link.
    while True:
        for item in sel.xpath("//div[@class='comment-item']"):
            good_evaluate = _build_evaluate(item, good)
            if good_evaluate is None:
                continue
            _save_row(good_evaluate,
                      GoodEvaluate.select().where(GoodEvaluate.id == good_evaluate.id))
        try:
            next_page_ele = brower.find_element_by_xpath("//div[@id='comment']//a[@class='ui-pager-next']")
        except NoSuchElementException:
            break
        # Sending Enter activates the link; used here in place of click().
        next_page_ele.send_keys("\n")
        time.sleep(5)
        sel = Selector(text=brower.page_source)
if __name__ == '__main__':
    try:
        parse_good(11410079235)
    finally:
        # Shut down the module-level Chrome session whether or not the crawl
        # succeeded, so no browser/chromedriver process is left running.
        brower.quit()