selenium 的介绍和爬取 jd数据

什么是 selenium

  1. selenium 是一个用于 web应用程序测试的工具
  2. selenium 可以直接操作浏览器,就像真正的用户在操作一样
  3. selenium 支持多种编程语言 (java python javascript c# ruby),进行测试脚本的编写
  4. selenium 是一个工具集,有了这个工具集,我们可以顺利的开展自动化测试

selenium支持的平台和浏览器

  1. Google Chrome
  2. Firefox
  3. Internet Explorer 7,8,9,10 and 11
  4. Safari
  5. Opera
  6. HtmlUnit
  7. phantomjs
  8. Android(with Selendroid or appium)
  9. ios (with ios-driver or appium)

selenium安装

  1. pip3 install selenium

selenium 使用

  1. 安装好之后,引用 from selenium import webdriver 
  2. 需要下载相对应的 浏览器的 驱动 http://npm.taobao.org/mirrors/chromedriver/, 对应浏览器的版本号下载
  3. 获取驱动的本地路径:mac 下直接把 chromedriver 文件拖到终端(如 iTerm)窗口中,即可得到其完整路径。
  4. 如果 mac 下提示“无法打开 chromedriver,因为无法验证开发者”,需要先双击打开一次,然后在“系统偏好设置 → 安全性与隐私”中点击允许即可。
  5. print(sel.xpath("//div[@class='price J-p-11410079235']/text()").extract_first()) 之前用 .extract()[0],如果没有值会抛出异常,而 extract_first则不会。
  6. browser.close() 关闭浏览器
import time
from selenium import webdriver
from scrapy import Selector

# Drive a real Chrome instance via Selenium (a Firefox or IE driver
# would be created the same way).
browser = webdriver.Chrome(executable_path="/Users/zhangqiang/Desktop/05-python/03-spider-learn/spider/csdn_spider/chromedriver")
browser.get("https://item.jd.com/11410079235.html#comment")

# page_source is the HTML *after* the page's JavaScript has executed,
# so scrapy's Selector queries the rendered DOM.
selector = Selector(text=browser.page_source)
# extract_first() yields None on no match instead of raising like extract()[0].
print(selector.xpath("//span[@class='price J-p-11410079235']/text()").extract_first())

# Shut the browser window down again.
browser.close()

selenium点击元素,可能会抛异常,需要做异常处理

  1. .find_element_by_xpath
  2. .click()
import time
from selenium import webdriver
from scrapy import Selector
from selenium.common.exceptions import NoSuchElementException

# Open the product page in a Selenium-driven Chrome instance.
browser = webdriver.Chrome(
    executable_path="/Users/zhangqiang/Desktop/05-python/03-spider-learn/spider/csdn_spider/chromedriver")
browser.get("https://item.jd.com/11410079235.html#comment")

# Click the product-evaluation tab; the element may be absent,
# so guard the lookup against NoSuchElementException.
try:
    browser.find_element_by_xpath(
        "//li[@clstag='shangpin|keycount|product|shangpinpingjia_2']").click()
except NoSuchElementException:
    pass

selector = Selector(text=browser.page_source)
print(selector.xpath("//span[@class='price J-p-11410079235']/text()").extract_first())

# Close the browser window.
browser.close()

9-7 通过 selenium 解析jd商品详情页 https://item.jd.com/11410079235.html#comment

# +--------------------------
# | User: zq                -
# | Version: python3.7      -
# | Time: 2020-03-14 14:08                
# +--------------------------
import re
import time
import json
from datetime import datetime
from selenium import webdriver
from scrapy import Selector
from jd_spider.models import *
from selenium.common.exceptions import NoSuchElementException

# Module-level Chrome instance shared by parse_good().
# NOTE(review): the name is spelled "brower" (sic) and is referenced below.
brower = webdriver.Chrome(
    executable_path="/Users/zhangqiang/Desktop/05-python/03-spider-learn/spider/csdn_spider/chromedriver")


def process_value(nums_str):
    """Convert a count string scraped from JD into an integer.

    The string may contain a (possibly decimal) number and the suffix
    "万" (= 10,000), e.g. "200" -> 200, "2万" -> 20000, "1.5万" -> 15000.

    :param nums_str: string that may contain a number and "万"
    :return: the parsed count, or 0 when no number is present
    """
    # Raw string avoids the invalid-escape warning; the optional decimal
    # part fixes "1.5万" being read as 1 -> 10000 instead of 15000.
    re_math = re.search(r"(\d+(?:\.\d+)?)", nums_str)
    if not re_math:
        return 0
    nums = float(re_math.group(1))
    if "万" in nums_str:
        nums *= 10000
    return int(nums)


def parse_good(good_id):
    """Scrape a JD product detail page and persist the product, its
    evaluation-tag summary and every page of comments via the peewee models.

    :param good_id: numeric JD sku id, e.g. 11410079235
    """
    brower.get('https://item.jd.com/{}.html'.format(good_id))

    sel = Selector(text=brower.page_source)

    # --- basic product fields -------------------------------------------
    good = Good(id=good_id)
    name = "".join(sel.xpath("//div[@class='sku-name']/text()").extract()).strip()
    price = float("".join(sel.xpath("//span[@class='price J-p-{}']/text()".format(good_id)).extract()).strip())
    detail = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract())
    good_images = sel.xpath("//div[@id='spec-list']//img/@src").extract()
    supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())

    good.name = name
    good.price = price
    good.content = detail
    good.image_list = json.dumps(good_images)  # store the image list as a JSON string

    # Click the "spec & packaging" tab (located by its Chinese label) and
    # re-read the rendered page to capture that tab's content.
    ggbz_ele = brower.find_element_by_xpath("//div[@class='tab-main large']//li[contains(text(), '规格与包装')]")
    ggbz_ele.click()
    time.sleep(3)  # give the tab's ajax content time to render
    sel = Selector(text=brower.page_source)
    ggbz_detail = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract())
    good.ggbz = ggbz_detail

    # Click the "product evaluations" tab; it may be absent on some pages.
    try:
        sppj_ele = brower.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_2']")
        sppj_ele.click()
        time.sleep(3)
    except NoSuchElementException:
        pass

    sel = Selector(text=brower.page_source)
    tag_list = sel.xpath("//div[@class='tag-list tag-available']/span/text()").extract()
    good_rate = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])
    # NOTE(review): model field is spelled "good_rage" — confirm intended name.
    good.good_rage = good_rate

    # Per-category comment counters (晒图/视频晒单/追评/好评/中评/差评/全部评价).
    summary_as = sel.xpath("//ul[@class='filter-list']/li/a")
    for summary in summary_as:
        name = summary.xpath("./text()").extract()[0]
        nums = process_value(summary.xpath("./em/text()").extract()[0])

        if name == "晒图":
            good.has_image_comment_nums = nums
        elif name == "视频晒单":
            # NOTE(review): both 视频晒单 and 追评 write has_add_comment_nums;
            # 视频晒单 probably deserves its own field — confirm against the model.
            good.has_add_comment_nums = nums
        elif name == "追评":
            good.has_add_comment_nums = nums
        elif name == "好评":
            good.well_comment_nums = nums
        elif name == "中评":
            good.middle_comment_nums = nums
        elif name == "差评":
            good.bad_comment_nums = nums
        elif name == "全部评价":
            good.comments_nums = nums

    # Upsert the product row (peewee needs force_insert for a new primary key).
    existed_good = Good.select().where(Good.id == good_id)
    if existed_good:
        good.save()
    else:
        good.save(force_insert=True)

    # Tag summaries look like "质量不错(100)" — split into name and count.
    for tag in tag_list:
        re_match = re.match(r"(.*?)\((\d+)\)", tag)
        if re_match:
            tag_name = re_match.group(1)
            nums = int(re_match.group(2))

            existed_summarys = GoodEvaluateSummary.select().where(GoodEvaluateSummary.good == good,
                                                                  GoodEvaluateSummary.tag == tag_name)
            # BUGFIX: the original tested `existed_good` here, so once the
            # good row existed it always took existed_summarys[0] even for
            # tags with no row yet (IndexError). Test the summary query itself.
            if existed_summarys:
                summary = existed_summarys[0]
            else:
                summary = GoodEvaluateSummary(good=good)

            summary.tag = tag_name
            summary.num = nums
            summary.save()

    # Walk every page of comments until the "next page" control disappears.
    has_next_page = True
    while has_next_page:
        all_evalutes = sel.xpath("//div[@class='comment-item']")
        for item in all_evalutes:
            good_evaluate = GoodEvaluate(good=good)

            good_evaluate.id = item.xpath("./@data-guid").extract()[0]
            good_evaluate.user_head_url = item.xpath(".//div[@class='user-info']//img/@src").extract()[0]
            good_evaluate.user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()

            # The star rating is the last character of the css class,
            # e.g. "...star5" -> 5.
            star = item.xpath("./div[2]/div[1]/@class").extract()[0]
            good_evaluate.star = int(star[-1])

            good_evaluate.content = "".join(item.xpath("./div[2]/p[1]/text()").extract()).strip()

            image_list = item.xpath("./div[2]//div[@class='pic-list J-pic-list']//img/@src").extract()
            video_list = item.xpath("./div[2]//div[@class='J-video-view-wrap clearfix']//video/@src").extract()
            good_evaluate.image_list = json.dumps(image_list)
            good_evaluate.video_list = json.dumps(video_list)

            good_evaluate.praised_nums = int(item.xpath(".//div[@class='comment-op']/a[2]/text()").extract()[0])
            good_evaluate.comment_nums = int(item.xpath(".//div[@class='comment-op']/a[3]/text()").extract()[0])

            # order-info spans: all but the last are order metadata; the
            # last span is the evaluation timestamp.
            comment_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
            good_evaluate.good_info = json.dumps(comment_info[:-1])
            good_evaluate.evaluate_time = datetime.strptime(comment_info[-1], '%Y-%m-%d %H:%M')

            # Upsert the evaluation row by its guid primary key.
            existed_good_evaluates = GoodEvaluate.select().where(GoodEvaluate.id == good_evaluate.id)
            if existed_good_evaluates:
                good_evaluate.save()
            else:
                good_evaluate.save(force_insert=True)

        try:
            next_page_ele = brower.find_element_by_xpath("//div[@id='comment']//a[@class='ui-pager-next']")
            next_page_ele.send_keys("\n")  # sending Enter acts like a click
            time.sleep(5)
            sel = Selector(text=brower.page_source)
        except NoSuchElementException:
            has_next_page = False


if __name__ == '__main__':
    # Example run against one fixed JD sku id.
    parse_good(11410079235)

 

 

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值