Selenium的使用

Selenium

Selenium是一个Web的自动化测试工具,最初是为网站自动化测试而开发的,类似于我们玩游戏用的按键精灵,可以按指定的命令自动操作,不同的是Selenium可以直接运行在浏览器上,它支持所有主流的浏览器(包括PhantomJS这些无界面的浏览器)。

Selenium 可以根据我们的指令,让浏览器自动加载页面,获取需要的数据,甚至页面截屏,或者判断网站上某些动作是否发生。

命令安装:pip install selenium

chromedriver 驱动下载https://npm.taobao.org/mirrors/chromedriver/

我们以煎蛋网为例,煎蛋网妹子图做的反爬比较好,我们通过浏览器模拟爬取内容

代码如下

# 注意: 使用selenium 发请求需要注意加载延迟

from selenium import webdriver
import time
from lxml import etree
import requests
from urllib import request
# Create a Chrome browser instance driven by a local chromedriver binary.
# NOTE(review): hard-coded Windows path — the driver must exist at this location.
phantom = webdriver.Chrome(executable_path=r'C:\Users\cz\Desktop\chromedriver.exe')

# Landing page for the crawl: jandan.net "ooxx" picture board.
base_url = 'http://jandan.net/ooxx'




def parsePage(html):
    """Parse one jandan.net listing page and download every linked .jpg.

    Args:
        html: raw page source string, as returned by ``driver.page_source``.

    Side effects:
        Writes each image into the local ``images/`` directory (created if
        missing) and prints the scraped URL list.
    """
    import os  # local import keeps the fix self-contained

    tree = etree.HTML(html)
    img_urls = tree.xpath('//div[@class="text"]//img/@src')
    print(img_urls)
    # Fix: the original crashed with FileNotFoundError when images/ was absent.
    os.makedirs('images', exist_ok=True)
    for img in img_urls:
        if 'jpg' not in img:
            continue
        # jandan appears to serve protocol-relative URLs ("//wx1.sinaimg.cn/...");
        # urlretrieve needs a scheme — TODO confirm against the live markup.
        if img.startswith('//'):
            img = 'http:' + img
        fname = img.split('/')[-1]
        request.urlretrieve(img, os.path.join('images', fname))


def getPage():
    """Page through jandan's picture board, parsing every page.

    Loads ``base_url`` in the module-level ``phantom`` driver, then keeps
    clicking the "Older Comments" link until it is gone, feeding each page's
    source to ``parsePage``.
    """
    # First page: request, wait for the JS-rendered content, then parse.
    phantom.get(base_url)
    time.sleep(1)

    parsePage(phantom.page_source)
    while True:
        # Fix: the original clicked unconditionally, so on the last page
        # find_element raised (or the loop ran forever). Locate the link
        # first and stop cleanly when it no longer exists.
        try:
            older_link = phantom.find_element_by_xpath('//a[@title="Older Comments"]')
        except Exception:  # NoSuchElementException: no further pages
            break
        older_link.click()
        time.sleep(1.5)  # crude wait for the next page to render
        html = phantom.page_source
        parsePage(html)
        # Legacy terminator kept from the original, as a safety net.
        if 'pager_next_disabled' in html:
            break


if __name__ == '__main__':
    try:
        getPage()
    finally:
        # Fix: always release the browser process, even if the crawl raises;
        # the original leaked a chromedriver/Chrome pair on any exception.
        phantom.quit()

#等待页面加载完成再进行点击下一页

#新浪新闻
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from lxml import etree
import redis

# 生成一个浏览器

class XingWen(object):
    """Crawl Sina's rolling-news list with Selenium and queue article URLs in Redis."""

    def __init__(self):
        # Browser driven by a local chromedriver binary (hard-coded path).
        self.browser = webdriver.Chrome(executable_path=r'C:/Users/cz/Desktop/chromedriver.exe')
        # Explicit wait (up to 20 s) used for the pagination button.
        self.wait = WebDriverWait(self.browser, 20)
        # First page of the rolling-news list.
        self.base_url = 'http://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1'

        # Redis connection; scraped URLs are pushed onto the list named "url".
        self.pool = redis.ConnectionPool(host='127.0.0.1', port='6379', decode_responses=True, db=0)
        self.r = redis.Redis(connection_pool=self.pool)

    def parse_list_href(self, html):
        """Yield every article href found in the news-list markup."""
        tree = etree.HTML(html)
        for url in tree.xpath('//span[@class="c_tit"]/a/@href'):
            yield url

    def gettype(self):
        """Walk the paginated list, pushing every article URL to Redis."""
        self.browser.get(self.base_url)
        # Fix: the original called parse_list_href() and discarded the
        # generator, so the first page's URLs were never stored.
        for url in self.parse_list_href(self.browser.page_source):
            self.put_redis(url)
        while True:
            try:
                next_btn = self.wait.until(
                    EC.element_to_be_clickable((By.XPATH, '//div[@class="pagebox"]/span[last()]')))
            except Exception as e:
                print(e)
                print('错误')
                # Fix: the original kept looping (and waiting 20 s each pass)
                # forever when the button never became clickable; give up.
                break
            next_btn.click()
            time.sleep(6)  # let the next page of the JS-rendered list load
            # Fix: parse once (the original parsed twice, discarding the first
            # generator) and drop the dead `urls is None` check — the
            # generator never yields None.
            for url in self.parse_list_href(self.browser.page_source):
                print('----------------', url)
                self.put_redis(url)

    def put_redis(self, urls):
        """Push one URL onto the Redis list named 'url'."""
        self.r.lpush('url', urls)

# def get_list(url):

if __name__ == '__main__':
    # Build the crawler and start walking the news list.
    crawler = XingWen()
    crawler.gettype()

#瀑布流页面的获取方式

#百度妹子图
from selenium import webdriver
import requests
import time
from lxml import etree
# Headless browser for the infinite-scroll page.
# NOTE(review): PhantomJS support was removed from Selenium; headless Chrome
# or Firefox is the modern replacement — confirm the installed Selenium version.
browser = webdriver.PhantomJS()
# Request headers replayed from a captured browser session; the hard-coded
# Referer and Cookie values will go stale — TODO refresh before use.
headers = {
    "Host" : "imgstat.baidu.com",
    "Connection" : "keep-alive",
    "Cache-Control" : "max-age=0",
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36",
    "Accept" : "*/*",
    "Referer" : "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%B0%8F%E5%A6%B9&step_word=&hs=0&pn=5&spn=0&di=124598778700&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=undefined&lm=undefined&st=undefined&cs=2504568129%2C2923613288&os=2430062590%2C716592224&simid=3348946789%2C501623622&adpicid=0&lpn=0&ln=1977&fr=&fmq=1510276313431_R&fm=&ic=undefined&s=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fimg.pconline.com.cn%2Fimages%2Fupload%2Fupc%2Ftx%2Fitbbs%2F1502%2F26%2Fc59%2F3284329_1424958048599_mthumb.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3F1r_z%26e3Brv5gstgj_z%26e3Bv54_z%26e3BvgAzdH3Fri5p5AzdH3Fstfp_n9bccnn_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0",
    "Accept-Encoding" : "gzip, deflate",
    "Accept-Language" : "zh-CN,zh;q=0.9",
    "Cookie" : "BAIDUID=C478AA4223D2DFB8CAD340CDA1369902:FG=1; BIDUPSID=C478AA4223D2DFB8CAD340CDA1369902; PSTM=1509961517; PSINO=1; H_PS_PSSID=1450_21113_17001_24880_22157; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; userFrom=null",
}
def getPic(html):
    """Scrape image URLs from the newest results block and download each one.

    Args:
        html: page source of the Baidu image-search results page.

    Side effects:
        Writes each image into ``./images/`` (created if missing) and prints
        progress to stdout.
    """
    import os  # local import keeps the fix self-contained

    # Fix: the original ignored the `html` parameter and re-read
    # browser.page_source through a global; honor the argument instead.
    tree = etree.HTML(html)
    # The last "imgpage" div holds the most recently appended batch of the
    # waterfall layout — presumably the only new content after a scroll.
    latest_block = tree.xpath('//div[@class="imgpage"]')[-1]
    pic_urls = latest_block.xpath('.//ul/li//img/@data-imgurl')
    print(len(pic_urls))
    # Fix: the original crashed when ./images/ did not exist.
    os.makedirs('./images', exist_ok=True)
    for pic in pic_urls:
        print(pic)
        response = requests.get(pic, headers=headers)
        fname = pic.split('/')[-1]
        with open('./images/' + fname, 'wb') as f:
            f.write(response.content)

def getPage():
    """Open the Baidu image search page, scroll 10 times, scraping after each scroll.

    Each scroll jumps to the bottom of the page so the waterfall layout
    requests the next batch of images; ``getPic`` is called on the fresh
    page source every time. Quits the browser when done.
    """
    browser.get('http://image.baidu.com/search/index?tn=baiduimage&word=%E5%B0%8F%E5%A6%B9')
    time.sleep(1)  # wait for the initial render
    try:
        getPic(browser.page_source)
        total = 10
        # Idiom fix: for/range instead of a manual while-counter.
        for i in range(total):
            print('滚动%d次' % i)
            # Jump to the bottom so the waterfall layout loads the next batch.
            browser.execute_script('scrollTo(0,document.body.scrollHeight)')
            # Brief pause so lazily loaded images appear in the DOM.
            time.sleep(1)
            getPic(browser.page_source)
    finally:
        # Fix: always release the browser process, even if scraping raises;
        # the original leaked a PhantomJS process on any exception.
        browser.quit()
if __name__ == '__main__':
    # Entry point: run the Baidu image crawl (getPage also quits the browser).
    getPage()

定位UI元素 (WebElements)

关于元素的选取,有如下的 API(find_element_* 选取单个元素,find_elements_* 选取所有匹配的元素):

find_element_by_id

find_elements_by_name

find_elements_by_xpath

find_elements_by_link_text

find_elements_by_partial_link_text

find_elements_by_tag_name

find_elements_by_class_name

find_elements_by_css_selector

鼠标动作链:有些时候,我们需要在页面上模拟一些鼠标操作,比如双击、右击、拖拽甚至按住不动等,我们可以通过导入 ActionChains 类来做到:



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值