Selenium
Selenium is a web automation testing tool, originally developed for automated website testing. It is similar in spirit to 按键精灵, the key/mouse macro tool people use for games: it performs operations automatically according to the commands you give it. The difference is that Selenium runs directly in the browser and supports all mainstream browsers (including headless browsers such as PhantomJS).
Following our instructions, Selenium can have the browser load pages automatically, fetch the data we need, take screenshots of pages, or check whether certain actions have happened on a site.
Install it with pip: pip install selenium
Download a chromedriver that matches your Chrome version from: https://npm.taobao.org/mirrors/chromedriver/
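Before the full examples below, here is a minimal sketch of driving Chrome; the chromedriver path and the URL are placeholders, not part of the original examples:

from selenium import webdriver
import time

# start a Chrome instance controlled by chromedriver
browser = webdriver.Chrome(executable_path=r'C:\path\to\chromedriver.exe')

browser.get('https://www.baidu.com')   # have the browser load a page
time.sleep(1)                          # crude wait for the page to render
print(browser.title)                   # title of the loaded page
html = browser.page_source             # fully rendered HTML, ready for lxml etc.
browser.save_screenshot('page.png')    # screenshot of the current view
browser.quit()                         # close the browser and end the driver process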
We take jandan.net (煎蛋网) as an example. Its 妹子图 picture section has fairly good anti-scraping measures, so we crawl the content by driving a real browser.
The code is as follows:
# Note: when sending requests with Selenium, allow time for pages to load
from selenium import webdriver
import time
from lxml import etree
from urllib import request

# start a browser instance
phantom = webdriver.Chrome(executable_path=r'C:\Users\cz\Desktop\chromedriver.exe')
# the page requested through the browser
base_url = 'http://jandan.net/ooxx'


def parsePage(html):
    # parse the rendered HTML and pull out every image URL on the page
    html = etree.HTML(html)
    job_img = html.xpath('//div[@class="text"]//img/@src')
    print(job_img)
    for img in job_img:
        if 'jpg' in img:
            names = img.split('/')[-1]
            lj = 'images/' + names
            request.urlretrieve(img, lj)   # download the image into the images/ directory


def getPage():
    # request and parse the first page
    phantom.get(base_url)
    time.sleep(1)
    html = phantom.page_source
    parsePage(html)
    while True:
        # find and click the next-page ("Older Comments") link
        try:
            phantom.find_element_by_xpath('//a[@title="Older Comments"]').click()
        except Exception:
            # no next-page link found: stop paging
            break
        time.sleep(1.5)
        html = phantom.page_source
        parsePage(html)
        if 'pager_next_disabled' in html:
            break


if __name__ == '__main__':
    getPage()
    phantom.quit()
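The example above relies on fixed time.sleep() calls. A sketch of the same next-page click using Selenium's explicit waits instead (the driver path is a placeholder):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path=r'C:\path\to\chromedriver.exe')
driver.get('http://jandan.net/ooxx')

# wait up to 10 seconds for the "Older Comments" link to become clickable,
# instead of sleeping a fixed amount of time
wait = WebDriverWait(driver, 10)
older = wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@title="Older Comments"]')))
older.click()

driver.quit()

The same explicit-wait pattern is used more fully in the Sina News example below.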
# Wait for the page to finish loading before clicking the next-page button
# Sina News (新浪新闻)
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from lxml import etree
import redis


class XingWen(object):
    def __init__(self):
        # start a browser instance
        self.browser = webdriver.Chrome(executable_path=r'C:/Users/cz/Desktop/chromedriver.exe')
        self.wait = WebDriverWait(self.browser, 20)
        # the list page to request
        self.base_url = 'http://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1'
        # redis connection used to store the collected URLs
        self.pool = redis.ConnectionPool(host='127.0.0.1', port=6379, decode_responses=True, db=0)
        self.r = redis.Redis(connection_pool=self.pool)

    def parse_list_href(self, html):
        # extract the detail-page links from the list page
        html = etree.HTML(html)
        a_url = html.xpath('//span[@class="c_tit"]/a/@href')
        for urls in a_url:
            # this is a generator: it must be iterated and its items checked,
            # otherwise nothing is actually produced
            yield urls

    def gettype(self):
        self.browser.get(self.base_url)
        html = self.browser.page_source
        # handle the first page
        for urls in self.parse_list_href(html):
            self.put_redis(urls)
        while True:
            click_botton = None
            try:
                # wait until the next-page button is clickable
                click_botton = self.wait.until(
                    EC.element_to_be_clickable((By.XPATH, '//div[@class="pagebox"]/span[last()]')))
            except Exception as e:
                print(e)
                print('failed to locate the next-page button')
            if click_botton is None:
                # no next page: stop instead of looping forever
                break
            click_botton.click()
            time.sleep(6)
            html = self.browser.page_source
            for urls in self.parse_list_href(html):
                print('----------------', urls)
                # skip empty values
                if urls is None:
                    continue
                # push every URL into redis
                self.put_redis(urls)

    def put_redis(self, urls):
        # append the URL to the "url" list in redis
        self.r.lpush('url', urls)


if __name__ == '__main__':
    a = XingWen()
    a.gettype()
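The URLs pushed by put_redis() end up in a Redis list named 'url', so a separate process can pop and handle them later. A minimal consumer sketch; the key name matches the code above, everything else is just illustration:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)

# pop URLs off the 'url' list until it is empty
while True:
    url = r.rpop('url')
    if url is None:
        break
    print('would now request and parse:', url)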
# How to scrape a waterfall-flow (infinite scroll) page
# Baidu image search (百度妹子图)
from selenium import webdriver
import requests
import time
from lxml import etree

# PhantomJS is a headless browser, so no window is opened
browser = webdriver.PhantomJS()

# headers copied from a real browser session; the Referer and Cookie values
# come from an actual search, otherwise Baidu refuses the image requests
headers = {
    "Host": "imgstat.baidu.com",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36",
    "Accept": "*/*",
    "Referer": "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%B0%8F%E5%A6%B9&step_word=&hs=0&pn=5&spn=0&di=124598778700&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=undefined&lm=undefined&st=undefined&cs=2504568129%2C2923613288&os=2430062590%2C716592224&simid=3348946789%2C501623622&adpicid=0&lpn=0&ln=1977&fr=&fmq=1510276313431_R&fm=&ic=undefined&s=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fimg.pconline.com.cn%2Fimages%2Fupload%2Fupc%2Ftx%2Fitbbs%2F1502%2F26%2Fc59%2F3284329_1424958048599_mthumb.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3F1r_z%26e3Brv5gstgj_z%26e3Bv54_z%26e3BvgAzdH3Fri5p5AzdH3Fstfp_n9bccnn_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": "BAIDUID=C478AA4223D2DFB8CAD340CDA1369902:FG=1; BIDUPSID=C478AA4223D2DFB8CAD340CDA1369902; PSTM=1509961517; PSINO=1; H_PS_PSSID=1450_21113_17001_24880_22157; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; userFrom=null",
}


def getPic(html):
    # parse the currently rendered page and download the newest batch of images
    html = etree.HTML(html)
    pic_div = html.xpath('//div[@class="imgpage"]')[-1]
    pic_urls = pic_div.xpath('.//ul/li//img/@data-imgurl')
    print(len(pic_urls))
    for pic in pic_urls:
        print(pic)
        response = requests.get(pic, headers=headers)
        fname = pic.split('/')[-1]
        with open('./images/' + fname, 'wb') as f:
            f.write(response.content)


def getPage():
    browser.get('http://image.baidu.com/search/index?tn=baiduimage&word=%E5%B0%8F%E5%A6%B9')
    time.sleep(1)
    # browser.save_screenshot('baidu.png')
    getPic(browser.page_source)
    total = 10
    i = 0
    while i < total:
        print('scroll number %d' % i)
        # scroll to the bottom so the page loads the next batch of images
        browser.execute_script('scrollTo(0,document.body.scrollHeight)')
        # a short pause to let the images load
        time.sleep(1)
        getPic(browser.page_source)
        i += 1
        # browser.save_screenshot('baidu.png')
    browser.quit()


if __name__ == '__main__':
    getPage()
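The example above scrolls a fixed number of times (total = 10). A common alternative, shown here only as a sketch with a bare-bones driver setup, is to keep scrolling until document.body.scrollHeight stops growing, which means no new content was appended:

from selenium import webdriver
import time

browser = webdriver.PhantomJS()
browser.get('http://image.baidu.com/search/index?tn=baiduimage&word=%E5%B0%8F%E5%A6%B9')

last_height = browser.execute_script('return document.body.scrollHeight')
while True:
    # jump to the bottom of the page to trigger lazy loading
    browser.execute_script('scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)                  # give the page time to append the next batch
    new_height = browser.execute_script('return document.body.scrollHeight')
    if new_height == last_height:  # height unchanged: nothing new was loaded
        break
    last_height = new_height

browser.quit()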
Locating UI elements (WebElements)
For selecting elements, the following APIs are available. Each find_element_by_* call returns a single element, and the corresponding find_elements_by_* form returns a list of all matches (see the sketch after this list):
find_element_by_id
find_elements_by_name
find_elements_by_xpath
find_elements_by_link_text
find_elements_by_partial_link_text
find_elements_by_tag_name
find_elements_by_class_name
find_elements_by_css_selector
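A short illustration of these locators; it assumes Baidu's home page, where the search box has id "kw" (the page-specific details are only an example):

from selenium import webdriver

browser = webdriver.Chrome(executable_path=r'C:\path\to\chromedriver.exe')
browser.get('https://www.baidu.com')

# find_element_by_* returns one element and raises NoSuchElementException if nothing matches
search_box = browser.find_element_by_id('kw')
search_box.send_keys('selenium')

# find_elements_by_* returns a (possibly empty) list
links = browser.find_elements_by_tag_name('a')
print(len(links))

# the same element located via a CSS selector and via XPath
browser.find_element_by_css_selector('#kw')
browser.find_element_by_xpath('//input[@id="kw"]')

browser.quit()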
Mouse action chains (ActionChains)
Sometimes we need to simulate mouse operations on a page, such as double-clicking, right-clicking, dragging, or even pressing and holding. We can do this by importing the ActionChains class:
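A sketch of the common ActionChains calls; the target element (Baidu's search box, id "kw") is only an example:

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

browser = webdriver.Chrome(executable_path=r'C:\path\to\chromedriver.exe')
browser.get('https://www.baidu.com')

element = browser.find_element_by_id('kw')   # example target element

actions = ActionChains(browser)
actions.move_to_element(element)   # move the mouse onto the element (hover)
actions.context_click(element)     # right-click
actions.double_click(element)      # double-click
actions.click_and_hold(element)    # press and hold the left mouse button
actions.release()                  # release the button
actions.perform()                  # the calls above only queue actions; perform() runs them in order

browser.quit()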