selenium 爬取淘宝宠物信息

首先获得淘宝页面,并且输入查找信息,得到淘宝宠物的宝贝信息

首先要对浏览器对象进行初始化:

browser = webdriver.Chrome("D:/Program Files (x86)/Google/Chrome/Application/chromedriver")  # pass the local chromedriver path explicitly

因为我的 chromedriver 安装在 D:/Program Files (x86)/Google/Chrome/Application/chromedriver,所以这里显式传入它的路径;如果不写路径,有时会报错,无法调起 Chrome 浏览器。

def index_page(max_retries=3):
    """Open the start page, search for ``KeyWord`` and return the pager text.

    Loads ``url``, types ``KeyWord`` into the ``#q`` search box, presses
    ENTER and waits for the first element matching ``.total``.

    :param max_retries: number of additional attempts made after a
        ``TimeoutException`` before the exception is propagated.
    :return: text of the ``.total`` element (the caller extracts the number
        of result pages from it).
    :raises TimeoutException: if every attempt times out.
    """
    wait = WebDriverWait(browser, 10)
    attempts_left = max_retries
    while True:
        try:
            browser.get(url)
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            search_box.send_keys(KeyWord)  # type the product to search for
            search_box.send_keys(Keys.ENTER)
            # Element holding the total number of result pages.
            total = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.total'))
            )
            return total.text
        except TimeoutException:
            # Retry in a loop instead of recursing: the recursive version
            # never gave up and grew the call stack on every timeout.
            if attempts_left <= 0:
                raise
            attempts_left -= 1

得到总页数以后,接下来要实现翻页:解析淘宝搜索结果页的分页控件,在页码输入框中填入目标页码并点击确定按钮。具体代码如下:

def next_page(page, max_retries=3):
    """Scrape the page currently shown, then jump to result page ``page``.

    The products of the current page are saved via ``get_product()`` before
    the pager is used, so calling this for pages 2..N stores pages 1..N-1.

    :param page: number of the result page to navigate to.
    :param max_retries: number of additional attempts made after a
        ``TimeoutException`` before the exception is propagated.
    :return: None
    :raises TimeoutException: if every attempt times out.
    """
    wait = WebDriverWait(browser, 10)
    pager_form = '#mainsrp-pager > div > div > div > div.form'
    active_page = '#mainsrp-pager > div > div > div > ul > li.item.active > span'
    scraped = False
    attempts_left = max_retries
    while True:
        try:
            page_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, pager_form + ' > input'))
            )
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, pager_form + ' > span.btn.J_Submit'))
            )
            # Scrape only once: the old recursive retry re-ran get_product()
            # on every timeout and stored the same products repeatedly.
            if not scraped:
                get_product()
                scraped = True

            page_box.clear()
            page_box.send_keys(str(page))
            submit.click()
            # Wait until the highlighted pager entry shows the requested page.
            wait.until(
                EC.text_to_be_present_in_element((By.CSS_SELECTOR, active_page), str(page))
            )
            return None
        except TimeoutException:
            if attempts_left <= 0:
                raise
            attempts_left -= 1

能够循环获得页面以后就要对商品的信息进行提取,代码实现如下

def get_product():
    """Parse every product on the current results page, print it and save it.

    Waits (up to 10 s) for at least one product element, parses the page
    source with PyQuery and, for each product, builds a dict that is printed
    and handed to ``save_mondb``.
    """
    first_item = (By.CSS_SELECTOR, '#mainsrp-itemlist > div .items .item')
    WebDriverWait(browser, 10).until(EC.presence_of_element_located(first_item))

    document = pq(browser.page_source)
    for entry in document('#mainsrp-itemlist .items .item').items():
        record = {}
        record['image'] = entry.find('.pic .img').attr('src')
        record['price'] = entry.find('strong').text()
        # Drop the last two characters of the sales text
        # (presumably a unit suffix; verify against the page markup).
        sales_text = entry.find('.deal-cnt').text()
        record['deal-cnt'] = sales_text[:-2]
        record['location'] = entry.find('.location').text()
        record['J_ClickStat'] = entry.find('.J_ClickStat').text()
        record['shop'] = entry.find('.shop span').siblings('span').text()

        print(record)
        save_mondb(record)

最后把信息保存到 MongoDB 里:

def save_mondb(result):
    """Insert one product document into the ``MONGO_COLLECTION`` collection.

    Prints ``suc`` when the insert succeeds, or ``f`` followed by the error
    when PyMongo reports a failure. The error is not re-raised, so a single
    failed insert does not stop the crawl.

    :param result: dict describing one product.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4 (where the old call failed on
        # every document and the failure was silently swallowed).
        db[MONGO_COLLECTION].insert_one(result)
    except pymongo.errors.PyMongoError as exc:
        print('f', exc)
    else:
        print('suc')

全部代码如下

from selenium import webdriver
import re
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import pymongo

# MongoDB connection settings and the shared client/database handles.
MONGO_URL="localhost"
MONGO_DB='taobao1'
MONGO_COLLECTION='product'
client=pymongo.MongoClient(MONGO_URL)
db=client[MONGO_DB]
# Chrome driver shared by all functions below; the argument is the local chromedriver path.
browser = webdriver.Chrome("D:/Program Files (x86)/Google/Chrome/Application/chromedriver")
KeyWord='考研书籍全套2019'  # search term typed into the search box by index_page()
url="https://www.taobao.com/"  # page opened by index_page()

def index_page(max_retries=3):
    """Open the start page, search for ``KeyWord`` and return the pager text.

    Loads ``url``, types ``KeyWord`` into the ``#q`` search box, presses
    ENTER and waits for the first element matching ``.total``.

    :param max_retries: number of additional attempts made after a
        ``TimeoutException`` before the exception is propagated.
    :return: text of the ``.total`` element (the caller extracts the number
        of result pages from it).
    :raises TimeoutException: if every attempt times out.
    """
    wait = WebDriverWait(browser, 10)
    attempts_left = max_retries
    while True:
        try:
            browser.get(url)
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            search_box.send_keys(KeyWord)
            search_box.send_keys(Keys.ENTER)
            total = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.total'))
            )
            return total.text
        except TimeoutException:
            # Retry in a loop instead of recursing: the recursive version
            # never gave up and grew the call stack on every timeout.
            if attempts_left <= 0:
                raise
            attempts_left -= 1
def next_page(page, max_retries=3):
    """Scrape the page currently shown, then jump to result page ``page``.

    The products of the current page are saved via ``get_product()`` before
    the pager is used, so calling this for pages 2..N stores pages 1..N-1.

    :param page: number of the result page to navigate to.
    :param max_retries: number of additional attempts made after a
        ``TimeoutException`` before the exception is propagated.
    :return: None
    :raises TimeoutException: if every attempt times out.
    """
    wait = WebDriverWait(browser, 10)
    pager_form = '#mainsrp-pager > div > div > div > div.form'
    active_page = '#mainsrp-pager > div > div > div > ul > li.item.active > span'
    scraped = False
    attempts_left = max_retries
    while True:
        try:
            page_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, pager_form + ' > input'))
            )
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, pager_form + ' > span.btn.J_Submit'))
            )
            # Scrape only once: the old recursive retry re-ran get_product()
            # on every timeout and stored the same products repeatedly.
            if not scraped:
                get_product()
                scraped = True

            page_box.clear()
            page_box.send_keys(str(page))
            submit.click()
            # Wait until the highlighted pager entry shows the requested page.
            wait.until(
                EC.text_to_be_present_in_element((By.CSS_SELECTOR, active_page), str(page))
            )
            return None
        except TimeoutException:
            if attempts_left <= 0:
                raise
            attempts_left -= 1
def get_product():
    """Parse every product on the current results page, print it and save it.

    Waits (up to 10 s) for at least one product element, parses the page
    source with PyQuery and, for each product, builds a dict that is printed
    and handed to ``save_mondb``.
    """
    first_item = (By.CSS_SELECTOR, '#mainsrp-itemlist > div .items .item')
    WebDriverWait(browser, 10).until(EC.presence_of_element_located(first_item))

    document = pq(browser.page_source)
    for entry in document('#mainsrp-itemlist .items .item').items():
        record = {}
        record['image'] = entry.find('.pic .img').attr('src')
        record['price'] = entry.find('strong').text()
        # Drop the last two characters of the sales text
        # (presumably a unit suffix; verify against the page markup).
        sales_text = entry.find('.deal-cnt').text()
        record['deal-cnt'] = sales_text[:-2]
        record['location'] = entry.find('.location').text()
        record['J_ClickStat'] = entry.find('.J_ClickStat').text()
        record['shop'] = entry.find('.shop span').siblings('span').text()

        print(record)
        save_mondb(record)
def save_mondb(result):
    """Insert one product document into the ``MONGO_COLLECTION`` collection.

    Prints ``suc`` when the insert succeeds, or ``f`` followed by the error
    when PyMongo reports a failure. The error is not re-raised, so a single
    failed insert does not stop the crawl.

    :param result: dict describing one product.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4 (where the old call failed on
        # every document and the failure was silently swallowed).
        db[MONGO_COLLECTION].insert_one(result)
    except pymongo.errors.PyMongoError as exc:
        print('f', exc)
    else:
        print('suc')


def main():
    """Run the crawl: search, read the page count, then scrape every page.

    :raises ValueError: if the pager text returned by ``index_page`` contains
        no number.
    """
    total_text = index_page()
    # Raw string so that \d is a regex class, not a string escape.
    match = re.search(r'(\d+)', total_text)
    if match is None:
        raise ValueError('no page count found in pager text: %r' % (total_text,))
    page = int(match.group(1))
    print(page)

    # next_page(i) saves the page currently shown and then moves to page i,
    # so this loop stores pages 1 .. page-1.
    for i in range(2, page + 1):
        next_page(i)

    # The browser now shows the last page, which the loop above never
    # scraped (this also covers the single-page case).
    get_product()


# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值