首先获得淘宝页面,并且输入查找信息,得到淘宝宠物的宝贝信息
首先要对浏览器对象进行初始化:
# Create the shared Chrome WebDriver instance. The string argument is the
# location of the chromedriver executable on the author's machine.
browser = webdriver.Chrome("D:/Program Files (x86)/Google/Chrome/Application/chromedriver")
因为我的 chromedriver 安装在 D:/Program Files (x86)/Google/Chrome/Application/chromedriver,所以这里传入了它的安装位置;如果不传入路径,有时候会报错,无法调用 Chrome 浏览器。
def index_page(max_retries=3):
    """Open the start page, search for ``KeyWord`` and return the pager text.

    Loads ``url`` in the shared ``browser``, types ``KeyWord`` into the
    ``#q`` search box, presses ENTER and waits for the ``.total`` element
    of the result page.

    :param max_retries: how many times the whole sequence is attempted when
        a wait times out (values below 1 are treated as 1).
    :return: the text of the ``.total`` element; the caller extracts the
        page count from it.
    :raises TimeoutException: if every attempt timed out.
    """
    attempts = max(1, max_retries)
    # A bounded loop replaces the former unbounded recursive retry, which
    # could only end in RecursionError when the page never loaded.
    for attempt in range(1, attempts + 1):
        try:
            browser.get(url)
            search_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            search_box.send_keys(KeyWord)  # type the product to search for
            search_box.send_keys(Keys.ENTER)
            total = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.total'))
            )  # pager summary element
            return total.text  # text that contains the number of pages
        except TimeoutException:
            if attempt == attempts:
                raise
获得总页数以后,接下来就要实现如何跳转到下一页,这需要对淘宝网页的分页控件进行解析,具体代码实现如下:
def next_page(page, max_retries=3):
    """Scrape the page currently shown, then jump to result page ``page``.

    The products of the page that is displayed *before* navigating are
    collected with ``get_product()``; afterwards ``page`` is typed into the
    pager's input box, the submit button is clicked and the function waits
    until the active pager item shows ``page``.

    :param page: number of the result page to navigate to.
    :param max_retries: how many times the sequence is attempted when a
        wait times out (values below 1 are treated as 1).
    :return: None
    :raises TimeoutException: if every attempt timed out.
    """
    attempts = max(1, max_retries)
    scraped = False
    # A bounded loop replaces the former unbounded recursive retry.
    for attempt in range(1, attempts + 1):
        try:
            page_input = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            # Scrape only once: a retry caused by a slow page switch must
            # not store the same products a second time.
            if not scraped:
                get_product()
                scraped = True
            page_input.clear()
            page_input.send_keys(str(page))
            submit.click()
            WebDriverWait(browser, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page),
                )
            )
            return
        except TimeoutException:
            if attempt == attempts:
                raise
能够循环获得页面以后就要对商品的信息进行提取,代码实现如下
def get_product():
    """Extract every product on the current result page and store each one.

    Waits up to 10 seconds for at least one product element to be present,
    parses the browser's page source with PyQuery, builds one dict per
    product element, prints it and hands it to ``save_mondb``.
    """
    item_locator = (By.CSS_SELECTOR, '#mainsrp-itemlist > div .items .item')
    WebDriverWait(browser, 10).until(EC.presence_of_element_located(item_locator))

    document = pq(browser.page_source)
    for card in document('#mainsrp-itemlist .items .item').items():
        product = {
            'image': card.find('.pic .img').attr('src'),
            'price': card.find('strong').text(),
            # The last two characters of the deal-count text are dropped.
            'deal-cnt': card.find('.deal-cnt').text()[:-2],
            'location': card.find('.location').text(),
            'J_ClickStat': card.find('.J_ClickStat').text(),
            'shop': card.find('.shop span').siblings('span').text(),
        }
        print(product)
        save_mondb(product)
最后把信息保存到 MongoDB 里:
def save_mondb(result):
    """Insert one scraped product document into the MongoDB collection.

    Saving is best-effort: a failure is reported on stdout and does not
    interrupt the crawl.

    :param result: dict describing one product.
    :return: None
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4 (where the old call failed
        # silently inside this try block for every document).
        db[MONGO_COLLECTION].insert_one(result)
    except Exception as exc:
        # Show the reason so that a systematic failure is visible.
        print('f', exc)
    else:
        print('suc')
全部代码如下
from selenium import webdriver
import re
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import pymongo
# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao1'
MONGO_COLLECTION = 'product'

# Start page and the keyword that is typed into its search box.
url = 'https://www.taobao.com/'
KeyWord = '考研书籍全套2019'

# Shared handles used by the functions in this script: the MongoDB database
# and the Chrome WebDriver (the argument is the chromedriver location).
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
browser = webdriver.Chrome('D:/Program Files (x86)/Google/Chrome/Application/chromedriver')
def index_page(max_retries=3):
    """Open the start page, search for ``KeyWord`` and return the pager text.

    Loads ``url`` in the shared ``browser``, types ``KeyWord`` into the
    ``#q`` search box, presses ENTER and waits for the ``.total`` element
    of the result page.

    :param max_retries: how many times the whole sequence is attempted when
        a wait times out (values below 1 are treated as 1).
    :return: the text of the ``.total`` element; ``main`` extracts the
        page count from it.
    :raises TimeoutException: if every attempt timed out.
    """
    attempts = max(1, max_retries)
    # A bounded loop replaces the former unbounded recursive retry, which
    # could only end in RecursionError when the page never loaded.
    for attempt in range(1, attempts + 1):
        try:
            browser.get(url)
            search_box = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            search_box.send_keys(KeyWord)
            search_box.send_keys(Keys.ENTER)
            total = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.total'))
            )
            return total.text
        except TimeoutException:
            if attempt == attempts:
                raise
def next_page(page, max_retries=3):
    """Scrape the page currently shown, then jump to result page ``page``.

    The products of the page that is displayed *before* navigating are
    collected with ``get_product()``; afterwards ``page`` is typed into the
    pager's input box, the submit button is clicked and the function waits
    until the active pager item shows ``page``.

    :param page: number of the result page to navigate to.
    :param max_retries: how many times the sequence is attempted when a
        wait times out (values below 1 are treated as 1).
    :return: None
    :raises TimeoutException: if every attempt timed out.
    """
    attempts = max(1, max_retries)
    scraped = False
    # A bounded loop replaces the former unbounded recursive retry.
    for attempt in range(1, attempts + 1):
        try:
            page_input = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            # Scrape only once: a retry caused by a slow page switch must
            # not store the same products a second time.
            if not scraped:
                get_product()
                scraped = True
            page_input.clear()
            page_input.send_keys(str(page))
            submit.click()
            WebDriverWait(browser, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page),
                )
            )
            return
        except TimeoutException:
            if attempt == attempts:
                raise
def get_product():
    """Extract every product on the current result page and store each one.

    Waits up to 10 seconds for at least one product element to be present,
    parses the browser's page source with PyQuery, builds one dict per
    product element, prints it and hands it to ``save_mondb``.
    """
    item_locator = (By.CSS_SELECTOR, '#mainsrp-itemlist > div .items .item')
    WebDriverWait(browser, 10).until(EC.presence_of_element_located(item_locator))

    document = pq(browser.page_source)
    for card in document('#mainsrp-itemlist .items .item').items():
        product = {
            'image': card.find('.pic .img').attr('src'),
            'price': card.find('strong').text(),
            # The last two characters of the deal-count text are dropped.
            'deal-cnt': card.find('.deal-cnt').text()[:-2],
            'location': card.find('.location').text(),
            'J_ClickStat': card.find('.J_ClickStat').text(),
            'shop': card.find('.shop span').siblings('span').text(),
        }
        print(product)
        save_mondb(product)
def save_mondb(result):
    """Insert one scraped product document into the MongoDB collection.

    Saving is best-effort: a failure is reported on stdout and does not
    interrupt the crawl.

    :param result: dict describing one product.
    :return: None
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4 (where the old call failed
        # silently inside this try block for every document).
        db[MONGO_COLLECTION].insert_one(result)
    except Exception as exc:
        # Show the reason so that a systematic failure is visible.
        print('f', exc)
    else:
        print('suc')
def main():
    """Run the crawl: search, read the page count, then walk all pages.

    ``next_page(i)`` scrapes the page that is displayed *before* it
    navigates to page ``i``, so after the loop the final page is still
    unscraped; it is collected by one closing ``get_product()`` call. The
    same call covers a search that yields a single page, for which the
    loop body never runs.

    :raises ValueError: if the pager text contains no number.
    """
    try:
        pager_text = index_page()
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        match = re.search(r'(\d+)', pager_text)
        if match is None:
            raise ValueError(
                'no page count found in pager text: {!r}'.format(pager_text)
            )
        total_pages = int(match.group(1))
        print(total_pages)
        for i in range(2, total_pages + 1):
            next_page(i)
        # Scrape the page left on screen by the last navigation.
        get_product()
    finally:
        # Always release the browser and its chromedriver process.
        browser.quit()


if __name__ == '__main__':
    main()