# Scrape Douban Books (豆瓣读书) with selenium + PhantomJS
from selenium import webdriver
import time
from lxml import etree
import pymysql
import re
# Define the function that loads a page in the browser
def my_brower(url, page):
    """Load ``url`` in a headless PhantomJS browser and parse the rendered page.

    The page source is read right after navigation and handed to
    ``parse_html``. The browser is shut down before parsing starts, whether or
    not loading succeeded.

    Args:
        url: Address of the search-result page to load.
        page: Result offset the caller used to build ``url``. It is not read
            here and is accepted only so existing callers keep working.
    """
    # Start a PhantomJS-driven browser from the locally installed binary.
    browers = webdriver.PhantomJS(executable_path=r'C:\Users\Administrator\Desktop\phantomjs_2.1.1_windows\bin\phantomjs.exe')
    try:
        # Navigate to the requested page.
        browers.get(url)
        # NOTE: a 2-second politeness pause (time.sleep(2)) used to sit here
        # but was disabled; re-enable it at this point if requests need
        # throttling.
        html = browers.page_source
    finally:
        # Always terminate the PhantomJS process; otherwise every call leaves
        # one orphaned browser behind.
        browers.quit()
    # Parse only after the browser has been released.
    parse_html(html)
# Parse the page content
def _first_match(node, path, default=''):
    """Return the first result of ``node.xpath(path)``, or ``default`` if empty."""
    found = node.xpath(path)
    return found[0] if found else default


def _clean_book_name(raw_name):
    """Remove quote characters and trailing backslashes from a book title."""
    cleaned = raw_name.replace('"', '').replace("'", '')
    # Only a backslash at the very end of the title can escape the closing
    # quote of a SQL string literal, so strip those and leave the rest of the
    # title untouched.
    return cleaned.rstrip('\\')


def parse_html(html):
    """Extract every book entry from a search-result page and store it.

    For each result container the cover URL, title, detail-page URL, rating
    and the slash-separated metadata line are collected into a dict, which is
    printed and passed to ``insert_mysql``.

    Entries without a title link are skipped. A missing cover or rating is
    stored as an empty string. A metadata line with fewer than three
    slash-separated parts is left-padded with empty strings so the trailing
    fields (price, date, publisher) keep their positions.

    Args:
        html: Page source as a string.
    """
    tree = etree.HTML(html)
    if tree is None:
        # Nothing parseable (e.g. an empty page source).
        return

    title_link = ".//div[@class='title']/a[@class='title-text']"
    meta_path = "./div[@class='item-root']/div[@class='detail']/div[@class='meta abstract']/text()"

    # Result containers are the divs whose class attribute contains
    # 'sc-bZQynM'.
    books = tree.xpath("//div[contains(@class,'sc-bZQynM')]")
    for book in books:
        raw_name = _first_match(book, title_link + "/text()", None)
        book_url = _first_match(book, title_link + "/@href", None)
        if raw_name is None or book_url is None:
            # Without a title link there is no name or URL to store.
            continue

        book_dict = {}
        # Cover image
        book_dict['book_pic'] = _first_match(book, ".//img/@src")
        # Title
        book_dict['book_name'] = _clean_book_name(raw_name)
        # Link to the detail page
        book_dict['book_url'] = book_url
        # Rating (absent on some entries)
        book_dict['book_score'] = _first_match(book, ".//span[@class='rating_nums']/text()")

        # Metadata line, read from the tail:
        # authors... / publisher / date / price
        all_div = str(_first_match(book, meta_path)).split('/')
        print(all_div)
        if len(all_div) < 3:
            all_div = [''] * (3 - len(all_div)) + all_div
        book_dict['book_price'] = all_div[-1]
        book_dict['book_date'] = all_div[-2]
        book_dict['book_author'] = ','.join(all_div[:-3])
        book_dict['book_detail'] = all_div[-3]
        print(book_dict)

        # Persist the record.
        insert_mysql(book_dict)
def insert_mysql(book_dict):
    """Insert one scraped book record into the ``python_book`` table.

    The values are sent as query parameters, so quotes or backslashes in any
    field cannot break or alter the statement. The connection is closed when
    the insert finishes or fails.

    Args:
        book_dict: Mapping with the keys ``book_pic``, ``book_name``,
            ``book_url``, ``book_score``, ``book_price``, ``book_date``,
            ``book_author`` and ``book_detail``.

    Raises:
        KeyError: If one of the expected keys is missing from ``book_dict``.
    """
    # Read every field before connecting so a malformed record never opens a
    # connection. The order matches the positional VALUES list below.
    row = (
        book_dict['book_pic'],
        book_dict['book_name'],
        book_dict['book_url'],
        book_dict['book_score'],
        book_dict['book_price'],
        book_dict['book_date'],
        book_dict['book_author'],
        book_dict['book_detail'],
    )
    sql = 'insert into python_book values (%s,%s,%s,%s,%s,%s,%s,%s)'

    # NOTE(review): connection settings are hard-coded; consider moving them
    # to configuration.
    db = pymysql.connect(host='localhost', port=3306, password='1234', user='root', db='test', charset='utf8')
    try:
        with db.cursor() as cur:
            # The driver quotes and escapes each parameter itself.
            cur.execute(sql, row)
        db.commit()
    finally:
        # Release the connection even when the insert fails.
        db.close()
if __name__ == '__main__':
    # Walk 199 result pages for the search text "python". The `start` query
    # parameter is advanced by 15 for each page.
    for i in range(0, 199):
        print('====输出第{}页='.format((i + 1)))
        page = i * 15
        base_url = 'https://book.douban.com/subject_search?search_text=python&cat=1001&start=' + str(page)
        my_brower(base_url, page)