# Asynchronously crawl Douban Books (豆瓣读书) and store the results in MySQL.
import asyncio
import os
import re

import aiohttp
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, Column, String, Integer, ForeignKey, Table
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship
# Declarative base class shared by every ORM model defined in this module.
base = declarative_base()

# Database engine used by the whole module.
# The connection URL is read from the DOUBAN_DB_URL environment variable when
# it is set, so credentials do not have to live in the source file; otherwise
# the original local MySQL URL is used unchanged.
# pool_size / max_overflow: 10 pooled connections plus up to 5 extra ones.
engine = create_engine(
    os.environ.get(
        'DOUBAN_DB_URL',
        "mysql+pymysql://root:1117@127.0.0.1:3306/douban",
    ),
    max_overflow=5,
    pool_size=10,
    echo=False
)
# Association table for the many-to-many link between books and categories.
# Both foreign keys together form the composite primary key.
book_category = Table('book_category', base.metadata,
                      Column('book_id', ForeignKey('book.id'), primary_key=True),
                      Column('category_id', ForeignKey('category.id'), primary_key=True))
# Association table for the many-to-many link between books and tags.
# Both foreign keys together form the composite primary key.
book_tag = Table('book_tag', base.metadata,
                 Column('book_id', ForeignKey('book.id'), primary_key=True),
                 Column('tag_id', ForeignKey('tag.id'), primary_key=True))
class Book(base):
    """ORM model mapped to the ``book`` table.

    All descriptive columns are nullable strings. Links to tags and
    categories go through the ``book_tag`` and ``book_category`` association
    tables.
    """

    __tablename__ = 'book'

    id = Column(Integer(), autoincrement=True, primary_key=True)
    name = Column(String(50), nullable=True)
    author = Column(String(50), nullable=True)
    # Kept as text: values look like '88.00元(上下册)', i.e. not a bare number.
    price = Column(String(50), nullable=True)
    rate = Column(String(10), nullable=True)

    # Despite the ``*_id`` names, the two attributes below are relationship
    # collections (lists of Tag / Category objects), not scalar id columns.
    # Each also installs a ``book`` backref collection on the other class.
    # NOTE(review): 'all,delete' on a many-to-many relationship appears to
    # cascade deletes from a Book to the Tag/Category rows themselves, which
    # may be shared with other books -- confirm this is intended.
    tag_id = relationship(
        'Tag',
        secondary='book_tag',
        backref='book',
        cascade='all,delete',
    )
    category_id = relationship(
        'Category',
        secondary='book_category',
        backref='book',
        cascade='all,delete',
    )
class Tag(base):
    """ORM model mapped to the ``tag`` table.

    A tag optionally points at one category through the ``category_id``
    foreign key.
    """

    __tablename__ = 'tag'

    id = Column(Integer(), autoincrement=True, primary_key=True)
    # Tag text as shown on the site.
    tag = Column(String(50), nullable=True)
    # Scalar foreign key to ``category.id``; may be NULL.
    category_id = Column(
        Integer(),
        ForeignKey('category.id'),
        nullable=True,
    )
class Category(base):
    """ORM model mapped to the ``category`` table."""

    __tablename__ = 'category'

    id = Column(Integer(), autoincrement=True, primary_key=True)
    # Category title as shown on the site.
    category = Column(String(50), nullable=True)
    # Collection of Tag objects whose ``category_id`` references this row
    # (a relationship, not an id column); also installs a ``category``
    # backref on Tag.
    tag_id = relationship(
        'Tag',
        backref='category',
    )
# Create the tables registered on ``base.metadata`` (the models and
# association tables above) through ``engine``. This is a module-level
# statement, so it runs whenever the module is imported or executed.
base.metadata.create_all(engine)
# HTTP headers sent with every request (both the blocking `requests` calls and
# the aiohttp session).
# The User-Agent previously ended with a stray '}' that had slipped inside the
# string literal; it has been removed so the value is a well-formed UA string.
# NOTE(review): the Cookie looks like a value captured from one browser
# session; presumably it has to be refreshed once it expires -- confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
    'Cookie': 'bid="o+UMKZtHh1I"; ll="108296"; _vwo_uuid_v2=DA1862009F71EE5A48FFC32AA2D6E6E10|5f57ac0a3af4bd4695587bc3083b47f1; douban-fav-remind=1; __gads=ID=ed825193597303ee:T=1581398295:S=ALNI_MZ448x2QXNURpII65kTgIwyE6lTTw; gr_user_id=dd1a6251-159c-4547-9bc1-6f8b29284e75; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=47a3f7a4-f437-48ea-92fc-f87c3b4617b0; gr_cs1_47a3f7a4-f437-48ea-92fc-f87c3b4617b0=user_id%3A0; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1581588254%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DQFIAhWZsTKO_c1bRt7CyIYoujWFD8WL3yIlzkIjoKA-at_-XynwHOHJtCCgJxpbW%26wd%3D%26eqid%3Df4c3b3370009b995000000065e451f1b%22%5D; _pk_id.100001.3ac3=d8a799aa913a735e.1581588254.1.1581588254.1581588254.; _pk_ses.100001.3ac3=*; ap_v=0,6.0; __utma=30149280.545170205.1567607013.1581398299.1581588254.3; __utmc=30149280; __utmz=30149280.1581588254.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmb=30149280.1.10.1581588254; __utma=81379588.1042499984.1581588254.1581588254.1581588254.1; __utmc=81379588; __utmz=81379588.1581588254.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmb=81379588.1.10.1581588254; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_47a3f7a4-f437-48ea-92fc-f87c3b4617b0=true; __yadk_uid=KrdfNh0zbGFGKM10NkKDwMlnBPgbWIiw'
}
def get_book_html():
    """Crawl the Douban Books home page and store the books under each hot tag.

    The home page and every tag page are fetched with blocking ``requests``
    calls. For each tag page, the detail pages of the listed books are then
    fetched concurrently by scheduling one ``get_book_info`` task per book on
    a private event loop and waiting for the whole batch to finish.

    A failure in one book's task does not stop the crawl; it is printed and
    the remaining books and tags are still processed.
    """
    url = 'https://book.douban.com'
    # Use a dedicated loop instead of asyncio.get_event_loop(), which is
    # deprecated when no loop is running.
    loop = asyncio.new_event_loop()
    try:
        category_html = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')
        categories = category_html.select('.hot-tags-col5 .clearfix')
        for category in categories:
            book_dict = {}
            category_name = category.select('.tag_title')[0].text.strip()
            book_dict['category'] = category_name
            for book_url in category.select('a.tag'):
                # The trailing "more" link ends the list of real tags.
                if '更多' in book_url.text:
                    break
                tag = book_url.text
                book_dict['tag'] = tag
                book_url = url + book_url['href']
                book_html = BeautifulSoup(requests.get(book_url, headers=headers).text, 'lxml')
                books = book_html.select('.info h2 a')
                if not books:
                    # asyncio.wait() raises ValueError on an empty collection.
                    continue
                # Wrap each coroutine in a Task explicitly: asyncio.wait() no
                # longer accepts bare coroutine objects on Python 3.11+.
                tasks = [loop.create_task(get_book_info(book, book_dict)) for book in books]
                done, _ = loop.run_until_complete(asyncio.wait(tasks))
                # asyncio.wait() does not re-raise task errors; report them so
                # a failed book is visible instead of silently disappearing.
                for task in done:
                    error = task.exception()
                    if error is not None:
                        print('抓取失败: {!r}'.format(error))
    finally:
        # Always release the loop, even when a request or parse step raised.
        loop.close()
async def get_book_info(book, book_dict):
    """Fetch one book's detail page, extract its fields and save them.

    Args:
        book: Link element of the book; its ``href`` is the detail page URL.
        book_dict: Dict that already holds the ``'category'`` and ``'tag'``
            of the listing the book was found in. It is copied, not modified,
            because the same dict is handed to every concurrently running
            task of a tag page.

    The extracted name, rating, author and price are merged with the copied
    dict and passed to ``save_to_sql``. A page without a title element is
    reported and skipped. A missing rating, author or price is stored as
    '无' instead of raising IndexError.
    """
    missing = '无'
    book_detail_url = book['href']
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(book_detail_url) as html:
            page_text = await html.text()
    book_detail_html = BeautifulSoup(page_text, 'lxml')

    name_nodes = book_detail_html.select('span[property]')
    if not name_nodes:
        # Without a title there is nothing meaningful to store.
        print('解析失败, 跳过: {}'.format(book_detail_url))
        return
    name = name_nodes[0].text

    # Some books have no rating: the element may be absent or blank.
    rate_nodes = book_detail_html.select('.rating_num')
    rate = rate_nodes[0].text.strip() if rate_nodes else ''
    if not rate:
        rate = missing

    author_nodes = book_detail_html.select('#info a')
    if author_nodes:
        author = author_nodes[0].text.replace(' ', '').replace('\n', '')
    else:
        author = missing

    pattern = re.compile(r'[定][价][:]</span>(.*?)<br/>', re.S)
    prices = re.findall(pattern, str(book_detail_html))
    price = prices[0].replace(' ', '') if prices else ''
    if not price:
        price = missing
    elif re.match(r'^.*?(\D)$', price) is None:
        # Some prices come without a unit; append one when it is absent.
        price = price + '元' if '元' not in price else price

    record = dict(book_dict)
    record['price'] = price
    record['name'] = name
    record['rate'] = rate
    record['author'] = author
    save_to_sql(record)
def save_to_sql(book_dict):
    """Store one crawled book and link it to its category and tag.

    Args:
        book_dict: Dict with the keys ``'name'``, ``'author'``, ``'rate'``,
            ``'price'``, ``'category'`` and ``'tag'``.

    The book, category and tag are each looked up by name/title and created
    when absent. The book is then linked to the category and to the tag
    unless that link already exists. Everything is written in a single
    transaction. On a database error the transaction is rolled back and the
    failure is printed; the error is not re-raised. The session is always
    closed.
    """
    print('正在保存: {}'.format(book_dict['name']))
    sqlsession = sessionmaker(engine)()
    try:
        book = sqlsession.query(Book).filter_by(name=book_dict['name']).first()
        category = sqlsession.query(Category).filter_by(category=book_dict['category']).first()
        tag = sqlsession.query(Tag).filter_by(tag=book_dict['tag']).first()

        if category is None:
            category = Category(category=book_dict['category'])
            sqlsession.add(category)
        if book is None:
            book = Book(name=book_dict['name'], author=book_dict['author'], rate=book_dict['rate'],
                        price=book_dict['price'])
            sqlsession.add(book)
        # Flush so newly added rows receive their primary keys; the tag below
        # needs category.id.
        sqlsession.flush()
        if tag is None:
            tag = Tag(tag=book_dict['tag'])
            tag.category_id = category.id
            sqlsession.add(tag)

        # Link through the book-side collections and skip links that already
        # exist, so re-crawling the same book under another tag or category
        # adds the missing association instead of doing nothing.
        if category not in book.category_id:
            book.category_id.append(category)
        if tag not in book.tag_id:
            book.tag_id.append(tag)
        sqlsession.commit()
    except SQLAlchemyError as exc:
        print('保存失败: {}: {}'.format(book_dict['name'], exc))
        sqlsession.rollback()
    finally:
        # Return the connection to the pool on every path.
        sqlsession.close()
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    get_book_html()