Asynchronously Scraping Douban Books

Asynchronously crawl Douban Books and store the results in MySQL: the script walks the hot tags on book.douban.com, fetches each book's detail page concurrently with aiohttp, parses it with BeautifulSoup, and saves books, categories and tags through SQLAlchemy.
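The full script is listed below. Its core pattern is simple: collect the detail-page URLs for one tag, then fetch them concurrently with aiohttp and wait for all of them at once. A minimal, standalone sketch of just that pattern (the two tag URLs are placeholders, not taken from the script):

import asyncio
import aiohttp

async def fetch(session, url):
    # download one page and return its HTML
    async with session.get(url) as resp:
        return await resp.text()

async def demo():
    urls = ['https://book.douban.com/tag/小说', 'https://book.douban.com/tag/历史']  # placeholder tag pages
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, url) for url in urls))
    print([len(page) for page in pages])

if __name__ == '__main__':
    asyncio.run(demo())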

from bs4 import BeautifulSoup
import aiohttp
import asyncio
import requests
from sqlalchemy import create_engine, Column, String, Integer, ForeignKey, Table
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base
import re
from sqlalchemy.exc import SQLAlchemyError

base = declarative_base()
engine = create_engine(
    "mysql+pymysql://root:1117@127.0.0.1:3306/douban",
    max_overflow=5,
    pool_size=10,
    echo=False
)

book_category = Table('book_category', base.metadata,
                      Column('book_id', ForeignKey('book.id'), primary_key=True),
                      Column('category_id', ForeignKey('category.id'), primary_key=True))
book_tag = Table('book_tag', base.metadata,
                 Column('book_id', ForeignKey('book.id'), primary_key=True),
                 Column('tag_id', ForeignKey('tag.id'), primary_key=True))


class Book(base):
    __tablename__ = 'book'
    id = Column(Integer(), primary_key=True, autoincrement=True)
    name = Column(String(50), nullable=True)
    author = Column(String(50), nullable=True)
    price = Column(String(50), nullable=True)  # e.g. '88.00元(上下册)'
    rate = Column(String(10), nullable=True)
    tag_id = relationship('Tag', secondary='book_tag', backref='book', cascade='all,delete')
    category_id = relationship('Category', secondary='book_category', backref='book', cascade='all,delete')


class Tag(base):
    __tablename__ = 'tag'
    id = Column(Integer(), primary_key=True, autoincrement=True)
    tag = Column(String(50), nullable=True)
    category_id = Column(Integer(), ForeignKey('category.id'), nullable=True)


class Category(base):
    __tablename__ = 'category'
    id = Column(Integer(), primary_key=True, autoincrement=True)
    category = Column(String(50), nullable=True)
    tag_id = relationship('Tag', backref='category')


base.metadata.create_all(engine)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
    'Cookie': 'bid="o+UMKZtHh1I"; ll="108296"; _vwo_uuid_v2=DA1862009F71EE5A48FFC32AA2D6E6E10|5f57ac0a3af4bd4695587bc3083b47f1; douban-fav-remind=1; __gads=ID=ed825193597303ee:T=1581398295:S=ALNI_MZ448x2QXNURpII65kTgIwyE6lTTw; gr_user_id=dd1a6251-159c-4547-9bc1-6f8b29284e75; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=47a3f7a4-f437-48ea-92fc-f87c3b4617b0; gr_cs1_47a3f7a4-f437-48ea-92fc-f87c3b4617b0=user_id%3A0; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1581588254%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DQFIAhWZsTKO_c1bRt7CyIYoujWFD8WL3yIlzkIjoKA-at_-XynwHOHJtCCgJxpbW%26wd%3D%26eqid%3Df4c3b3370009b995000000065e451f1b%22%5D; _pk_id.100001.3ac3=d8a799aa913a735e.1581588254.1.1581588254.1581588254.; _pk_ses.100001.3ac3=*; ap_v=0,6.0; __utma=30149280.545170205.1567607013.1581398299.1581588254.3; __utmc=30149280; __utmz=30149280.1581588254.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmb=30149280.1.10.1581588254; __utma=81379588.1042499984.1581588254.1581588254.1581588254.1; __utmc=81379588; __utmz=81379588.1581588254.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmb=81379588.1.10.1581588254; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_47a3f7a4-f437-48ea-92fc-f87c3b4617b0=true; __yadk_uid=KrdfNh0zbGFGKM10NkKDwMlnBPgbWIiw'
}


def get_book_html():
    url = 'https://book.douban.com'
    loop = asyncio.get_event_loop()
    category_html = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')
    categories = category_html.select('.hot-tags-col5 .clearfix')
    for category in categories:
        book_dict = {}
        category_name = category.select('.tag_title')[0].text.strip()
        book_dict['category'] = category_name
        for book_url in category.select('a.tag'):
            if '更多' in book_url.text:
                # the trailing link in each tag group is a "更多" (more) link, not a tag
                break
            tag = book_url.text
            book_dict['tag'] = tag
            book_url = url + book_url['href']
            book_html = BeautifulSoup(requests.get(book_url, headers=headers).text, 'lxml')
            books = book_html.select('.info h2 a')
            tasks = [get_book_info(book, book_dict) for book in books]
            # gather() also copes with an empty task list and with Python 3.11+,
            # where asyncio.wait no longer accepts bare coroutines
            loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()


async def get_book_info(book, book_dict):
    book_detail_url = book['href']
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(book_detail_url) as html:
            book_detail_html = await html.text()
    book_detail_html = BeautifulSoup(book_detail_html, 'lxml')
    name = book_detail_html.select('span[property]')[0].text
    try:
        rate = book_detail_html.select('.rating_num')[0].text
    except IndexError:
        # some books have no rating at all
        rate = '无'
    author = book_detail_html.select('#info a')[0].text.replace(' ', '').replace('\n', '')
    pattern = re.compile(r'定价:</span>(.*?)<br/>', re.S)
    try:
        book_dict['price'] = re.findall(pattern, str(book_detail_html))[0].replace(' ', '')
    except IndexError:
        # some detail pages have no 定价 (list price) field
        book_dict['price'] = '无'
    # some prices come without a unit, e.g. '88.00' instead of '88.00元'
    if book_dict['price'] and book_dict['price'][-1].isdigit() and '元' not in book_dict['price']:
        book_dict['price'] += '元'
    book_dict['name'] = name
    book_dict['rate'] = rate.strip()
    book_dict['author'] = author
    save_to_sql(book_dict)


def save_to_sql(book_dict):
    # Create whichever of book / category / tag does not exist yet and link them;
    # combinations not covered by the branches below are silently skipped.
    print('Saving: {}'.format(book_dict['name']))
    sqlsession = sessionmaker(engine)()
    book = sqlsession.query(Book).filter_by(name=book_dict['name']).first()
    category = sqlsession.query(Category).filter_by(category=book_dict['category']).first()
    tag = sqlsession.query(Tag).filter_by(tag=book_dict['tag']).first()
    if book is None and category is None and tag is None:
        try:
            book = Book(name=book_dict['name'], author=book_dict['author'], rate=book_dict['rate'],
                        price=book_dict['price'])
            category = Category(category=book_dict['category'])
            category.book.append(book)
            sqlsession.add_all([book, category])
            sqlsession.commit()
            tag = Tag(tag=book_dict['tag'])
            tag.category_id = category.id
            tag.book.append(book)
            sqlsession.add(tag)
            sqlsession.commit()
        except SQLAlchemyError:
            print('failed to save (1)')
            sqlsession.rollback()
    elif book is None and category is not None and tag is not None:
        try:
            book = Book(name=book_dict['name'], author=book_dict['author'], rate=book_dict['rate'],
                        price=book_dict['price'])
            category.book.append(book)
            tag.book.append(book)
            sqlsession.add(book)
            sqlsession.commit()
        except SQLAlchemyError:
            print('failed to save (2)')
            sqlsession.rollback()
    elif book is None and category is not None and tag is None:
        try:
            book = Book(name=book_dict['name'], author=book_dict['author'], rate=book_dict['rate'],
                        price=book_dict['price'])
            category.book.append(book)
            sqlsession.add(book)
            sqlsession.commit()
            tag = Tag(tag=book_dict['tag'])
            tag.category_id = category.id
            tag.book.append(book)
            sqlsession.add(tag)
            sqlsession.commit()
        except SQLAlchemyError:
            print('failed to save (3)')
            sqlsession.rollback()
    elif book is not None and category is not None and tag is None:
        try:
            tag = Tag(tag=book_dict['tag'])
            tag.category_id = category.id
            tag.book.append(book)
            sqlsession.add(tag)
            sqlsession.commit()
        except SQLAlchemyError:
            print('failed to save (4)')
            sqlsession.rollback()
    # return the connection to the pool
    sqlsession.close()


if __name__ == '__main__':
    get_book_html()
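
After a run, the stored rows can be checked with a quick query against the same models. A minimal sketch, assuming the script above has been saved as douban_spider.py (the module name is only for illustration):

from sqlalchemy.orm import sessionmaker
from douban_spider import engine, Book  # hypothetical module name for the script above

session = sessionmaker(engine)()
# print every stored book together with the tags collected for it
for book in session.query(Book).all():
    tags = ', '.join(t.tag for t in book.tag_id)  # 'tag_id' is the many-to-many relationship on Book
    print(book.name, book.rate, book.price, tags)
session.close()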
