Getting Started with Python Web Scraping: A Worked Example

Target URL: https://www.qiushibaike.com/8hr/page/

Data persistence source code (saved as utils2.py, which the scraper below imports):

"""
Version 1.1.0
Author lkk
Email lkk199404@163.com
date 2018-11-22 21:57
DESC SQLAlchemy-based storage
"""

from sqlalchemy import Column, String, create_engine, Integer, Text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
import pymysql

# Register pymysql as the MySQLdb driver
pymysql.install_as_MySQLdb()

Base = declarative_base()

# echo=True prints every generated SQL statement, which helps while debugging
eng = create_engine('mysql+pymysql://root:123456@localhost:3306/data', echo=True)
DBSession = sessionmaker(bind=eng)
session = DBSession()


class Joke(Base):
    __tablename__ = 'jokes'
    id = Column(Integer, primary_key=True, autoincrement=True)
    author = Column(String(20))   # post author
    content = Column(Text)        # joke text
    number = Column(String(50))   # vote / comment statistics text


# Create all tables derived from Base
def create_all(eng):
    Base.metadata.create_all(eng)


# Drop all tables in the database
def drop_all(eng):
    Base.metadata.drop_all(eng)


if __name__ == '__main__':
    create_all(eng)
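
Before running the scraper it is worth confirming that the model and the connection actually work. The snippet below is a minimal sketch, assuming the "data" database already exists and the credentials passed to create_engine above are correct; it reuses Joke, session, create_all and eng from utils2.py, inserts a throwaway row, reads it back, then deletes it.

from utils2 import Joke, session, create_all, eng

create_all(eng)  # make sure the jokes table exists

# Insert a test row and read it back to verify the round trip
session.add(Joke(author='test', content='hello', number='0 votes'))
session.commit()
row = session.query(Joke).filter_by(author='test').first()
print(row.id, row.author, row.content, row.number)

# Clean up the test row
session.query(Joke).filter_by(author='test').delete()
session.commit()
session.close()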



Source code for extracting the target data (the scraper):

"""
Version 1.1.0
Author lkk
Email lkk199404@163.com
date 2018-11-22 19:55
DESC Qiushibaike data collection
"""
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from utils2 import Joke, session


def get_html(url):
    # Send the request with a random User-Agent to reduce the chance of being blocked
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random
    }
    html = requests.get(url, headers=headers).text
    return html


def get_info(html):
    soup = BeautifulSoup(html, 'lxml')
    user_list = soup.select('h2')                                               # authors
    content_list = soup.select('div[class="content"] > span:nth-of-type(1)')    # joke text
    joke_list = soup.select('.stats-vote')                                      # vote stats
    # next_url = soup.select('.pagination li > a')[-1].attrs['href']  # next-page link (unused here)
    for i in range(len(user_list)):
        username = user_list[i].text.strip()
        content = content_list[i].text.strip()
        joke = joke_list[i].text.strip()
        print(username, content, joke)

        try:
            # Persist each record as a row in the jokes table
            info = Joke(author=username, content=content, number=joke)
            session.add(info)
            session.commit()
        except BaseException as e:
            session.rollback()  # keep the session usable for the next record
            print(e)


# Crawl pages 1-13 of the hot list and store every joke
next_page = 'https://www.qiushibaike.com/8hr/page/'
for j in range(1, 14):
    next_url = next_page + str(j)
    print(next_url)
    html = get_html(next_url)
    get_info(html)
session.close()
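
The loop above hard-codes 13 pages. A slightly more robust variant, sketched below, follows the pagination link hinted at by the commented-out selector in get_info and pauses briefly between requests; the '.pagination li > a' selector and the 13-page cap are assumptions based on the page layout at the time of writing, not something the site guarantees.

import time
from urllib.parse import urljoin

def crawl_all(start_url='https://www.qiushibaike.com/8hr/page/1/', max_pages=13):
    url = start_url
    for _ in range(max_pages):
        html = get_html(url)
        get_info(html)
        # Follow the last link in the pagination bar (the "next page" arrow), if any
        soup = BeautifulSoup(html, 'lxml')
        links = soup.select('.pagination li > a')
        if not links:
            break
        url = urljoin(url, links[-1].attrs['href'])
        time.sleep(1)  # small delay between requests to avoid hammering the site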

 
