Target URL: https://www.qiushibaike.com/8hr/page/ (the page number, 1 through 13, is appended to this base).
Source code for the data-persistence layer (saved as utils2.py, since the scraper below imports it):
"""
Version 1.1.0
Author lkk
Email lkk199404@163.com
date 2018-11-22 21:57
DESC sqlalchemy存储
"""
from sqlalchemy import Column, String, create_engine, Integer, Text
from sqlalchemy.orm import sessionmaker
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
eng = create_engine('mysql+mysqlconnector://root:123456@localhost:3306/data', echo=True)
eng.echo = True
DBSession = sessionmaker(bind=eng)
session = DBSession()
class Joke(Base):
__tablename__ = 'jokes'
id = Column(Integer, primary_key=True, autoincrement=True)
author = Column(String(20))
content = Column(Text)
number = Column(String(50))
# 创建从Base派生的所有表
def create_all(eng):
Base.metadata.create_all(eng)
# 删除DB中所有的表
def drop_all(eng):
Base.metadata.drop_all(eng)
if __name__ == '__main__':
create_all(eng)
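
Once the table exists, a quick smoke test is to query it back through the same session. A minimal sketch, assuming the module above is saved as utils2.py (the scraper below imports it under that name) and the data database already exists on the MySQL server:

from utils2 import Joke, session

# Print the five most recently stored jokes.
for row in session.query(Joke).order_by(Joke.id.desc()).limit(5):
    print(row.id, row.author, row.number)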
Source code for extracting the target data:
"""
Version 1.1.0
Author lkk
Email lkk199404@163.com
date 2018-11-22 19:55
DESC 糗事百科数据采集
"""
from urllib import request
from bs4 import BeautifulSoup
import requests, chardet
from fake_useragent import UserAgent
import utils2
from utils2 import Joke, session
def get_html(url):
ua = UserAgent()
headers = {
'User-agent': ua.random
}
html = requests.get(url, headers=headers).text
return html
def get_info(html):
soup = BeautifulSoup(html, 'lxml')
user_list = (soup.select('h2'))
content_list = (soup.select('div[class="content"] > span:nth-of-type(1)'))
joke_list = (soup.select('.stats-vote'))
# next_url = (find_all('.pagination li > a')[-1].attrs('href'))
# print()
for i in range(len(user_list)):
username = user_list[i].text.strip()
content = content_list[i].text.strip()
joke = joke_list[i].text.strip()
print(username, content, joke)
try:
info = Joke(author=username, content=content, number=joke) # 数据持久化处理
session.add(info)
session.commit()
except BaseException as e:
print(e)
finally:
session.close()
next_page = 'https://www.qiushibaike.com/8hr/page/'
for j in range(1, 14):
next_url = next_page + str(j)
print(next_url)
info = get_html(next_url)
cont = get_info(info)
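
The commented-out .pagination selector in get_info points at an alternative to the hard-coded 1-13 range: follow the page's own "next" link until it disappears. A hedged sketch, reusing get_html and get_info from above and assuming the site renders a .pagination list whose last <a> carries the next page's relative href (that selector comes from the original comment and may not match the current markup):

def get_next_url(html):
    # Return the absolute URL of the next page, or None if no link is found.
    # Assumption: on the last page the selector matches nothing; if its last
    # <a> is a "previous" link instead, this loop would need an extra check.
    soup = BeautifulSoup(html, 'lxml')
    links = soup.select('.pagination li > a')
    if links:
        return 'https://www.qiushibaike.com' + links[-1].attrs['href']
    return None

url = 'https://www.qiushibaike.com/8hr/page/1'
while url:
    html = get_html(url)
    get_info(html)
    url = get_next_url(html)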