An example of using Redis as a message queue:
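The core pattern: producers LPUSH work items onto a Redis list and consumers pop them off the other end, so the list behaves as a FIFO queue shared across threads or even machines. A minimal sketch of just that pattern, assuming a local Redis on the default port (the key name demo_queue is made up for illustration):

import redis

r = redis.Redis(host='localhost', port=6379, decode_responses=True)

# producer: push items onto the left end of the list
for i in range(3):
    r.lpush('demo_queue', 'task-%d' % i)

# consumer: BRPOP blocks until an item arrives (or the timeout expires),
# so no busy-wait loop is needed
while True:
    popped = r.brpop('demo_queue', timeout=5)
    if popped is None:
        break  # queue drained
    _key, task = popped
    print('got', task)

The full crawler below chains three such queues together: url_list (listing pages) -> url_detail_list (post URLs) -> data_list (scraped items), with a pool of worker threads consuming each stage: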
# -*- coding:utf-8 -*-
# @Author: YOYO
# @Time: 2018/10/11 15:38
# @Description: multi-threaded crawler using Redis lists as work queues
import json
import threading
import time
import traceback

import redis
import requests
from lxml import etree
class Yhuo(object):
    def __init__(self):
        # decode_responses=True makes redis-py return str instead of bytes
        pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
        self.redis = redis.Redis(connection_pool=pool)
        # three Redis lists chained as queues:
        # listing pages -> detail-page URLs -> scraped items
        self.url_list = 'url_list'
        self.url_detail_list = 'url_detail_list'
        self.data_list = 'data_list'
        self.s = requests.session()
    def login(self):
        url = "https://yaohuo.me/waplogin.aspx"
        payload = "logname=13438&logpass=xxxx&savesid=0&action=login&classid=0&siteid=1000&sid=-3-0-0-0-0&backurl=wapindex.aspx%3Fsiteid%3D1000&g=%E7%99%BB+%E5%BD%95"
        headers = {
            'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'accept-encoding': "gzip, deflate",
            'accept-language': "zh-CN,zh;q=0.9",
            'cache-control': "no-cache",
            'content-type': "application/x-www-form-urlencoded",
            'origin': "https://yaohuo.me",
            'referer': "https://yaohuo.me/waplogin.aspx",
            'upgrade-insecure-requests': "1",
            'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
        }
        response = self.s.post(url, data=payload, headers=headers)
        # the site answers in Chinese; '登录成功' means "login successful"
        if '登录成功' in response.text:
            print('login successful')
    def add_list_urls(self):
        """
        Push the listing pages to crawl onto the Redis queue.
        :return: None
        """
        next_page = "https://yaohuo.me/bbs/book_list.aspx?action=class&siteid=1000&classid=203&getTotal=28023&page="
        # batch the LPUSHes into a single round trip with a pipeline
        pipe = self.redis.pipeline()
        for i in range(1, 10):
            pipe.lpush(self.url_list, next_page + str(i))
            print(next_page + str(i))
        # execute the pipeline
        pipe.execute()
    def get_url_detail(self):
        """
        Pop listing pages off url_list and push each post's detail URL
        onto url_detail_list.
        :return: None
        """
        print("get_url_detail")
        while True:
            url = self.redis.rpop(self.url_list)
            if not url:
                # queue is empty; back off instead of spinning
                time.sleep(1)
                continue
            headers = {
                'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                'accept-encoding': "gzip, deflate",
                'accept-language': "zh-CN,zh;q=0.9",
                'upgrade-insecure-requests': "1",
                'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
                'cache-control': "no-cache",
            }
            try:
                response = self.s.get(url, headers=headers)
            except Exception:
                # put the URL back on the queue so a worker can retry it
                self.redis.lpush(self.url_list, url)
                continue
            # extract the detail-page URLs
            res = etree.HTML(response.text)
            try:
                urls_list = res.xpath('//div[contains(@class,"line")]/a[1]/@href')
            except Exception:
                print(url)
                continue
            pipe = self.redis.pipeline()
            for detail_url in urls_list:
                print(detail_url)
                pipe.lpush(self.url_detail_list, detail_url)
            pipe.execute()
    def parse_url(self):
        print("parse_url")
        while True:
            url = self.redis.rpop(self.url_detail_list)
            if not url:
                time.sleep(1)
                continue
            # the extracted hrefs are site-relative
            url = 'https://yaohuo.me/' + url
            headers = {
                'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                'accept-encoding': "gzip, deflate",
                'accept-language': "zh-CN,zh;q=0.9",
                'cache-control': "no-cache",
                'pragma': "no-cache",
                'upgrade-insecure-requests': "1",
                'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
            }
            try:
                response = self.s.get(url, headers=headers)
            except Exception:
                print(traceback.format_exc())
                continue
            res = etree.HTML(response.text)
            item = {}
            try:
                item['content'] = "".join(res.xpath('//div[@class="content"]//text()'))
            except Exception:
                print(traceback.format_exc())
                continue
            if item:
                print(item)
                data = json.dumps(item)
                self.redis.lpush(self.data_list, data)
    def save_data(self):
        while True:
            data = self.redis.rpop(self.data_list)
            if not data:
                time.sleep(1)
                continue
            item = json.loads(data)
            # append each post's text to the output file
            with open('yhuo.txt', 'a', encoding='utf-8') as f:
                f.write(item['content'] + '\n')
    def run(self):
        self.login()
        thread_list = []
        # one producer fills url_list ...
        t = threading.Thread(target=self.add_list_urls)
        thread_list.append(t)
        # ... and each consuming stage gets a pool of 20 worker threads
        for i in range(20):
            t = threading.Thread(target=self.get_url_detail)
            thread_list.append(t)
        for i in range(20):
            t = threading.Thread(target=self.parse_url)
            thread_list.append(t)
        for i in range(20):
            t = threading.Thread(target=self.save_data)
            thread_list.append(t)
        for t in thread_list:
            t.start()
if __name__ == '__main__':
    yaohuo = Yhuo()
    yaohuo.run()
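Because the queues live in Redis rather than in process memory, the stages are decoupled: workers can be restarted, or run on separate machines, without losing the backlog. A small sketch for watching the depth of each queue from another process (same key names as above):

import redis

r = redis.Redis(host='localhost', port=6379, decode_responses=True)
for key in ('url_list', 'url_detail_list', 'data_list'):
    print(key, r.llen(key))  # LLEN reports how many items are waiting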