from urllib.parse import urlencode
import requests
import pymongo
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq
client = pymongo.MongoClient('localhost')
db = client['weixin']
# Copy the Cookie and User-Agent from your own logged-in browser session;
# the session values below are from the original capture and will have expired
headers = {
    'Cookie': 'SUV=00206325780448E25A88D8FBD97B9189; SUID=E24804781E20910A000000005A89684C; ABTEST=0|1518954617|v1; SNUID=3E0D2CA3D7D2B2ACF9B54509D8060070; IPLOC=SG; weixinIndexVisited=1; sct=1; JSESSIONID=aaaGZzlqTH0rmhrmTQRfw; ppinf=5|1518954749|1520164349|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTozNjolRTUlQTUlOTQlRTglODUlQkUlRTclOUElODQlRTUlQkYlODN8Y3J0OjEwOjE1MTg5NTQ3NDl8cmVmbmljazozNjolRTUlQTUlOTQlRTglODUlQkUlRTclOUElODQlRTUlQkYlODN8dXNlcmlkOjQ0Om85dDJsdUhvdTZsRFdMTlhZRC11TV9mUTJFSHNAd2VpeGluLnNvaHUuY29tfA; pprdig=UruVq0deIdZCdmhmImyPUxRWiPQ9YijgmQNcLOdNUn3ubOGp6SKT4j7DZobkIHDqU2n9ziJmFkLQW4jDvZtE4h6bLtgXhZnptrFeCYWY_m4eYXzTuzdDbw-_d8zX4aK-pVeUNMBxx7dCCQD-SZkaCLtb7htDIijdBJovoFa4iDg; sgid=14-33645363-AVqJaP0PxEIhnXsrvW8mI1o; ppmdig=15189547500000004e4b276a42c3201b81b3093b348c99c0',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
}
# Search keyword ('风景' means 'scenery')
keyword = '风景'
# Start without a proxy
proxy = None
# Local proxy pool service that hands back a random proxy
proxy_url = 'http://127.0.0.1:5000/random'
# Maximum number of retries per URL
maxcount = 3
# Fetch a proxy IP from the proxy pool
def get_proxy():
    try:
        response = requests.get(proxy_url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Failed to get a proxy')
        return None
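# For illustration, a call to the pool (assuming it returns a bare
# "host:port" string; the value below is a made-up example):
# requests.get('http://127.0.0.1:5000/random').text  ->  '123.56.78.9:8080'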
# Fetch a page, switching to a new proxy whenever Sogou blocks the current IP
def get_page(urls, count=1):
    print('crawling', urls)
    print('trying count', count)
    global proxy
    if count >= maxcount:
        print('Tried too many counts')
        return None
    try:
        # If a proxy is available, route the request through it;
        # allow_redirects=False keeps the anti-spider 302 visible instead of following it
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(urls, headers=headers, proxies=proxies, allow_redirects=False)
        else:
            response = requests.get(urls, headers=headers, allow_redirects=False)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # A 302 redirect means the current IP has been flagged, so fetch a new proxy
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Got proxy', proxy)
                # Got a proxy; re-run this request with it
                return get_page(urls)
            else:
                print('Failed to get a proxy')
                return None
    except ConnectionError as e:
        print('Error', e.args)
        proxy = get_proxy()
        # On a connection error, pass the incremented retry count along
        count += 1
        return get_page(urls, count)
def get_index(keyword, page):
    # type=2 restricts the Sogou search to WeChat articles
    data = {
        'page': page,
        'query': keyword,
        'type': 2
    }
    value = urlencode(data)
    url = 'http://weixin.sogou.com/weixin?'
    urls = url + value
    html = get_page(urls)
    return html
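# For example, get_index('风景', 1) requests:
# http://weixin.sogou.com/weixin?page=1&query=%E9%A3%8E%E6%99%AF&type=2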
def parse_index(html):
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')
def get_detail(article_url):
    try:
        response = requests.get(article_url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None
def parse_detail(html):
    doc = pq(html)
    title = doc('.rich_media_title').text()
    content = doc('.rich_media_content').text()
    date = doc('#post-date').text()
    name = doc('#post-user').text()
    return {
        'title': title,
        'content': content,
        'date': date,
        'name': name,
    }
def save_to_mongo(data):
    # Upsert keyed on the title so re-crawled articles update in place
    if db['article'].update_one({'title': data['title']}, {'$set': data}, upsert=True):
        print('Saved to MongoDB', data['title'])
    else:
        print('Failed to save', data['title'])
def main():
    for page in range(1, 101):
        html = get_index(keyword, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)

if __name__ == '__main__':
    main()
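The crawler assumes a local proxy pool service at http://127.0.0.1:5000/random that returns one proxy per request as a plain "host:port" string. A minimal sketch of what such an endpoint could look like, assuming Flask and a hard-coded proxy list (a real pool would harvest and validate proxies continuously; the addresses below are placeholders):

from flask import Flask
import random

app = Flask(__name__)
# Hypothetical placeholder proxies; a real pool is populated dynamically
PROXIES = ['123.56.78.9:8080', '111.22.33.44:3128']

@app.route('/random')
def random_proxy():
    # Return a bare "host:port" string, matching what get_proxy() expects
    return random.choice(PROXIES)

if __name__ == '__main__':
    app.run(port=5000)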