6.3 Ajax结果提取 (extracting results from Ajax responses)
# Crawl a Weibo user's personal timeline via the m.weibo.cn Ajax API.
from urllib.parse import urlencode
from pyquery import PyQuery as pq
from pymongo import MongoClient
# The MongoDB server must already be running (default host/port).
client = MongoClient()
db = client['weibo']
collection = db['weibo']
import requests
# Base endpoint of the mobile-site container API; query params are appended.
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# Headers mimic a browser Ajax request so the API answers normally.
headers = {
'Host': 'm.weibo.cn',
'Referer': 'https://m.weibo.cn/u/2830678474',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/69.0.3497.100 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
def get_page(page, timeout=10):
    """Fetch one page of the user's Weibo timeline from the mobile Ajax API.

    Args:
        page: 1-based page number forwarded to the API.
        timeout: seconds before the HTTP request is aborted. New parameter
            (default 10) — the original call had no timeout and could hang
            the crawler forever on a stalled connection.

    Returns:
        The decoded JSON response as a dict, or None on a non-200 status
        or any network error.
    """
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        resp = requests.get(url=url, headers=headers, timeout=timeout)
        if resp.status_code == 200:
            return resp.json()
        return None  # explicit: non-200 yields no data
    except requests.RequestException as e:
        # RequestException covers ConnectionError *and* Timeout,
        # which the original narrow clause would have let escape.
        print('Error:', e.args)
        return None
def parse_page(json):
    """Yield simplified weibo records from one page of API JSON.

    Args:
        json: the dict returned by get_page(), or None if the fetch failed.

    Yields:
        dicts with keys 'id', 'text' (HTML stripped), 'attitudes',
        'comments' and 'reposts'.
    """
    if not json:
        return
    # Guard against malformed payloads: the original chained
    # json.get('data').get('cards') and raised AttributeError
    # whenever 'data' was absent.
    items = (json.get('data') or {}).get('cards') or []
    for item in items:
        item = item.get('mblog')
        # Some cards (ads, follow suggestions) carry no 'mblog'; skip them.
        if item is None:
            continue
        weibo = {}
        weibo['id'] = item.get('id')
        # pyquery strips the HTML markup from the post body.
        weibo['text'] = pq(item.get('text')).text()
        weibo['attitudes'] = item.get('attitudes_count')
        weibo['comments'] = item.get('comments_count')
        weibo['reposts'] = item.get('reposts_count')
        yield weibo
def save_2_mongo(result):
    """Insert one weibo record (a dict) into the 'weibo' collection.

    Uses insert_one(): Collection.insert() was deprecated in PyMongo 3
    and removed in PyMongo 4, so the original call fails on modern
    drivers. insert_one() also mutates `result` by adding an '_id' key,
    exactly as insert() did.
    """
    if collection.insert_one(result).acknowledged:
        print('save to mongo')
if __name__ == '__main__':
    # Crawl the first ten timeline pages and persist every parsed record.
    for page_no in range(1, 11):
        page_json = get_page(page_no)
        for record in parse_page(page_json):
            save_2_mongo(record)
6.4 分析ajax,爬取今日头条斋藤飞鸟美图 (analyzing Ajax requests to crawl Saito Asuka photos from Toutiao)
import requests
from urllib.parse import urlencode
import os
from hashlib import md5
from multiprocessing.pool import Pool
# Inclusive range of result pages to fetch; each page holds 20 items,
# so the request offset is the page index multiplied by 20.
GROUP_START = 1
GROUP_END = 20
def get_page(offset, timeout=10):
    """Fetch one page of Toutiao gallery search results as JSON.

    Args:
        offset: pagination offset (multiples of 20).
        timeout: seconds before the request is aborted. New parameter
            (default 10) — without it a stalled connection would hang a
            pool worker forever.

    Returns:
        The decoded JSON dict, or None on a non-200 status or network error.
    """
    para = {
        'offset': offset,
        'format': 'json',
        'keyword': '斋藤飞鸟',
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,  # 3 selects the "gallery" tab (1 would be the default "all" tab)
        'from': 'gallery'  # 'gallery' rather than 'search_tab'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(para)
    try:
        # The endpoint answers without browser headers; the original built
        # a `headers` dict it never used, which has been removed here.
        resp = requests.get(url=url, timeout=timeout)
        if resp.status_code == 200:
            return resp.json()  # the Ajax endpoint returns JSON
        return None  # explicit: non-200 yields no data
    except requests.RequestException:
        # Covers ConnectionError and the new Timeout case alike.
        return None
def get_image(json):
    """Yield {'image', 'title'} dicts for each search-result item.

    Args:
        json: the dict returned by get_page(), or None if the fetch failed.

    Yields:
        dicts holding the large cover-image URL and the article title.
        The per-article image list of the book's original example is no
        longer served, so only the large cover image is extracted here
        as a demonstration.
    """
    # get_page() returns None on failure; the original crashed here with
    # AttributeError ('NoneType' object has no attribute 'get').
    if not json:
        return
    for item in json.get('data') or []:
        yield {
            'image': item.get('large_image_url'),
            'title': item.get('title')
        }
def save_image(item):
    """Download one cover image into a directory named after its title.

    Files are named by the MD5 of their content, so re-running the
    crawler never stores the same image twice. Items lacking a title or
    an image URL are skipped: the original passed a missing title (None)
    straight into os.makedirs(), which raises TypeError.
    """
    title = item.get('title')
    image_url = item.get('image')
    if not title or not image_url:
        return
    # Replace characters that are illegal in Windows file names; a title
    # containing '/' would otherwise create unintended sub-directories.
    dir_name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in title)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    try:
        resp = requests.get(image_url, timeout=10)
        if resp.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(dir_name, md5(resp.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
            else:
                print('Already Download ', file_path)
    except requests.RequestException:
        print('Failed to Save Image.')
def main(offset):
    """Pool worker: fetch one result page and save every cover image in it.

    Args:
        offset: pagination offset forwarded to get_page().
    """
    json = get_page(offset)
    # get_page() returns None on any fetch failure; the original passed
    # that None on and crashed inside get_image() with AttributeError.
    if json is None:
        return
    for item in get_image(json):
        print(item)
        save_image(item)
if __name__ == '__main__':
    # One offset per page: 20, 40, ..., 400.
    offsets = [group * 20 for group in range(GROUP_START, GROUP_END + 1)]
    print(offsets)
    worker_pool = Pool()
    worker_pool.map(main, offsets)
    worker_pool.close()
    worker_pool.join()  # block until every worker process has finished