爬取雪球网

最新推荐文章于 2024-10-21 23:52:35 发布

转载 · 最新推荐文章于 2024-10-21 23:52:35 发布 · 406 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：http://www.cnblogs.com/luwanhe/p/9484753.html

文章标签：

#json

本文分享了一段使用Python的requests库和json模块爬取雪球网站数据的代码实例。该脚本通过发送GET请求获取指定URL的数据，并解析返回的JSON格式信息，实现了对雪球公共时间线的抓取。文章详细展示了如何设置请求头、构造URL以及处理返回数据的过程。

import requests
import json



# Xueqiu public-timeline endpoint template. The two `{}` placeholders are
# filled, in order, with the `max_id` and `count` query values via str.format.
url = (
    'https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
    '?since_id=-1&max_id={}&count={}&category=111'
)
def xueqiu(number=1, max_id=None, count=None, *, max_page=3):
    """Print the id and title of posts from the Xueqiu public timeline.

    Pages are fetched one after another, starting at page ``number`` and
    stopping once page ``max_page`` has been printed or a page comes back
    empty. Each following request uses the ``id`` of the last entry of the
    previous page as its ``max_id``.

    Args:
        number: Label of the first page to fetch; nothing is requested when
            it is already greater than ``max_page``.
        max_id: ``max_id`` query value for the first request. ``None`` means
            ``-1``.
        count: ``count`` query value for the first request. ``None`` means
            ``10``.
        max_page: Last page number to fetch (defaults to 3).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response lacks the ``list``/``data``/``id``/``title``
            keys read below.
    """
    # NOTE(security): this is a hard-coded browser session cookie, including
    # xq_a_token / xq_r_token values. It will expire and should not live in
    # source control; load it from configuration or the environment instead.
    headers = {
        'Cookie': 'device_id=3049fba19293376977728a287084d21f; _ga=GA1.2.780783310.1531212991; s=e212ctwtfc; __utmz=1.1531213044.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=1.780783310.1531212991.1531213044.1531220599.2; aliyungf_tc=AQAAADtGMFhh1gsAUhVFeSfUZkqI1Vuj; xq_a_token=584d0cf8d5a5a9809761f2244d8d272bac729ed4; xq_a_token.sig=x0gT9jm6qnwd-ddLu66T3A8KiVA; xq_r_token=98f278457fc4e1e5eb0846e36a7296e642b8138a; xq_r_token.sig=2Uxv_DgYTcCjz7qx4j570JpNHIs; _gid=GA1.2.1152894742.1534296305; _gat_gtag_UA_16079156_4=1; Hm_lvt_1db88642e346389874251b5a1eded6e3=1534296305; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1534296305; u=211534296306130',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    }

    # Resolve the defaults independently so a missing count never ends up in
    # the URL as the literal text "None".
    if max_id is None:
        max_id = -1
    if count is None:
        count = 10

    # Iterate instead of recursing: one loop pass per page.
    while number <= max_page:
        print('第%d页:' % number)
        response = requests.get(
            url.format(max_id, count), headers=headers, timeout=10
        )
        response.raise_for_status()
        entries = response.json()['list']

        # An empty page has no last entry to continue from.
        if not entries:
            break

        for entry in entries:
            # Each entry is a dict whose 'data' value is itself a JSON string.
            post = json.loads(entry['data'])
            print(post['id'], post['title'])

        max_id = entries[-1]['id']
        # Every page after the first asks for 15 items, as the original code
        # did. NOTE(review): confirm the switch from the caller's count is
        # intentional.
        count = 15
        number += 1

# Script entry point: start at page 1 from the newest posts (max_id=-1),
# requesting 10 items on the first page.
if __name__ == '__main__':
    xueqiu(number=1, max_id=-1, count=10)