爬取今日头条(关键字自己取)

最新推荐文章于 2025-10-25 21:48:16 发布

原创最新推荐文章于 2025-10-25 21:48:16 发布 · 3k 阅读

CC 4.0 BY-SA版权

文章标签：

之前写得很菜，后来无意中找到了崔庆才的视频，发现对不上，网页已经改版，所以就特地改写了一下。其中有一个坑，坑了我一天吧，就是 JS 的 JSON.parse() 方法和 Python 中的 json.loads() 不一样：JS 的可以解析带 \\ 的字符串，而且反斜杠全都替换没了，Python 的不行，这点不得不吐槽。后来用 demjson 转义也不行，最后用了正则方法才替换掉后进行反序列化(用 replace 也行)，为自己鼓掌。下面就是代码，不多bb了。

import json,demjson
import re
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from requests import RequestException


def get_page_index(offset, keyword, timeout=10):
    """Request the Toutiao ``search_content`` endpoint and return its body.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when the request raises ``RequestException``.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    }
    # Example of a full request URL:
    # https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        print(url)
        # A timeout error is a RequestException subclass, so it is reported
        # by the handler below instead of hanging the crawl.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页出错')
        return None


def parse_page_index(html):
    """Yield the article URLs listed in a search-index JSON response.

    Args:
        html: JSON text of the index response, or ``None`` / empty when the
            request failed.

    Yields:
        Each non-empty ``article_url`` found in the entries of the top-level
        ``data`` list. Nothing is yielded when the input is empty, is not
        valid JSON, is not a JSON object, or has no usable ``data`` list.
    """
    # The index request returns None on failure; json.loads(None) would
    # raise TypeError.
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass (e.g. an HTML error
        # page was returned instead of JSON).
        print('解析索引页出错')
        return
    if not isinstance(payload, dict):
        return
    # "data" may be present but null, so fall back to an empty list.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without a URL cannot be fetched; skip them instead of
        # yielding None to the caller.
        if article_url:
            yield article_url


def get_page_detail(url, timeout=10):
    """Download an article detail page and return its HTML text.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when the request raises ``RequestException``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36 '
    }
    try:
        # A timeout error is a RequestException subclass, so it is reported
        # by the handler below instead of hanging the crawl.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求详情页出错', url)
        return None


def _decode_gallery(raw):
    """Decode the text captured after ``gallery: JSON.parse`` into an object.

    The capture is expected to look like ``("<escaped JSON>"),``. It is first
    treated as a quoted string literal that itself contains JSON (decode
    twice). If that fails, the legacy heuristic is used: delete every
    backslash, trim the surrounding ``("`` / ``"),`` characters and decode
    once. Returns ``None`` when neither approach yields valid JSON.
    """
    literal = raw.strip().rstrip(',').strip()
    if literal.startswith('(') and literal.endswith(')'):
        literal = literal[1:-1].strip()
    try:
        decoded = json.loads(literal)
        if isinstance(decoded, str):
            # First pass unescaped the string literal; second pass parses
            # the JSON document it contained.
            decoded = json.loads(decoded)
        return decoded
    except ValueError:
        pass

    # Legacy fallback. Note that lstrip/rstrip remove *sets* of characters,
    # which is sufficient here because the JSON object starts with "{" and
    # ends with "}".
    stripped = re.sub(r'\\', '', raw).strip().lstrip('("').rstrip('"),')
    if not stripped:
        return None
    try:
        return json.loads(stripped)
    except ValueError:
        return None


def parse_page_detail(html,url):
    """Extract the title and gallery image URLs from a detail page.

    Args:
        html: HTML text of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page
        embeds a gallery whose decoded data has a ``sub_images`` entry;
        otherwise ``None`` (no gallery found, undecodable data, or no
        ``sub_images`` key).

    Side effects:
        Appends the raw captured gallery text to ``a.txt`` (debug dump).
    """
    pic_url = 'http://p3.pstatp.com/'
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Pages without a <title> get an empty title instead of an IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    image_pattern = re.compile('gallery: JSON.parse(.*?)siblingList:', re.S)
    match = re.search(image_pattern, html)
    if match is None:
        # Not a gallery page (or the page layout changed): nothing to parse.
        return None
    raw_gallery = match.group(1)

    # Debug dump of the raw capture; the with-block closes the file.
    with open('a.txt', 'a', encoding='utf8') as f:
        f.write(raw_gallery)

    data = _decode_gallery(raw_gallery)
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    # Skip entries lacking a "uri"; concatenating None would raise TypeError.
    images = [
        pic_url + item.get('uri')
        for item in sub_images
        if isinstance(item, dict) and item.get('uri')
    ]
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def main():
    """Crawl the first index page for the keyword '街拍' and save galleries.

    Fetches the index page at offset 0, then downloads and parses every
    listed article. Each successfully parsed gallery is appended to
    ``toutiao.txt`` as one JSON line and printed. Articles that cannot be
    fetched or contain no gallery data are skipped.
    """
    index_html = get_page_index(0, '街拍')
    if not index_html:
        # The index request failed; there is nothing to iterate over and
        # passing None on would raise inside the JSON parser.
        return
    for url in parse_page_index(index_html):
        if not url:
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result is None:
            # Avoid writing "null" records for pages without a gallery.
            continue
        with open('toutiao.txt', 'a', encoding='utf8') as f:
            f.write(json.dumps(result, ensure_ascii=False) + '\n')
        print(result)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()