自编码爬取今日头条街拍

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
    1.抓取索引页内容,利用requests请求目标站点,得到索引网页Html代码,返回结果
    2.抓取详情页内容,解析返回结果,得到详情页的链接,并进一步抓取详情页的信息
    3.下载图片与保存数据库,将图片下载到本地,并把页面信息及图片URL保存到MongoDB
    4.开启循环及多线程,对多页内容遍历,开启多线程提高抓取速度
"""
import json
import os
import re
from multiprocessing import Pool
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
import pymongo
from config import *
from hashlib import md5

# Shared MongoDB handles used by save_to_mongo() below.
client = pymongo.MongoClient(MONGDB_URL)
# Bug fix: the original `client.db[MONGO_DB]` selected a collection named
# MONGO_DB inside a database literally called "db". `client[MONGO_DB]`
# selects the database configured in config.py, as intended.
db = client[MONGO_DB]


def request_url(url, timeout=10):
    """GET *url* and return the response body as text.

    Args:
        url: absolute URL to fetch.
        timeout: seconds before the request is aborted (new, defaulted
            parameter — the original had no timeout and could hang
            forever on a stalled connection).

    Returns:
        str | None: decoded response body on HTTP 200, otherwise None.
        Network errors (including timeouts) are logged and swallowed.
    """
    try:
        # requests.Timeout is a subclass of RequestException, so the
        # existing except clause also covers the new timeout.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("请求索引页出错", url)
        return None


def get_page_index(offset, keyword=KEYWORD):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: paging offset (multiples of 20).
        keyword: search keyword; defaults to KEYWORD from config.

    Returns:
        str | None: JSON body of the index page, or None on failure.
    """
    query = urlencode({
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    })
    return request_url('http://www.toutiao.com/search_content/?' + query)


def parse_parse_index(html):
    """Yield article URLs from the search-result JSON of an index page.

    Args:
        html: raw JSON text of the index page, or None when the
            upstream fetch failed.

    Yields:
        The "article_url" field of each result item (may be None for
        items that lack it).
    """
    # Guard added: get_page_index() returns None on failure, and the
    # original json.loads(None) raised TypeError and killed the worker.
    if not html:
        return
    try:
        data = json.loads(html)
    except json.JSONDecodeError:
        # Server occasionally returns non-JSON (e.g. an HTML error page).
        return
    if data and 'data' in data.keys():
        for item in data.get("data"):
            yield item.get("article_url")


def get_page_detail(url):
    """Fetch the HTML of an article detail page.

    Args:
        url: the article URL taken from the index results.

    Returns:
        str | None: page HTML, or None when the request failed.
    """
    return request_url(url)


def parse_page_detail(html, url):
    """Parse a detail page: extract the title and gallery image URLs.

    Args:
        html: HTML text of the detail page.
        url: the page's own URL, echoed back into the result record.

    Returns:
        dict | None: {"title", "url", "images"} when a gallery was
        found, otherwise None. As a side effect, every gallery image
        is fetched via download_image().
    """
    soup = BeautifulSoup(html, 'lxml')
    # <title> of the page, if present.
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else None
    # The gallery is embedded as a JS literal: `var gallery = {...};`
    match = re.search("var gallery = (.*?);", html, re.S)
    if not match:
        return None
    gallery = json.loads(match.group(1))
    if not (gallery and "sub_images" in gallery.keys()):
        return None
    image_urls = [entry.get("url") for entry in gallery.get('sub_images')]
    for image_url in image_urls:
        download_image(image_url)
    return {
        "title": title,
        "url": url,
        "images": image_urls,
    }


def save_to_mongo(result):
    """Insert one parsed page record into the MongoDB collection.

    Args:
        result: dict with "title", "url" and "images" keys.

    Returns:
        bool: True if the insert succeeded, False otherwise.
    """
    # Collection.insert() is deprecated since pymongo 3.0 and removed
    # in 4.0; insert_one() is the supported single-document API.
    if db[MONGO_TABLE].insert_one(result):
        print("存储到MongoDB成功", result)
        return True
    return False


def download_image(url):
    """Download one gallery image and persist it to disk.

    Bug fixes vs. the original: it went through request_url(), which
    returns the *decoded text* (str) of the response — that corrupts
    binary image data — and the downloaded data was never written
    anywhere. We now fetch response.content (raw bytes) and hand it
    to save_iamge().

    Args:
        url: absolute image URL.

    Returns:
        bytes | None: the image bytes on success, None on failure.
    """
    print("正在下载图片", url)
    try:
        response = requests.get(url)
    except RequestException:
        print("请求图片出错", url)
        return None
    if response.status_code == 200:
        save_iamge(response.content)
        return response.content
    return None


def save_iamge(content):
    """Write image bytes to ./toutiao/<md5(content)>.jpg.

    The md5-of-content filename deduplicates identical images.
    (The function name keeps the historical "iamge" typo because
    existing callers use it.)

    Args:
        content: raw image bytes.
    """
    file_dir = os.path.join(os.getcwd(), "toutiao")
    file_path = os.path.join(file_dir, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        # exist_ok closes the race between "does the dir exist?" and
        # mkdir when several pool workers save images concurrently.
        os.makedirs(file_dir, exist_ok=True)
        # `with` closes the file; the original also called f.close()
        # redundantly inside the with-block.
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one result page: fetch the index, walk every detail page,
    and store each parsed record in MongoDB.

    Args:
        offset: paging offset passed to the search API.
    """
    index_html = get_page_index(offset, KEYWORD)
    for article_url in parse_parse_index(index_html):
        detail_html = get_page_detail(article_url)
        if not detail_html:
            continue
        record = parse_page_detail(detail_html, article_url)
        if record:
            save_to_mongo(record)

if __name__ == '__main__':
    # One offset per result page; the search API pages in steps of 20.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # The original leaked the pool's worker processes; close and
        # join them so the script exits cleanly.
        pool.close()
        pool.join()

  

转载于:https://www.cnblogs.com/nixingguo/p/7262438.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值