今日头条街拍图片爬取

本文介绍了一款用于从今日头条网站抓取街拍图片的Python爬虫程序。该程序使用requests库获取网页内容,并利用正则表达式及JSON解析来提取图片链接。此外,还通过MySQL数据库存储所下载图片的信息。
import re
import requests
import os
from urllib import request
import json
from mysql_tu import mysql_conn


# Spoof a desktop Chrome user agent so Toutiao serves the normal page markup.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Ensure the image output directory exists (hoisted out of the page loop —
# it only needs to be checked once).
if not os.path.exists('cccc'):
    os.mkdir('cccc')

# Reuse one DB connection for the whole run. The original constructed a new
# mysql_conn (i.e. opened a fresh MySQL connection) for every single image.
mc = mysql_conn()

# Walk the first three result pages of the search API (offsets 0, 20, 40;
# 20 items per page).
for offset in range(0, 60, 20):
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format(offset)

    search_resp = requests.get(url, headers=headers)
    html_json_dict = search_resp.json()

    for data_item in html_json_dict['data']:
        # Entries without an article_url are ads/user cards, not galleries.
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']

        article_html = requests.get(article_url, headers=headers).text

        # The gallery payload is embedded in the page's JS as:
        #   gallery: JSON.parse("...escaped json..."),
        pattern = r'gallery: JSON\.parse\((.*)\),'
        match_res = re.search(pattern, article_html)
        if not match_res:
            print('没有那个文件')
            continue

        # Two json.loads calls on purpose: the first decodes the quoted JS
        # string literal, the second parses the JSON text it contains.
        gallery = json.loads(json.loads(match_res.group(1)))

        # Renamed from `v`, which shadowed the outer page-offset loop variable.
        for image_item in gallery['sub_images']:
            image_url = image_item['url']
            print(image_url)
            try:
                filename = image_url.split('/')[-1] + '.jpg'

                # Download the image into the output directory. This was
                # commented out in the original even though the directory
                # creation and the TimeoutError handler only exist for it.
                request.urlretrieve(image_url, 'cccc/' + filename)

                # BUG FIX: the original format string had no {filename}
                # placeholder, so every row stored a literal dummy value.
                # NOTE(review): string-built SQL is injection-prone; prefer a
                # parameterized query if the helper ever exposes one.
                sql = 'insert into jiepai(filename) values("{filename}")'.format(filename=filename)
                mc.execute_modify_mysql(sql)
            except TimeoutError:
                print('下载超时')
                continue
#文件名 mysql_tu.py

import pymysql

class mysql_conn(object):
    """Thin wrapper around a pymysql connection for running write statements."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; move them to config or
        # environment variables before any real deployment.
        self.db = pymysql.connect(host='127.0.0.1', user='root', password='lxh1122',
                                  port=3306, database='py11')
        self.cursor = self.db.cursor()

    def execute_modify_mysql(self, sql):
        """Execute a single INSERT/UPDATE/DELETE statement and commit it.

        Rolls back on failure so the connection is left in a clean state
        (the original left a broken transaction open), then re-raises.
        """
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except Exception:
            self.db.rollback()
            raise

    def __del__(self):
        # Destructor: release the cursor and connection. Guarded because
        # __init__ may have failed before these attributes were assigned,
        # and __del__ must never raise.
        try:
            self.cursor.close()
            self.db.close()
        except Exception:
            pass

if __name__ == '__main__':
    # Smoke test: insert four empty rows through one shared connection.
    # The original repeated the identical statement four times by hand.
    mc = mysql_conn()
    sql = 'insert into jiepai values ()'
    for _ in range(4):
        mc.execute_modify_mysql(sql)

 

转载于:https://www.cnblogs.com/lxh777/p/9490895.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值