Crawler Program

This post shows how to write a Python crawler that scrapes images of a given category from Baidu Image Search. Editing the obj_list list customizes which image categories get crawled.


Scraping the desired images from Baidu:

# -*- coding: utf-8 -*-
import os
import re

import requests
from urllib.request import urlretrieve


def download1(url, filename, filepath):
    """Download one image with requests; skip files that already exist."""
    full_name = os.path.join(filepath, filename)
    if os.path.exists(full_name):
        print("[Info] File already exists:", full_name)
        return
    try:
        pic = requests.get(url, timeout=5)
        pic.raise_for_status()
    except requests.RequestException:
        print("[Error] Could not download the current image")
        return
    try:
        with open(full_name, 'wb') as wf:
            wf.write(pic.content)
    except OSError:
        print("[Error] Failed to write the file")


def download2(url, filename, filepath):
    """Alternative downloader based on urllib.request.urlretrieve."""
    full_name = os.path.join(filepath, filename)
    if os.path.exists(full_name):
        print("[Info] File already exists:", full_name)
        return
    try:
        urlretrieve(url, full_name)
    except Exception:
        print("[Error] Could not download the current image")


def search(word, local_path="./data/down/", page=0, keep_original_name=True):
    """Fetch one result page of Baidu Image Search and download every image on it."""
    local_path = os.path.join(local_path, word)
    os.makedirs(local_path, exist_ok=True)
    # pn is the result offset (20 results per page); gsm is a hex-encoded offset.
    url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={word}&pn={pn}&gsm={gsm:x}&ct=&ic=0&lm=-1&width=0&height=0'.format(
        word=word, pn=20 * page, gsm=40 + 20 * page)

    print('page=%d, url="%s"' % (page, url))
    html = requests.get(url).text
    # Each hit on the flip page appears as "objURL":"...", in the raw HTML.
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)

    for i, pic_url in enumerate(pic_urls, start=1):
        print(pic_url)
        filename = os.path.split(pic_url)[1].split('?')[0]
        filename_split = filename.split('.')
        if len(filename_split) != 2:
            print("[Error] Unexpected file name: " + filename)
            continue
        if filename_split[1].lower() not in ('jpg', 'png'):
            print("[Error] Unexpected file type: " + filename)
            continue

        if not keep_original_name:
            # Rename to <stem>-<page>-<index>.<ext> to avoid name collisions.
            filename = "%s-%d-%d.%s" % (filename_split[0].strip(), page, i,
                                        filename_split[1].strip())

        download1(pic_url, filename, local_path)


def search_50_page(word, local_path="./data/down/"):
    """Crawl the first 50 result pages for one keyword."""
    for i in range(1, 51):  # pages 1..50
        search(word, local_path, i)


def search_list_test():
    # Baidu search keywords (fruit names): apple, banana, tangerine, orange,
    # peach, cherry, longan, lychee, snow pear, strawberry, grape, kiwi,
    # pineapple, guava, green plum.
    obj_list = ["苹果", "香蕉", "桔子", "橙子", "桃子", "樱桃", "龙眼", "荔枝", "雪梨", "草莓", "葡萄", "猕猴桃", "菠萝", "番石榴", "青梅"]
    # Alternative keyword set (flowers): chrysanthemum, dandelion, rose, sunflower, tulip.
    # obj_list = ["菊花", "蒲公英", "玫瑰", "向日葵", "郁金香"]
    for obj in obj_list:
        search_50_page(obj, "./data/fruit_photos/")


if __name__ == '__main__':
    search_list_test()
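
As a quick illustration of the extraction step, the regex in search() pulls every objURL value out of the flip-page HTML. The snippet below runs it against a made-up stand-in for the real Baidu markup:

# sample_html is a hypothetical fragment, just to show the regex at work.
sample_html = '"objURL":"http://example.com/a.jpg","objURL":"http://example.com/b.png",'
print(re.findall('"objURL":"(.*?)",', sample_html, re.S))
# -> ['http://example.com/a.jpg', 'http://example.com/b.png']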

To crawl a different kind of image, simply edit the keywords in the obj_list list.
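
For example, here is a minimal sketch for crawling pet photos instead of fruit; the keywords and output directory below are made up for illustration, and any terms Baidu Image Search accepts will work:

# Hypothetical keyword list: Husky, Golden Retriever, orange tabby.
obj_list = ["哈士奇", "金毛", "橘猫"]
for obj in obj_list:
    search_50_page(obj, "./data/pet_photos/")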
