Python image crawler

This article presents a simple crawler written in Python that scrapes the links to each model's profile page from the Taobao MM homepage and then downloads the photos in their albums. The program uses multiple threads to improve throughput and parses page content with BeautifulSoup.


```python
#!/usr/bin/env python
# encoding: utf-8
'''
@author: caopeng
@license: (C) Copyright 2013-2017, Node Supply Chain Manager Corporation Limited.
@contact: deamoncao100@gmail.com
@software: garner
@file: movie1.py
@time: 2017/9/16 0016 14:49
@desc:
'''
import threading
import time
import urllib.request
import os, queue, re
from bs4 import BeautifulSoup
def getUrl(name,hostUrls,girlsUrls,flag):
    # Producer: pull listing pages from hostUrls and push each model's profile link into girlsUrls
    while not flag.is_set():
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent }
        try:
            hostUrl=hostUrls.get(timeout=2)
        except queue.Empty:
            print("queue empty")
            return
        request=urllib.request.Request(hostUrl,headers=headers)
        response=urllib.request.urlopen(request)
        data=response.read().decode('gbk')
        soup=BeautifulSoup(data, "html.parser")
        tag_lady=soup.find_all("a",attrs={"class":"lady-avatar"})
        for tag_href in tag_lady:
            girlsUrls.put("https:"+tag_href['href'])
            print("录入:https:"+tag_href['href'])
        hostUrls.task_done()
        print("getUrl is working")

def getImg(name,girlsUrls,flag):
    # Consumer: take profile links from girlsUrls and download every image found on the album page
    while not flag.is_set():
        user_agent ='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240'
        headers={'User-Agent':user_agent}
        try:
            ur=girlsUrls.get(timeout=5)
        except queue.Empty:
            print(name+" imgqueue empty")
            return
        pattern=re.compile(r"/(\d+)\.htm")
        items=pattern.findall(ur)
        girlUrl="https://mm.taobao.com/self/aiShow.htm?userId="+items[0]
        request=urllib.request.Request(girlUrl,headers=headers)
        response=urllib.request.urlopen(request)
        data=response.read()
        soup=BeautifulSoup(data, "html.parser")
        fileName=soup.head.title.contents
        fileName[0]=fileName[0].rstrip()
        tag_div=soup.find('div',attrs={"class":"mm-aixiu-content"})
        imgs=tag_div.find_all("img")
        if len(imgs)==0:
            girlsUrls.task_done()
            continue  # skip profiles with no images instead of ending the whole worker thread
        path=cdir+'/'+str(fileName[0])
        if not os.path.exists(path):
            os.makedirs(path)
        n=0
        for img in imgs:
            n=n+1
            link=img.get('src')
            if link:
                s="http:"+str(link)
                i=link[link.rfind('.'):]
                try:
                    request=urllib.request.Request(s)
                    response=urllib.request.urlopen(request)
                    imgData=response.read()
                    pathfile=path+r'/'+str(n)+i
                    with open(pathfile,'wb') as f:
                        f.write(imgData)
                        print("thread "+name+" write:"+pathfile)
                except Exception:
                    print(str(name)+" thread write failed:"+s)
        girlsUrls.task_done()

#start=time.time()
if __name__=='__main__':
    start=time.time()
    hostUrls=queue.Queue()
    girlsUrls=queue.Queue()
    cdir=os.getcwd()
    url='https://mm.taobao.com/json/request_top_list.htm?page='
    flag_girl=threading.Event()
    flag_img=threading.Event()
    for i in range(1,3):
        u=url+str(i)
        hostUrls.put(u) 
    threads_girl = threading.Thread(target=getUrl, args=(str(1), hostUrls,girlsUrls,flag_girl))
    threads_img = [threading.Thread(target=getImg, args=(str(i+1), girlsUrls,flag_img))
               for i in range(8)]
    threads_girl.start()
    # Wait until the producer has queued at least one profile link before starting the download workers
    while girlsUrls.empty():
        print("wait..")
        time.sleep(0.1)
    for t in threads_img:
        t.start()
    hostUrls.join()
    flag_girl.set()
    girlsUrls.join()
    flag_img.set()
    for t in threads_img:
        t.join()
    end=time.time()
    print("run time:"+str(end-start))

### Scraping images with a Python crawler

To scrape images from web pages with a Python crawler, you can follow a fairly standard set of steps. The preparation consists of installing the required libraries, such as `requests` for sending HTTP requests and `BeautifulSoup` or `lxml` for parsing HTML documents[^1].

#### Installing dependencies

First make sure the required third-party packages are installed:

```bash
pip install requests beautifulsoup4 lxml
```

#### Writing the basic framework

Create a simple Python script as the basic skeleton: it visits the target site and extracts the image links found on the page.

```python
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def download_image(url, folder_path='images'):
    """Download a single image."""
    try:
        response = requests.get(url)
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        file_name = os.path.join(folder_path, os.path.basename(urlparse(url).path))
        with open(file_name, 'wb') as f:
            f.write(response.content)
        print(f'Successfully downloaded {file_name}')
    except Exception as e:
        print(e)


def fetch_images_from_page(page_url):
    """Collect the src attribute of every <img> tag on the page and return the list of image URLs."""
    html_content = requests.get(page_url).text
    soup = BeautifulSoup(html_content, "html.parser")
    img_tags = soup.find_all('img')

    image_urls = set()  # deduplicate
    for tag in img_tags:
        src = tag.get('src', '')
        full_src = urljoin(page_url, src)  # resolve relative paths
        if full_src.startswith(('http://', 'https://')):
            image_urls.add(full_src)
    return list(image_urls)


if __name__ == '__main__':
    page_to_scrape = input("Enter the URL of a website to scrape images from:")
    urls = fetch_images_from_page(page_to_scrape)
    for u in urls:
        download_image(u)
```

This code defines a function named `fetch_images_from_page()` that finds every `<img>` tag on the given page and returns the set of their source (`src`) attributes; the helper `download_image()` takes a complete image URL as its argument and saves the file to local disk.

Note that a real-world crawler has to account for more, such as exception handling and strategies for getting around anti-scraping mechanisms; the example above is only a minimal version for study and discussion[^2].
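As a rough illustration of those extra concerns, the sketch below wraps the download step with a browser-like `User-Agent` header, a short pause between attempts, and a simple retry loop. The header string, delay, and retry count are arbitrary choices for this example, not values taken from the article.

```python
import time

import requests

session = requests.Session()
session.headers.update({
    # Browser-like UA string; many sites reject the default python-requests UA
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})

def polite_get(url, retries=3, delay=1.0, timeout=10):
    """GET a URL with retries and a pause between attempts; return None if all attempts fail."""
    for attempt in range(1, retries + 1):
        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()      # turn 4xx/5xx responses into exceptions
            return response
        except requests.RequestException as e:
            print(f'attempt {attempt} for {url} failed: {e}')
            time.sleep(delay)                # back off before retrying
    return None
```

A call to `polite_get()` could then stand in for the plain `requests.get()` inside `download_image()` and `fetch_images_from_page()`.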