A Simple Implementation of Scraping Images from a Website with Python

This article walks through a concrete crawler project, including code samples and the problems encountered along the way. It uses multithreading to speed up image downloads and shows the full process, from fetching the target site's listing page to parsing out and downloading the images.

1. The code

The code is as follows (example):


import os
import re
import threading
import time

import requests
from lxml import etree

# Bounded semaphore that caps the number of concurrent download threads
thread_lock = threading.BoundedSemaphore(value=200)

# Folder created on the desktop to hold the downloaded images
Address1 = r'C:\Users\***\Desktop\新建文件夹 (2)\{}'

# The headers can be copied from the browser (DevTools -> Network).
# Requests without headers are very likely to be treated as an attack by the target site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

# The target site to crawl
url_main = 'https://*******/index.php/art/type/id/30.html'


def jiexi_main(url):
    '''
    Parse the target site's listing page.
    :param url: URL of the listing page
    :return: the detail-page URLs found on the listing page
    '''
    response1 = requests.get(url, headers=headers)
    res_xpath = etree.HTML(response1.text)
    urls = res_xpath.xpath('/html/body/div[2]/div[3]/div/div/ul/li/a/@href')
    print(urls)  # Check whether these are absolute URLs or whether main_1 must prepend the domain
    return urls


def jiexi_url(url):
    '''Parse a single detail page and extract its image URLs.'''
    response = requests.get(url, headers=headers)
    html = response.text
    urls = re.findall('<img src="(.*?)"/>', html)  # Extract the image URLs with a regex
    print(urls)  # Print the image URLs to verify the extraction worked
    response.close()
    return urls


def download_pics(url, n, path):
    '''Download and save a single image.'''
    time.sleep(1)
    # Use the last path segment of the URL (it already ends in .jpg) as the file name
    file_name = url.split('/')[-1]
    try:
        response = requests.get(url, headers=headers)
        with open(os.path.join(path, file_name), 'wb') as f:
            f.write(response.content)
            print('Finished downloading image {}'.format(n))
    finally:
        # Release the semaphore even if the download fails; otherwise
        # the slot leaks and main_1 eventually blocks forever on acquire()
        thread_lock.release()


def main_1():
    '''
    Download every image linked from a listing page.
    :return:
    '''
    start = time.time()
    # An attempt at crawling several listing pages; it failed because url_main
    # contains no '{}' placeholder, so url_main.format(index) returns it unchanged.
    # for index in range(4, 10, 1):
    #     print(index)
    #     url_main1 = url_main.format(index)
    #     url_main_pic = jiexi_main(url_main1)
    url_main_pic = jiexi_main(url_main)
    file_name = '8'  # Name of the subfolder the images are stored in
    m = './' + file_name
    if not os.path.exists(Address1.format(m)):
        os.mkdir(Address1.format(m))
    path = Address1.format(m)
    n = 0
    m = 0
    for url_py in url_main_pic:
        m += 1
        time.sleep(0.5)
        pic_urls = jiexi_url('https://dj66.pw' + url_py)
        for url in pic_urls:
            n += 1
            print('Downloading_{}'.format(n))
            # print('Downloading image {}'.format(n))
            # Acquire a semaphore slot before spawning the download thread
            thread_lock.acquire()
            t = threading.Thread(target=download_pics, args=(url, n, path))
            t.start()

    end = time.time()  # Simple timer for the whole run
    print('Running time: %s Seconds' % (end - start))


def main_2():
    '''
    Download the images from a single detail page.
    :return:
    '''
    url_py = 'https://******/index.php/art/detail/id/35231.html'  # Change this URL to download images from a different page
    start = time.time()
    n = 0
    file_name = '5'
    m = './' + file_name
    pic_urls = jiexi_url(url_py)
    if not os.path.exists(Address1.format(m)):
        os.mkdir(Address1.format(m))
    path = Address1.format(m)
    for url in pic_urls:
        n += 1
        # print('Downloading image {}'.format(n))
        # Acquire a semaphore slot before spawning the download thread
        thread_lock.acquire()
        t = threading.Thread(target=download_pics, args=(url, n, path))
        t.start()
    end = time.time()
    print('Running time: %s Seconds' % (end - start))


# Either of the two entry points can be called here
if __name__ == '__main__':
    main_1()
# if __name__ == '__main__':
#     main_2()
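
As the commented-out block in main_1 notes, the multi-page attempt did not work: url_main has no '{}' placeholder, so formatting it with an index changes nothing. Below is a minimal sketch of how pagination could look, assuming (hypothetically) that the listing pages follow a pattern such as .../type/id/30/page/{n}.html; the real pattern has to be read off the site's own pagination links:

# Hypothetical pagination pattern -- verify it against the site's actual pagination links
url_page = 'https://*******/index.php/art/type/id/30/page/{}.html'

all_detail_urls = []
for index in range(1, 5):
    page_urls = jiexi_main(url_page.format(index))  # reuse the listing parser defined above
    all_detail_urls.extend(page_urls)
    time.sleep(0.5)  # be polite between listing-page requests
print('Collected {} detail-page URLs'.format(len(all_detail_urls)))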

2. Run output

The console output is as follows (example):

C:\Users\lee\PycharmProjects\python_Pachong\venv\Scripts\python.exe C:/Users/lee/PycharmProjects/python_Pachong/tupian.py
['/index.php/art/detail/id/32605.html', '/index.php/art/detail/id/32604.html', '/index.php/art/detail/id/32603.html', '/index.php/art/detail/id/32602.html', '/index.php/art/detail/id/32601.html', '/index.php/art/detail/id/32551.html', '/index.php/art/detail/id/32550.html', '/index.php/art/detail/id/32549.html', '/index.php/art/detail/id/32548.html', '/index.php/art/detail/id/32547.html', '/index.php/art/detail/id/32504.html', '/index.php/art/detail/id/32503.html', '/index.php/art/detail/id/32502.html', '/index.php/art/detail/id/32501.html', '/index.php/art/detail/id/32500.html', '/index.php/art/detail/id/32454.html', '/index.php/art/detail/id/32453.html', '/index.php/art/detail/id/32452.html', '/index.php/art/detail/id/32451.html', '/index.php/art/detail/id/32450.html', '/index.php/art/detail/id/32400.html', '/index.php/art/detail/id/32399.html', '/index.php/art/detail/id/32398.html', '/index.php/art/detail/id/32397.html', '/index.php/art/detail/id/32396.html', '/index.php/art/detail/id/32352.html', '/index.php/art/detail/id/32351.html', '/index.php/art/detail/id/32350.html', '/index.php/art/detail/id/32349.html', '/index.php/art/detail/id/32348.html']
['https://mei.lbpicmt.com/upload/art/img/lctk/85f99c8dae715a22578e1f95873a930d.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/6b0adeba5f567b47af2174d6cb9a6a52.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/857742df813b50596189eb229c600208.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/f99ebf01b38b74397d31edec5f5a3f03.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/b21b4a95ad6fac05bb1693ac0b7b70e8.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/f61e8cebf99028c552673680f1b5b5c8.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/a355c9fc2ac0d3a6f2cb126002591d2b.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/a2886729e6792e73382a82f8ffb8be89.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/b7bd5589f078d9c2eed5ec65687ef88e.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/2506d711d0949e5f73aa44e6995840ec.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/6362cd69504fde74961f0f79666ddd21.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/74c47d60954cf559cf42d6707daf23d0.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/0379cfef889808eacd04da4543f656b5.jpg']
Downloading_1
Downloading_2
Downloading_3
Downloading_4
Downloading_5
Downloading_6
Downloading_7
Downloading_8
Downloading_9
Downloading_10
Downloading_11
Downloading_12
Downloading_13
['https://mei.lbpicmt.com/upload/art/img/lctk/98db56003c1d6392e94569102667cdf1.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/33b57eaaf30a1c99042ad639af1dd591.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/fc827e5d9f91bc293336c55b8b7fa8b6.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/a5ff04d37b2811814408b76339270ed1.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/6e79ac0cb5364c047ea77cbbfd946bea.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/c6b8a0e80b7c7d80ab435ff1cc5ed683.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/1275f9f465d24a4773253013b5348fd4.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/f1dcf4d39e4dca88793d1e5bfff862b6.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/bcba83d63cac2f4ab2c041693be8fee8.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/0f50a5e66b76ed76abd0bb50019bf94e.jpg']
Downloading_14
Downloading_15
Downloading_16
Downloading_17
Downloading_18
Downloading_19
Downloading_20
Downloading_21
Downloading_22
Downloading_23
Finished downloading image 7
Finished downloading image 8

Summary

The script can scrape the target site's images, but a number of problems remain.
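
One of those problems is the manual semaphore bookkeeping: every worker must release exactly one slot, and any mistake either leaks slots or raises ValueError on the BoundedSemaphore. A minimal sketch of an alternative using concurrent.futures.ThreadPoolExecutor, which caps concurrency without any explicit acquire/release (max_workers=20 and the timeout are illustrative choices; headers is the dict defined in the script above):

import os
from concurrent.futures import ThreadPoolExecutor

import requests

def fetch_one(url, n, path):
    '''Worker: download one image; no semaphore bookkeeping required.'''
    file_name = url.split('/')[-1]
    response = requests.get(url, headers=headers, timeout=10)
    with open(os.path.join(path, file_name), 'wb') as f:
        f.write(response.content)
    print('Finished downloading image {}'.format(n))

def download_all(pic_urls, path):
    # The executor runs at most max_workers downloads at a time
    with ThreadPoolExecutor(max_workers=20) as pool:
        for n, url in enumerate(pic_urls, start=1):
            pool.submit(fetch_one, url, n, path)
    # Leaving the with-block waits for all submitted downloads to finish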

Separately, the following error is raised intermittently while the script runs:


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\adapters.py", line 439, in send
    resp = conn.urlopen(
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connectionpool.py", line 726, in urlopen
    retries = retries.increment(
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\util\retry.py", line 403, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\packages\six.py", line 734, in reraise
    raise value.with_traceback(tb)
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connectionpool.py", line 978, in _validate_conn
    conn.connect()
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connection.py", line 362, in connect
    self.sock = ssl_wrap_socket(
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\util\ssl_.py", line 384, in ssl_wrap_socket
    return context.wrap_socket(sock, server_hostname=server_hostname)
  File "C:\Users\lee\AppData\Local\Programs\Python\Python38-32\lib\ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
  File "C:\Users\lee\AppData\Local\Programs\Python\Python38-32\lib\ssl.py", line 1040, in _create
    self.do_handshake()
  File "C:\Users\lee\AppData\Local\Programs\Python\Python38-32\lib\ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:/Users/lee/PycharmProjects/python_Pachong/tupian.py", line 125, in <module>
    main_1()
  File "C:/Users/lee/PycharmProjects/python_Pachong/tupian.py", line 83, in main_1
    pic_urls = jiexi_url('https://dj66.pw' + url_py)
  File "C:/Users/lee/PycharmProjects/python_Pachong/tupian.py", line 36, in jiexi_url
    response = requests.get(url, headers=headers)
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\api.py", line 76, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\sessions.py", line 530, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\sessions.py", line 643, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\adapters.py", line 498, in send
    raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None))
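
The underlying exception is WinError 10054 ("an existing connection was forcibly closed by the remote host"), which typically means the server is resetting connections, often because too many requests arrive too quickly. A minimal sketch of one mitigation, mounting urllib3's Retry onto a requests.Session so transient connection failures are retried with exponential backoff (total=3 and backoff_factor=1 are illustrative values, not tuned ones; headers is again the dict from the script above):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry transient failures up to 3 times with exponential backoff;
# also retry on common throttling/server-error status codes.
retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=retry))

def safe_get(url, timeout=10):
    '''GET with retries and a timeout; returns None instead of raising.'''
    try:
        return session.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print('Request failed after retries: {}'.format(e))
        return None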