This post covers two approaches. The simple one is for sites with no anti-scraping measures, where the urllib and lxml libraries are enough. The other is for sites like Sina Weibo, where collecting all of a blogger's images requires scrolling the page down repeatedly, so we use Selenium to automate the scrolling and clicking.
Simple version:
import os
import ssl
import urllib.error
import urllib.parse
import urllib.request
from lxml import etree
# Disable HTTPS certificate verification globally so downloads do not fail
# on sites with certificate problems (acceptable for a scraping script).
ssl._create_default_https_context = ssl._create_unverified_context
def checkfile(filepath):
    # Create the folder if it does not exist and return its path.
    if os.path.exists(filepath):
        # shutil.rmtree(filepath)  # uncomment to wipe an existing folder first
        return filepath
    os.mkdir(filepath)
    return filepath
def download(url, user_agent="wswp", proxy=None, num_retries=2):
    # Fetch a page, optionally through a proxy, retrying on HTTP 5xx errors.
    print("Downloading:", url)
    headers = {"User-agent": user_agent}
    req = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(req).read()
    except urllib.error.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0:
            # retry only on server errors (5xx)
            if hasattr(e, "code") and 500 <= e.code < 600:
                return download(url, user_agent, proxy, num_retries - 1)
    return html
count = 1  # global counter used to name the downloaded files

def get_image(html, filepath):
    selector = etree.HTML(html)
    # Select the image URLs to scrape and collect them in a list.
    # In the browser, inspect an image, copy its XPath, then strip the indices
    # so the path matches every item instead of just one, e.g.
    # //*[@id="main"]/div[3]/ul/li[1]/a/span/img -> //*[@id="main"]/div/ul/li/a/span/img
    imgurls = selector.xpath('//*[@id="main"]/div/ul/li/a/span/img/@src')
    global count
    count = 1
    f = open("imgurl.txt", "a+")
    for i in imgurls:
        i = "https://pic.netbian.com" + i
        print(i)
        # save the image URL
        f.write(i + "\n")
        file_path = "C:/Users/78565/Desktop/py3/%s/" % filepath + "%d.jpg" % count
        if not os.path.exists(file_path):
            while True:
                try:
                    urllib.request.urlretrieve(i, file_path)
                    break
                except OSError:
                    print("net error, retrying")
        count += 1
    f.close()
url="https://pic.netbian.com/"
filepath="img"
k=checkfile("C:/Users/78565/Desktop/py3/"+filepath)#创建文件夹
try:
#下载网页
html=download(url)
#从网页提取图片并下载
get_image(html,filepath)
except:
print("tasks end!")
Selenium version
First, grab the login cookies. Download the Chrome driver (chromedriver) that matches your browser version, put it in the same directory as chrome.exe, and add that directory to the PATH environment variable, so the script can be run from anywhere instead of only from the chrome.exe directory.
from selenium import webdriver
import time
import json

# point Selenium at the Chrome binary
options = webdriver.ChromeOptions()
options.binary_location = r"C:\Users\78565\AppData\Local\Google\Chrome\Application\chrome.exe"
driver = webdriver.Chrome(options=options)
# remember to write the full url, including the http/https scheme
driver.get('https://weibo.com/')
# log in to your account manually within 60 seconds of the page opening
time.sleep(60)
with open('cookies.txt', 'w') as f:
    # save the cookies as JSON
    f.write(json.dumps(driver.get_cookies()))
driver.close()
Next, open the browser and scrape the images. The script below reloads cookies.txt from the previous step to stay logged in.
import json
import os
import time
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def scroll(browser):
    try:
        # dismiss the "network timeout, please click again" tip if it appears
        browser.find_element(By.XPATH, '//div[@class="m-tips m-tips-tp cursor"]').click()
    except Exception:
        pass
    try:
        # the bottom-of-page marker: if it is present, we have scrolled far enough
        browser.find_element(By.XPATH, '//div[@class="Bottom_text_1kFLe"]').click()
    except Exception:
        return False
    return True
def checkfile(filepath):
    # Create the folder if it does not exist and return its path.
    if os.path.exists(filepath):
        # shutil.rmtree(filepath)  # uncomment to wipe an existing folder first
        return filepath
    os.mkdir(filepath)
    return filepath
def get_photo(lis, filepath):
    x = 1
    for i in lis:
        i = i.strip("\n")
        print(i)
        # use the tail of the URL as the file name
        imgname = i[-21:]
        print("starting download")
        file_path = "C:/Users/78565/Desktop/py3/%s/" % filepath + imgname
        if not os.path.exists(file_path):
            try:
                urllib.request.urlretrieve(i, file_path)
            except OSError:
                # a failed image is skipped; rerun the script to retry it
                pass
        print("image %s downloaded!" % x)
        x += 1
        time.sleep(1)
def get_img(browser, txt):
    for times in range(2):
        # if we are not yet at the bottom, keep dragging the scrollbar down
        while not scroll(browser):
            browser.execute_script("window.scrollBy(0,3000)")
            time.sleep(1)
        time.sleep(5)
    print("*** searching ***")
    # Find every img element under the album div.
    # Inspect an image in the browser, copy its XPath, then strip the indices,
    # e.g. //*[@id="main"]/div[3]/ul/li[1]/a/span/img -> //*[@id="main"]/div/ul/li/a/span/img
    a = browser.find_elements(By.XPATH, '//*[@id="app"]/div/div/div/main/div/div/div/div/div/div/div/div/div/div/div/div/div/img')
    lis_2 = []
    with open(txt, "w") as f:
        for x in a:
            key = x.get_attribute("src")
            # swap the thumbnail path for the full-size image
            key = key.replace("orj360", "large")
            print(key)
            f.write(key + "\n")
            lis_2.append(key)
    # number of images found
    print(len(lis_2))
    return lis_2
if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    options.binary_location = r"C:\Users\78565\AppData\Local\Google\Chrome\Application\chrome.exe"
    url = "https://weibo.com/u/2616380702?tabtype=album"
    txt = "{}.txt".format(url[20:30])  # name the url list after the user id
    driver = webdriver.Chrome(options=options)
    driver.get("http://photo.weibo.com/")
    # 1. delete the fresh cookies
    driver.delete_all_cookies()
    # 2. add the cookies saved in the previous step
    with open('cookies.txt', 'r') as f:
        # the file object holds JSON, so use json.load rather than json.loads
        cookies_list = json.load(f)
        for cookie in cookies_list:
            # the 'expiry' field trips up add_cookie, so just drop it
            if 'expiry' in cookie:
                del cookie['expiry']
            driver.add_cookie(cookie)
    # reopen the target page, now logged in
    driver.get(url)
    imglist = get_img(driver, txt)
    k = checkfile("C:/Users/78565/Desktop/py3/" + url[20:30])  # create the folder
    get_photo(imglist, url[20:30])
    print("Finished!")
Python's networking libraries sometimes hang, so an alternative is to save the image URLs first and then download them with curl. On Linux this can be done with the following libcurl-based C++ program (a Python wrapper around the curl command line is sketched after it):
#include<iostream>
#include<string>
#include<vector>
#include<curl/curl.h>
#include <sys/stat.h>
#include<string.h>
using namespace std;
//网址文件
#define TXT "url.txt"
//保存图片的文件夹
#define DIR "url"
// write callback: append the received body bytes to the open file
size_t dl_req_reply(void *buffer, size_t size, size_t nmemb, void *user_p)
{
    FILE *fp = (FILE *)user_p;
    size_t return_size = fwrite(buffer, size, nmemb, fp);
    //cout << (char *)buffer << endl;
    return return_size;
}
// download a file with an HTTP POST request
CURLcode dl_curl_post_req(const string &url, const string &postParams, string filename)
{
    FILE *fp = fopen(filename.c_str(), "wb");
    if (!fp)
        return CURLE_WRITE_ERROR;
    // init curl
    CURL *curl = curl_easy_init();
    // curl return code
    CURLcode res = CURLE_FAILED_INIT;
    if (curl)
    {
        // set the request headers
        struct curl_slist *header_list = NULL;
        header_list = curl_slist_append(header_list, "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.63.6788.400 QQBrowser/10.3.2843.400");
        header_list = curl_slist_append(header_list, "Content-Type: application/x-www-form-urlencoded; charset=UTF-8");
        curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);
        // 0 = do not write response headers into the body, 1 = write them
        curl_easy_setopt(curl, CURLOPT_HEADER, 0);
        // make it a POST request
        curl_easy_setopt(curl, CURLOPT_POST, 1);
        // request URL
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        // POST body
        curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postParams.c_str());
        // skip SSL verification
        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
        // CURLOPT_VERBOSE = 1 prints detailed debug information
        curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
        curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
        // receive callback and its user pointer
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &dl_req_reply);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
        curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
        // timeouts
        //curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 6);
        //curl_easy_setopt(curl, CURLOPT_TIMEOUT, 6);
        // perform the POST request
        res = curl_easy_perform(curl);
        curl_slist_free_all(header_list);
    }
    // release curl
    curl_easy_cleanup(curl);
    // close the file
    fclose(fp);
    return res;
}
// write callback for the GET download
size_t get_reply(void *buffer, size_t size, size_t nmemb, void *user_p)
{
    FILE *fp = (FILE *)user_p;
    size_t return_size = fwrite(buffer, size, nmemb, fp);
    //cout << (char *)buffer << endl;
    return return_size;
}
// download a file with an HTTP GET request
// (the name is historical; main() uses this one, since images must be fetched with GET)
CURLcode get_curl_post_req(const string &url, const string &postParams, char *filename)
{
    (void)postParams;  // not used for a GET request
    // skip files that already exist and are non-empty
    printf("%s \n", filename);
    struct stat st;
    size_t len;
    if (stat(filename, &st))
        len = 0;
    else
        len = static_cast<size_t>(st.st_size);
    printf("len %zu \n", len);
    if (len > 0)
    {
        // reuse CURLE_SEND_ERROR as an "already downloaded" sentinel
        return CURLE_SEND_ERROR;
    }
    FILE *fp = fopen(filename, "w+b");
    if (!fp)
        return CURLE_WRITE_ERROR;
    // init curl
    CURL *curl = curl_easy_init();
    // curl return code
    CURLcode res = CURLE_FAILED_INIT;
    if (curl)
    {
        // set the request headers
        struct curl_slist *header_list = NULL;
        header_list = curl_slist_append(header_list, "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.63.6788.400 QQBrowser/10.3.2843.400");
        header_list = curl_slist_append(header_list, "Content-Type: application/x-www-form-urlencoded; charset=UTF-8");
        curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);
        // 0 = do not write response headers into the body, 1 = write them
        curl_easy_setopt(curl, CURLOPT_HEADER, 0);
        // request URL
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        // skip SSL verification
        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
        // CURLOPT_VERBOSE = 1 prints detailed debug information
        curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
        // no request body to send for a GET
        curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
        curl_easy_setopt(curl, CURLOPT_READDATA, NULL);
        // receive callback and its user pointer
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &get_reply);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
        curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
        // timeouts
        //curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 6);
        //curl_easy_setopt(curl, CURLOPT_TIMEOUT, 6);
        // perform the request (GET by default)
        res = curl_easy_perform(curl);
        curl_slist_free_all(header_list);
    }
    // release curl
    curl_easy_cleanup(curl);
    // close the file
    fclose(fp);
    return res;
}
// read the image URLs (one per line) from the url file
void get_img_url(string filename, vector<string> &imglist)
{
    FILE *fp = fopen(filename.c_str(), "rb");
    if (!fp)
        return;
    char url[1024] = {0};
    while (fgets(url, 1024, fp))
    {
        cout << url << endl;
        imglist.emplace_back(url);
    }
    fclose(fp);
}
// Run a shell command and capture its stdout into the caller's buffer.
// Note: stderr is not captured, so e.g. `ls` on a missing file yields no output here.
static int popenRead(const char *cmd, char *buffer)
{
    int rv = -1;
    FILE *fp;
    printf("cmd = %s\n", cmd);
    fp = popen(cmd, "r");
    if (fp)
    {
        printf("popen success\n");
        fread(buffer, 1, 1023, fp);  // caller passes a 1024-byte buffer
        rv = 0;
        pclose(fp);
    }
    else
    {
        printf("popen %s fail: %d,%s\n", cmd, errno, strerror(errno));
    }
    return rv;
}
// images must be fetched with GET, not POST
int main()
{
    vector<string> imageList;
    get_img_url(TXT, imageList);
    char buf[1024] = {0};
    char cmd[96] = {0};
    // check whether the download folder exists in the current directory; create it if not
    if (popenRead("ls", buf) == 0)
    {
        if (!strstr(buf, DIR))
        {
            printf("mkdir %s\n", DIR);
            memset(cmd, 0, 96);
            sprintf(cmd, "mkdir %s", DIR);
            system(cmd);
        }
    }
    int count = 1;
    // equivalently: for (auto it = imageList.begin(); it != imageList.end(); ++it)
    for (size_t i = 0; i < imageList.size(); i++)
    {
        char imgpath[96] = {0};
        sprintf(imgpath, "%s/%d.jpg", DIR, count++);
        // strip leading and trailing whitespace (fgets keeps the trailing newline)
        static const char whitespace[] = " \n\t\v\r\f";
        imageList[i].erase(0, imageList[i].find_first_not_of(whitespace));
        imageList[i].erase(imageList[i].find_last_not_of(whitespace) + 1U);
        auto res3 = get_curl_post_req(imageList[i], "", imgpath);
        if (res3 == CURLE_OK)
        {
            cout << "downloaded!" << endl;
        }
        else if (res3 == CURLE_SEND_ERROR)
        {
            cout << "already downloaded!" << endl;
        }
        else
        {
            cout << "download error!" << endl;
        }
    }
    return 0;
}
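For completeness, the same save-then-curl idea can also be driven from Python by shelling out to the curl binary. A minimal sketch (it assumes curl is installed and, like the C++ program above, simply numbers the output files):

import os
import subprocess

URL_FILE = "url.txt"   # same url list as above
OUT_DIR = "url"        # same output folder as above

os.makedirs(OUT_DIR, exist_ok=True)
with open(URL_FILE) as f:
    for count, line in enumerate(f, start=1):
        url = line.strip()
        if not url:
            continue
        path = os.path.join(OUT_DIR, "%d.jpg" % count)
        if os.path.exists(path):
            continue  # skip files that are already downloaded
        # -L follows redirects, -f fails on HTTP errors, --max-time caps a hung transfer
        subprocess.run(["curl", "-L", "-f", "--max-time", "30", "-o", path, url])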