This post covers two approaches. The simple one is for sites with no anti-scraping measures, where the urllib and lxml libraries are enough. The other is for sites like Sina Weibo, where collecting all of a blogger's images requires scrolling the page down repeatedly, so we use Selenium to automate the scrolling and clicking.
Simple version:
import os
import ssl
import urllib.error
import urllib.parse
import urllib.request
from lxml import etree
# Disable HTTPS certificate verification globally so downloads do not fail
# on sites with certificate problems (acceptable for a scraping script).
ssl._create_default_https_context = ssl._create_unverified_context
def checkfile(filepath):
    # Create the folder if it does not exist and return its path.
    if os.path.exists(filepath):
        # shutil.rmtree(filepath)  # uncomment to wipe an existing folder first
        return filepath
    os.mkdir(filepath)
    return filepath
def download(url, user_agent="wswp", proxy=None, num_retries=2):
    # Fetch a page, optionally through a proxy, retrying on HTTP 5xx errors.
    print("Downloading:", url)
    headers = {"User-agent": user_agent}
    req = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(req).read()
    except urllib.error.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0:
            # retry only on server errors (5xx)
            if hasattr(e, "code") and 500 <= e.code < 600:
                return download(url, user_agent, proxy, num_retries - 1)
    return html
count = 1  # global counter used to name the downloaded files

def get_image(html, filepath):
    selector = etree.HTML(html)
    # Select the image URLs to scrape and collect them in a list.
    # In the browser, inspect an image, copy its XPath, then strip the indices
    # so the path matches every item instead of just one, e.g.
    # //*[@id="main"]/div[3]/ul/li[1]/a/span/img -> //*[@id="main"]/div/ul/li/a/span/img
    imgurls = selector.xpath('//*[@id="main"]/div/ul/li/a/span/img/@src')
    global count
    count = 1
    f = open("imgurl.txt", "a+")
    for i in imgurls:
        i = "https://pic.netbian.com" + i
        print(i)
        # save the image URL
        f.write(i + "\n")
        file_path = "C:/Users/78565/Desktop/py3/%s/" % filepath + "%d.jpg" % count
        if not os.path.exists(file_path):
            while True:
                try:
                    urllib.request.urlretrieve(i, file_path)
                    break
                except OSError:
                    print("net error, retrying")
        count += 1
    f.close()
url="https://pic.netbian.com/"
filepath="img"
k=checkfile("C:/Users/78565/Desktop/py3/"+filepath)#创建文件夹
try:
#下载网页
html=download(url)
#从网页提取图片并下载
get_image(html,filepath)
except:
print("tasks end!")
Selenium version
First, grab the login cookies. Download the Chrome driver (chromedriver) that matches your browser version, put it in the same directory as chrome.exe, and add that directory to the PATH environment variable, so the script can be run from anywhere instead of only from the chrome.exe directory.
from selenium import webdriver
import time
import json

# point Selenium at the Chrome binary
options = webdriver.ChromeOptions()
options.binary_location = r"C:\Users\78565\AppData\Local\Google\Chrome\Application\chrome.exe"
driver = webdriver.Chrome(options=options)
# remember to write the full url, including the http/https scheme
driver.get('https://weibo.com/')
# log in to your account manually within 60 seconds of the page opening
time.sleep(60)
with open('cookies.txt', 'w') as f:
    # save the cookies as JSON
    f.write(json.dumps(driver.get_cookies()))
driver.close()
Next, open the browser and scrape the images. The script below reloads cookies.txt from the previous step to stay logged in.
import json
import os
import time
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def scroll(browser):
    try:
        # dismiss the "network timeout, please click again" tip if it appears
        browser.find_element(By.XPATH, '//div[@class="m-tips m-tips-tp cursor"]').click()
    except Exception:
        pass
    try:
        # the bottom-of-page marker: if it is present, we have scrolled far enough
        browser.find_element(By.XPATH, '//div[@class="Bottom_text_1kFLe"]').click()
    except Exception:
        return False
    return True
def checkfile(filepath):
    # Create the folder if it does not exist and return its path.
    if os.path.exists(filepath):
        # shutil.rmtree(filepath)  # uncomment to wipe an existing folder first
        return filepath
    os.mkdir(filepath)
    return filepath
def get_photo(lis, filepath):
    x = 1
    for i in lis:
        i = i.strip("\n")
        print(i)
        # use the tail of the URL as the file name
        imgname = i[-21:]
        print("starting download")
        file_path = "C:/Users/78565/Desktop/py3/%s/" % filepath + imgname
        if not os.path.exists(file_path):
            try:
                urllib.request.urlretrieve(i, file_path)
            except OSError:
                # a failed image is skipped; rerun the script to retry it
                pass
        print("image %s downloaded!" % x)
        x += 1
        time.sleep(1)
def get_img(browser, txt):
    for times in range(2):
        # if we are not yet at the bottom, keep dragging the scrollbar down
        while not scroll(browser):
            browser.execute_script("window.scrollBy(0,3000)")
            time.sleep(1)
        time.sleep(5)
    print("*** searching ***")
    # Find every img element under the album div.
    # Inspect an image in the browser, copy its XPath, then strip the indices,
    # e.g. //*[@id="main"]/div[3]/ul/li[1]/a/span/img -> //*[@id="main"]/div/ul/li/a/span/img
    a = browser.find_elements(By.XPATH, '//*[@id="app"]/div/div/div/main/div/div/div/div/div/div/div/div/div/div/div/div/div/img')
    lis_2 = []
    with open(txt, "w") as f:
        for x in a:
            key = x.get_attribute("src")
            # swap the thumbnail path for the full-size image
            key = key.replace("orj360", "large")
            print(key)
            f.write(key + "\n")
            lis_2.append(key)
    # number of images found
    print(len(lis_2))
    return lis_2
if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    options.binary_location = r"C:\Users\78565\AppData\Local\Google\Chrome\Application\chrome.exe"
    url = "https://weibo.com/u/2616380702?tabtype=album"
    txt = "{}.txt".format(url[20:30])  # name the url list after the user id
    driver = webdriver.Chrome(options=options)
    driver.get("http://photo.weibo.com/")
    # 1. delete the fresh cookies
    driver.delete_all_cookies()
    # 2. add the cookies saved in the previous step
    with open('cookies.txt', 'r') as f:
        # the file object holds JSON, so use json.load rather than json.loads
        cookies_list = json.load(f)
        for cookie in cookies_list:
            # the 'expiry' field trips up add_cookie, so just drop it
            if 'expiry' in cookie:
                del cookie['expiry']
            driver.add_cookie(cookie)
    # reopen the target page, now logged in
    driver.get(url)
    imglist = get_img(driver, txt)
    k = checkfile("C:/Users/78565/Desktop/py3/" + url[20:30])  # create the folder
    get_photo(imglist, url[20:30])
    print("Finished!")
Python's networking libraries sometimes hang, so an alternative is to save the image URLs first and then download them with curl. On Linux this can be done with the following libcurl-based C++ program (a Python wrapper around the curl command line is sketched after it):
#include<iostream>
#include<string>
#include<vector>
#include<curl/curl.h>
#include <sys/stat.h>
#include<string.h>
using namespace std;
//网址文件
#define TXT "url.txt"
//保存图片的文件夹
#define DIR "url"
// write callback: append the received body bytes to the open file
size_t dl_req_reply(void *buffer, size_t size, size_t nmemb, void *user_p)
{
    FILE *fp = (FILE *)user_p;
    size_t return_size = fwrite(buffer, size, nmemb, fp);
    //cout << (char *)buffer << endl;
    return return_size;
}
// download a file with an HTTP POST request
CURLcode dl_curl_post_req(const string &url, const string &postParams, string filename)
{
    FILE *fp = fopen(filename.c_str(), "wb");
    if (!fp)
        return CURLE_WRITE_ERROR;
    // init curl
    CURL *curl = curl_easy_init();
    // curl return code
    CURLcode res = CURLE_FAILED_INIT;
    if (curl)
    {
        // set the request headers
        struct curl_slist *header_list = NULL;
        header_list = curl_slist_append(header_list, "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.63.6788.400 QQBrowser/10.3.2843.400");
        header_list = curl_slist_append(header_list, "Content-Type: application/x-www-form-urlencoded; charset=UTF-8");
        curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);
        // 0 = do not write response headers into the body, 1 = write them
        curl_easy_setopt(curl, CURLOPT_HEADER, 0);
        // make it a POST request
        curl_easy_setopt(curl, CURLOPT_POST, 1);
        // request URL
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        // POST body
        curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postParams.c_str());
        // skip SSL verification
        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
        // CURLOPT_VERBOSE = 1 prints detailed debug information
        curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
        curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
        // receive callback and its user pointer
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &dl_req_reply);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
        curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
        // timeouts
        //curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 6);
        //curl_easy_setopt(curl, CURLOPT_TIMEOUT, 6);
        // perform the POST request
        res = curl_easy_perform(curl);
        curl_slist_free_all(header_list);
    }
    // release curl
    curl_easy_cleanup(curl);
    // close the file
    fclose(fp);
    return res;
}
// write callback for the GET download
size_t get_reply(void *buffer, size_t size, size_t nmemb, void *user_p)
{
    FILE *fp = (FILE *)user_p;
    size_t return_size = fwrite(buffer, size, nmemb, fp);
    //cout << (char *)buffer << endl;
    return return_size;
}
// download a file with an HTTP GET request
// (the name is historical; main() uses this one, since images must be fetched with GET)
CURLcode get_curl_post_req(const string &url, const string &postParams, char *filename)
{
    (void)postParams;  // not used for a GET request
    // skip files that already exist and are non-empty
    printf("%s \n", filename);
    struct stat st;
    size_t len;
    if (stat(filename, &st))
        len = 0;
    else
        len = static_cast<size_t>(st.st_size);
    printf("len %zu \n", len);
    if (len > 0)
    {
        // reuse CURLE_SEND_ERROR as an "already downloaded" sentinel
        return CURLE_SEND_ERROR;
    }
    FILE *fp = fopen(filename, "w+b");
    if (!fp)
        return CURLE_WRITE_ERROR;
    // init curl
    CURL *curl = curl_easy_init();
    // curl return code
    CURLcode res = CURLE_FAILED_INIT;
    if (curl)
    {
        // set the request headers
        struct curl_slist *header_list = NULL;
        header_list = curl_slist_append(header_list, "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.63.6788.400 QQBrowser/10.3.2843.400");
        header_list = curl_slist_append(header_list, "Content-Type: application/x-www-form-urlencoded; charset=UTF-8");
        curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list);
        // 0 = do not write response headers into the body, 1 = write them
        curl_easy_setopt(curl, CURLOPT_HEADER, 0);
        // request URL
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        // skip SSL verification
        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
        curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
        // CURLOPT_VERBOSE = 1 prints detailed debug information
        curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
        // no request body to send for a GET
        curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
        curl_easy_setopt(curl, CURLOPT_READDATA, NULL);
        // receive callback and its user pointer
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &get_reply);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
        curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
        // timeouts
        //curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 6);
        //curl_easy_setopt(curl, CURLOPT_TIMEOUT, 6);
        // perform the request (GET by default)
        res = curl_easy_perform(curl);
        curl_slist_free_all(header_list);
    }
    // release curl
    curl_easy_cleanup(curl);
    // close the file
    fclose(fp);
    return res;
}
// read the image URLs (one per line) from the url file
void get_img_url(string filename, vector<string> &imglist)
{
    FILE *fp = fopen(filename.c_str(), "rb");
    if (!fp)
        return;
    char url[1024] = {0};
    while (fgets(url, 1024, fp))
    {
        cout << url << endl;
        imglist.emplace_back(url);
    }
    fclose(fp);
}
// Run a shell command and capture its stdout into the caller's buffer.
// Note: stderr is not captured, so e.g. `ls` on a missing file yields no output here.
static int popenRead(const char *cmd, char *buffer)
{
    int rv = -1;
    FILE *fp;
    printf("cmd = %s\n", cmd);
    fp = popen(cmd, "r");
    if (fp)
    {
        printf("popen success\n");
        fread(buffer, 1, 1023, fp);  // caller passes a 1024-byte buffer
        rv = 0;
        pclose(fp);
    }
    else
    {
        printf("popen %s fail: %d,%s\n", cmd, errno, strerror(errno));
    }
    return rv;
}
// images must be fetched with GET, not POST
int main()
{
    vector<string> imageList;
    get_img_url(TXT, imageList);
    char buf[1024] = {0};
    char cmd[96] = {0};
    // check whether the download folder exists in the current directory; create it if not
    if (popenRead("ls", buf) == 0)
    {
        if (!strstr(buf, DIR))
        {
            printf("mkdir %s\n", DIR);
            memset(cmd, 0, 96);
            sprintf(cmd, "mkdir %s", DIR);
            system(cmd);
        }
    }
    int count = 1;
    // equivalently: for (auto it = imageList.begin(); it != imageList.end(); ++it)
    for (size_t i = 0; i < imageList.size(); i++)
    {
        char imgpath[96] = {0};
        sprintf(imgpath, "%s/%d.jpg", DIR, count++);
        // strip leading and trailing whitespace (fgets keeps the trailing newline)
        static const char whitespace[] = " \n\t\v\r\f";
        imageList[i].erase(0, imageList[i].find_first_not_of(whitespace));
        imageList[i].erase(imageList[i].find_last_not_of(whitespace) + 1U);
        auto res3 = get_curl_post_req(imageList[i], "", imgpath);
        if (res3 == CURLE_OK)
        {
            cout << "downloaded!" << endl;
        }
        else if (res3 == CURLE_SEND_ERROR)
        {
            cout << "already downloaded!" << endl;
        }
        else
        {
            cout << "download error!" << endl;
        }
    }
    return 0;
}
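For completeness, the same save-then-curl idea can also be driven from Python by shelling out to the curl binary. A minimal sketch (it assumes curl is installed and, like the C++ program above, simply numbers the output files):

import os
import subprocess

URL_FILE = "url.txt"   # same url list as above
OUT_DIR = "url"        # same output folder as above

os.makedirs(OUT_DIR, exist_ok=True)
with open(URL_FILE) as f:
    for count, line in enumerate(f, start=1):
        url = line.strip()
        if not url:
            continue
        path = os.path.join(OUT_DIR, "%d.jpg" % count)
        if os.path.exists(path):
            continue  # skip files that are already downloaded
        # -L follows redirects, -f fails on HTTP errors, --max-time caps a hung transfer
        subprocess.run(["curl", "-L", "-f", "--max-time", "30", "-o", path, url])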