Java爬取网页图片携带header认证访问

Java爬虫爬取网页数据
一.简单介绍爬虫
网络爬虫(Web Crawler),又称为网络蜘蛛(Web Spider)或 Web 信息采集器,是一种按照一定规则,自动抓取或下载网络信息的计算机程序或自动化脚本,是目前搜索引擎的重要组成部分。
我的这个demo 是基于Jsoup做个简单实现java爬虫的
jsoup是一款Java的HTML解析器,主要用来对HTML解析 jsoup 中文官网

二.依赖

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <!-- 文件下载 -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.5</version>
        </dependency>

三.代码

package com.wxw.springboot;

import org.junit.Assert;
import org.junit.Test;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;

public class TestCatchImage {

    /**
     * 测试携带请求头
     */
    @Test
    public void testDownloadImageWithHeaders() {
        String baiduLogoUrl = "xxxxx.png";
        File localFile = new File("D:\\javaStudy\\springboot\\" + "scenery1.png");
        Map<String, String> headers = new HashMap<String, String>();
//		headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
//		headers.put("Accept-Encoding", "gzip, deflate");
//		headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
//		headers.put("Cache-Control", "max-age=0");
//		headers.put("Connection", "keep-alive");
//		headers.put("Cookie", "referer=%22https%3A%5C%2F%5C%2Fwww.58pic.com%5C%2Ftupian%5C%2FBMP.html%22; qt_visitor_id=%228d800369135f0ea407efc30c35b855c0%22; qt_uid=0; qt_type=0; user-browser=%22baidu%22; history_search=%22%7B%5C%22BMP_%5C%22%3A%5C%22%5C%5C%5C%2Ftupian%5C%5C%5C%2FBMP.html%5C%22%7D%22; awake=0; qiantudata2018jssdkcross=%7B%22distinct_id%22%3A%2216956b4d4b859f-0f2dc330b57f7c-3a3a5c0e-2073600-16956b4d4b92bd%22%2C%22props%22%3A%7B%22latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22latest_referrer_host%22%3A%22www.baidu.com%22%2C%22latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; qt_utime=1551937913; FIRSTVISITED=1551937885.528; showAd:8d800369135f0ea407efc30c35b855c0=%22w6SIEgLKiJOIC5HVD3fKoJHKodaWmZy8mtm4zJbLytqWn5vMyZmWyZm4yJG4nwmWiIWIywr5zxj3AxnLCL2Pzci9iJeIlcj3DxjUiJOImsiSiNnOB6DFDgLTzxmIoJmSiMXHC6rFC5HVD423Aw4LiJOXntuXotm6ote3Fv3%3D%22");
//		headers.put("Host", "pic.58pic.com");
//		headers.put("If-Modified-Since", "Tue, 22 Jul 2014 15:40:16 GMT");
//		headers.put("If-None-Match", "ae01753237194fa9a5badf2d90fd20d6");
//		headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
        headers.put("Referer", "https://www.huzhan.com/");
//		headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
//		headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
//		headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
//		headers.put("Upgrade-Insecure-Requests", "1");
//		headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
        Assert.assertTrue(downloadImageWithHeaders(baiduLogoUrl, "png", localFile, headers));
    }

    /**
     * 携带头信息下载网络图片
     * @param formatName 文件格式名称
     * @param localFile 下载到本地文件
     * @param headers http协议交互中header信息,如Cookie
     * @return 下载是否成功
     */
    public static boolean downloadImageWithHeaders(String imageUrl, String formatName, File localFile, Map<String, String> headers) {
        boolean isSuccess = false;
        InputStream stream = null;
        try {
            URL url = new URL(imageUrl);
            URLConnection conn = url.openConnection();
            if (headers != null && !headers.isEmpty()) {
                //设置头信息
                for (Map.Entry<String, String> entry : headers.entrySet()) {
                    conn.setRequestProperty(entry.getKey(), entry.getValue());
                }
            }
            conn.setDoInput(true);
            stream = conn.getInputStream();
            BufferedImage bufferedImg = ImageIO.read(stream);
            if (bufferedImg != null) {
                isSuccess = ImageIO.write(bufferedImg, formatName, localFile);
            } else {
                throw new RuntimeException("图片[" + imageUrl + "]下载失败");
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return isSuccess;
    }
}

注意爬取的图片为jpg 输入jpg

四.总结

要在有html元素有的情况下,才能通过jsoup来爬虫,如果是这接口获得的数据,那么通过jsoup是无法获取到的。
java爬虫在爬取网络的数据用到了jsoup中的方法,在下载css,img和创建html就用到了I/O流。

在Python中使用Scrapy框架爬取百度或其他网站时,设置HTTP头(Header)是非常常见的,因为很多网站为了防止爬虫滥用其服务,可能会检查请求头中的User-Agent信息。如果你不设置或使用默认的User-Agent,可能会导致IP被封禁或者返回错误响应。 以下是如何在Scrapy项目中设置自定义请求头的步骤: 1. 首先,确保你已经安装了Scrapy库,如果没有,可以通过`pip install scrapy`进行安装。 2. 在你的Scrapy项目目录下,打开`settings.py`文件。这是Scrapy全局配置的地方。 3. 找到`DOWNLOADER_MIDDLEWARES`设置,这是一个字典,用于添加中间件(Middleware)。中间件可以在下载过程中处理请求和响应。 4. 添加一个新的中间件,名为`CustomHeadersMiddleware`,并设置`USER_AGENT`字段。这里是一个示例: ```python # settings.py DOWNLOADER_MIDDLEWARES = { 'your_project.middlewares.CustomHeadersMiddleware': 543, # 设置权重,数值越大优先级越高 } class CustomHeadersMiddleware: def process_request(self, request, spider): request.headers.setdefault('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3') ``` 在这个例子中,我们设置了User-Agent为Chrome浏览器的一个常见值。你可以根据实际需求修改这个字符串,让它看起来像真实用户的浏览器请求。 5. 保存更改后,重启Scrapy爬虫,新的请求头就会应用到所有的爬取操作中。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值