Java爬取网页图片携带header认证访问

最新推荐文章于 2024-07-30 03:18:53 发布

原创最新推荐文章于 2024-07-30 03:18:53 发布 · 309 阅读

CC 4.0 BY-SA版权

文章标签：

Java爬虫爬取网页数据
一.简单介绍爬虫
网络爬虫（Web Crawler），又称为网络蜘蛛（Web Spider）或 Web 信息采集器，是一种按照一定规则，自动抓取或下载网络信息的计算机程序或自动化脚本，是目前搜索引擎的重要组成部分。
我的这个demo 是基于Jsoup做个简单实现java爬虫的
jsoup是一款Java的HTML解析器，主要用来对HTML解析 jsoup 中文官网

二.依赖

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <!-- 文件下载 -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.5</version>
        </dependency>

三.代码

package com.wxw.springboot;

import org.junit.Assert;
import org.junit.Test;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;

public class TestCatchImage {

    /**
     * 测试携带请求头
     */
    @Test
    public void testDownloadImageWithHeaders() {
        String baiduLogoUrl = "xxxxx.png";
        File localFile = new File("D:\\javaStudy\\springboot\\" + "scenery1.png");
        Map<String, String> headers = new HashMap<String, String>();
//		headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
//		headers.put("Accept-Encoding", "gzip, deflate");
//		headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
//		headers.put("Cache-Control", "max-age=0");
//		headers.put("Connection", "keep-alive");
//		headers.put("Cookie", "referer=%22https%3A%5C%2F%5C%2Fwww.58pic.com%5C%2Ftupian%5C%2FBMP.html%22; qt_visitor_id=%228d800369135f0ea407efc30c35b855c0%22; qt_uid=0; qt_type=0; user-browser=%22baidu%22; history_search=%22%7B%5C%22BMP_%5C%22%3A%5C%22%5C%5C%5C%2Ftupian%5C%5C%5C%2FBMP.html%5C%22%7D%22; awake=0; qiantudata2018jssdkcross=%7B%22distinct_id%22%3A%2216956b4d4b859f-0f2dc330b57f7c-3a3a5c0e-2073600-16956b4d4b92bd%22%2C%22props%22%3A%7B%22latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22latest_referrer_host%22%3A%22www.baidu.com%22%2C%22latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; qt_utime=1551937913; FIRSTVISITED=1551937885.528; showAd:8d800369135f0ea407efc30c35b855c0=%22w6SIEgLKiJOIC5HVD3fKoJHKodaWmZy8mtm4zJbLytqWn5vMyZmWyZm4yJG4nwmWiIWIywr5zxj3AxnLCL2Pzci9iJeIlcj3DxjUiJOImsiSiNnOB6DFDgLTzxmIoJmSiMXHC6rFC5HVD423Aw4LiJOXntuXotm6ote3Fv3%3D%22");
//		headers.put("Host", "pic.58pic.com");
//		headers.put("If-Modified-Since", "Tue, 22 Jul 2014 15:40:16 GMT");
//		headers.put("If-None-Match", "ae01753237194fa9a5badf2d90fd20d6");
//		headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
        headers.put("Referer", "https://www.huzhan.com/");
//		headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
//		headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
//		headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
//		headers.put("Upgrade-Insecure-Requests", "1");
//		headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
        Assert.assertTrue(downloadImageWithHeaders(baiduLogoUrl, "png", localFile, headers));
    }

    /**
     * 携带头信息下载网络图片
     * @param formatName 文件格式名称
     * @param localFile 下载到本地文件
     * @param headers http协议交互中header信息，如Cookie
     * @return 下载是否成功
     */
    public static boolean downloadImageWithHeaders(String imageUrl, String formatName, File localFile, Map<String, String> headers) {
        boolean isSuccess = false;
        InputStream stream = null;
        try {
            URL url = new URL(imageUrl);
            URLConnection conn = url.openConnection();
            if (headers != null && !headers.isEmpty()) {
                //设置头信息
                for (Map.Entry<String, String> entry : headers.entrySet()) {
                    conn.setRequestProperty(entry.getKey(), entry.getValue());
                }
            }
            conn.setDoInput(true);
            stream = conn.getInputStream();
            BufferedImage bufferedImg = ImageIO.read(stream);
            if (bufferedImg != null) {
                isSuccess = ImageIO.write(bufferedImg, formatName, localFile);
            } else {
                throw new RuntimeException("图片[" + imageUrl + "]下载失败");
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return isSuccess;
    }
}

注意爬取的图片为jpg 输入jpg