Preparation
Pick a suitable website and use Jsoup to read the page's DOM elements.
Target site: http://www.win4000.com/zt/meinv.html
Goal: crawl the photo gallery images.
Create a new project with Spring Boot and add the dependencies:
```xml
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.7</version>
    </dependency>
</dependencies>
```
Open the page
Use the browser's element inspector to examine the page structure and find the DOM nodes that contain the images.

The a tags sit at the following DOM path:
`.Left_bar .tab_tj .tab_box ul li a`
Parse the page with Jsoup in three steps (a quick sketch of step 1 follows this list):
- Step 1: collect the href of every gallery link on the current page
- Step 2: open each gallery page and read the src attribute of its img
- Step 3: download the image with a utility class
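As a sanity check for step 1, here is a minimal sketch that lists the gallery links. It assumes the page is reachable and the selector still matches the current markup, and it uses Jsoup's built-in connect rather than the HttpClientUtils class shown later:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) throws Exception {
        // Fetch the list page; a browser-like User-Agent avoids trivial blocking
        Document doc = Jsoup.connect("http://www.win4000.com/zt/meinv.html")
                .userAgent("Mozilla/5.0")
                .get();
        // Each matched <a> is the entry link of one gallery
        for (Element a : doc.select(".Left_bar .tab_tj .tab_box ul li a")) {
            System.out.println(a.attr("abs:href"));
        }
    }
}
```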
The full crawler code:
```java
import com.felix.project.util.FileUtils;
import com.felix.project.util.HttpClientUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.io.IOException;

@Slf4j
@Component
public class CrawlPage {

    private static final String HTTPURL = "http://www.win4000.com/zt/meinv_1.html";
    private static final String FILEPATH = "E:/jsoup/images/";

    public static void crawlImg() throws IOException {
        int count = 0;
        log.info("Start crawling --> target: " + HTTPURL);
        String html = HttpClientUtils.getHtml(HTTPURL, "utf-8");
        Document doc = Jsoup.parse(html);
        // Step 1: every matched <a> is the entry link of one gallery
        Elements aImgElement = doc.select(".Left_bar .tab_tj .tab_box ul li a");
        for (Element a : aImgElement) {
            String href = a.attr("href");
            int index = aImgElement.indexOf(a) + 1;
            // Strip the ".html" suffix so "_<page>.html" can be appended below
            // (String.split(".html") would treat the dot as a regex wildcard)
            String url = href.substring(0, href.lastIndexOf(".html"));
            log.info("Fetching gallery " + index);
            // The <em> in the gallery title holds the number of pages
            int page = Integer.parseInt(Jsoup.parse(HttpClientUtils.getHtml(href, "utf-8"))
                    .select(".Bigimg .ptitle em").text());
            int total = 0;
            for (int i = 1; i <= page; i++) {
                // Step 2: each page of a gallery shows exactly one image
                String imgHtml = HttpClientUtils.getHtml(url + "_" + i + ".html", "utf-8");
                Document imgDoc = Jsoup.parse(imgHtml);
                Elements select = imgDoc.select(".pic_main .col-main .main-wrap .pic-meinv a");
                String src = select.select("img").attr("src");
                String title = select.select("img").attr("title");
                // Step 3: save the image into a folder named after the gallery title
                FileUtils.downloadImage(src, FILEPATH + title, System.currentTimeMillis() + ".jpg");
                log.info("Downloaded image " + i + " of gallery " + index + " (" + title + ")");
                count++;
                total++;
            }
            log.info("Gallery " + index + " contains " + total + " images");
        }
        log.info("Fetched " + count + " images this run");
    }
}
```
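The post does not show how crawlImg() is triggered. One option, sketched under the assumption of a standard Spring Boot entry point (the class name CrawlerApplication is invented for this example), is a CommandLineRunner that starts the crawl once the application context is up:

```java
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class CrawlerApplication implements CommandLineRunner {

    public static void main(String[] args) {
        SpringApplication.run(CrawlerApplication.class, args);
    }

    @Override
    public void run(String... args) throws Exception {
        // Start the crawl after startup; crawlImg is static on CrawlPage
        CrawlPage.crawlImg();
    }
}
```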
Utility classes
FileUtils downloads an image:
```java
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

public class FileUtils {

    public static void downloadImage(String url, String filePath, String fileName) {
        try {
            URL u = new URL(url);
            URLConnection connection = u.openConnection();
            connection.setReadTimeout(60000);
            // A browser-like User-Agent keeps the image host from rejecting the request
            connection.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

            // Create the target directory if it does not exist yet
            File dir = new File(filePath);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            File file = new File(filePath + File.separator + fileName);

            // Stream the image to disk in 1 KB chunks; try-with-resources
            // closes both streams even if the copy fails halfway
            try (InputStream in = connection.getInputStream();
                 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
                byte[] buf = new byte[1024];
                int size;
                while (-1 != (size = in.read(buf))) {
                    out.write(buf, 0, size);
                }
            }
        } catch (IOException e) {
            // MalformedURLException and FileNotFoundException are IOExceptions too
            e.printStackTrace();
        }
    }
}
```
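A one-off usage example; the image URL, directory, and file name below are placeholders for illustration, not values taken from the target site:

```java
public class FileUtilsDemo {
    public static void main(String[] args) {
        // Placeholder values: any reachable image URL works the same way
        FileUtils.downloadImage(
                "https://www.example.com/sample.jpg", // image URL (placeholder)
                "E:/jsoup/images/demo",               // target directory, created if missing
                "sample.jpg");                        // file name on disk
    }
}
```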
HttpClientUtils fetches a page's HTML:
```java
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpClientUtils {

    public static String getHtml(String url, String charset) throws IOException {
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        // Send a browser-like User-Agent so the site serves the normal page
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        CloseableHttpResponse response = client.execute(httpGet);
        HttpEntity entity = response.getEntity();
        String html = EntityUtils.toString(entity, charset);

        response.close();
        client.close();
        return html;
    }
}
```
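getHtml creates and closes a fresh CloseableHttpClient on every call, which is simple but throws away the connection pool each time. For many requests against the same host, a single shared client is cheaper; a sketch of that variant (not from the original post; CloseableHttpClient is safe to share across threads):

```java
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class SharedHttpClientUtils {

    // One client for the whole application, reused across requests
    private static final CloseableHttpClient CLIENT = HttpClients.createDefault();

    public static String getHtml(String url, String charset) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
        // Closing the response releases the connection back to the pool;
        // the shared client itself stays open
        try (CloseableHttpResponse response = CLIENT.execute(httpGet)) {
            return EntityUtils.toString(response.getEntity(), charset);
        }
    }
}
```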