Jsoup HttpClient 抓取网络上的图片

本文介绍了使用 Jsoup 和 HttpClient 库抓取网页中的图片,并将图片保存到指定目录的过程。详细展示了链接处理、线程操作及图片保存的实现。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

package com.th.spider.test; import java.io.BufferedOutputStream; import java.io.FileOutputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.util.EntityUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class Exmaple3 { private static final Log log = LogFactory.getLog(Exmaple3.class); /** * 抓取图片存放目录 */ private static final String PIC_DIR = "/home/li/pic"; /** * 链接超时 */ private static final int TIME_OUT = 5000; static void go3(String url) throws Exception { Connection conn= Jsoup.connect(url); Document doc = conn.get(); Elements links = doc.select("div.piclist img[src]"); for(int i=0;i<links.size();i++){ Element element = links.get(i); final String imgUrl = element.attr("src"); log.info(imgUrl); Thread.sleep(500); new Thread(new Runnable() { public void run() { try { save(imgUrl); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } }).start(); } } static void go2(String url) throws Exception { Connection conn= Jsoup.connect(url); Document doc = conn.get(); Elements links = doc.select("div.cc a[href]"); for(int i=0;i<links.size();i++){ Element element = links.get(i); final String dirUrl = "http://www.3lian.com"+element.attr("href"); log.info(dirUrl); Thread.sleep(500); new Thread(new Runnable() { public void run() { try { Connection conn= Jsoup.connect(dirUrl); Document doc = conn.get(); Elements images = doc.select("div.mb_jjnr img[src]"); for(int j=0;j<images.size();j++){ Element img = images.get(j); String imgUrl = img.attr("src"); log.info(imgUrl); 
save(imgUrl); } } catch (Exception e) { e.printStackTrace(); } } }).start(); } } /** * 处理帖子URL * @param url * @throws Exception */ static void go(String url) throws Exception { // JSOP创建链接 Connection conn = Jsoup.connect(url); // 请求返回整个文档对象 Document doc = conn.post(); // 选择所有class=zoom 的img标签对象 Elements imgs = doc.select("img[class=zoom]"); // 循环每个img标签 for (int i = 0; i < imgs.size(); i++) { Element img = imgs.get(i); // 取得图片的下载地址 String picURL = doc.baseUri() + img.attr("file"); log.info(picURL); // 保存图片 save(picURL); } } //<img src="static/image/common/none.gif" file="data/attachment/forum/201105/08/174412nz3jq4z90s33s2t0.jpg" width="770" class="zoom" onclick="zoom(this, this.src)" id="aimg_180565" onmouseover="showMenu({'ctrlid':this.id,'pos':'12'})" alt="img_src_29620.jpg" title="img_src_29620.jpg" /> //doc.select("img[class=zoom]") /** * 保存图片 * @param url * @param i * @throws Exception */ static void save(String url) throws Exception { String fileName = url.substring(url.lastIndexOf("/")); String filePath = PIC_DIR + "/" + fileName; BufferedOutputStream out = null; byte[] bit = getByte(url); if (bit.length > 0) { try { out = new BufferedOutputStream(new FileOutputStream(filePath)); out.write(bit); out.flush(); log.info("Create File success! 
[" + filePath + "]"); } finally { if (out != null) out.close(); } } } /** * 获取图片字节流 * @param uri * @return * @throws Exception */ static byte[] getByte(String uri) throws Exception { HttpClient client = new DefaultHttpClient(); client.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIME_OUT); HttpGet get = new HttpGet(uri); get.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIME_OUT); try { HttpResponse resonse = client.execute(get); if (resonse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { HttpEntity entity = resonse.getEntity(); if (entity != null) { return EntityUtils.toByteArray(entity); } } } catch (Exception e) { e.printStackTrace(); } finally { client.getConnectionManager().shutdown(); } return new byte[0]; } public static void main(String[] args) throws Exception { // 开始抓取图片 go2("http://www.3lian.com/gif/more/03/0301.html"); //go3("http://www.ivsky.com/tupian/nvxing_gouwu_qingjing_v6969/"); } }
需要的主要jar包

httpclient-4.0.1.jar(及其依赖 httpcore、commons-logging)、jsoup-1.5.2.jar

go、go2、go3 分别对应不同页面结构的抓取,仔细对比各方法中使用的选择器就知道原因了。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值