用Jsoup抓取长颈鹿但丁图片-CSDN博客

本文详细介绍了如何使用Java通过Jsoup库批量抓取并下载长颈鹿但丁漫画网站上的图片，包括分页抓取、多线程下载及保存至本地的操作流程。

(官网似乎已改版，此代码没用了)

1、在pom文件中配置依赖，或者直接添加jsoup-1.6.3.jar包：

	<!-- jsoup HTML parser (version 1.6.3); the scraper class in this article imports org.jsoup.* -->
	<dependencies>
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.6.3</version>
		</dependency>
	</dependencies>

2、抓取长颈鹿但丁图片URL:

package com.sxit.jsoup;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Collects the URLs of the JPEG images shown on the danding.com.cn comic
 * listing pages.
 *
 * <p>Each of the four categories (philosophy, workplace, love, parody) has a
 * first page named {@code pic_fl_<n>.html}; follow-up pages append a single
 * letter suffix ({@code pic_fl_<n>a.html}, {@code pic_fl_<n>b.html}, ...).
 *
 * Author: smile, Nov 11, 2012
 */
public class jsoupPic {

	/** Site root that every scraped {@code src} attribute is appended to. */
	private static final String BASE_URL = "http://www.danding.com.cn/";

	/** Number of categories to walk; listing pages are numbered 1..4. */
	private static final int CATEGORY_COUNT = 4;

	/** Suffix letters of the follow-up pages, in the order they are visited. */
	private static final String PAGE_SUFFIXES = "abcdefghijklmnopqrstuvwxyz";

	/**
	 * Walks every category and every page of each category and gathers the
	 * image URLs found on them.
	 *
	 * <p>If fetching any page fails with an {@link IOException}, the stack trace
	 * is printed and the URLs gathered so far are returned.
	 *
	 * @return the collected image URLs, never {@code null}
	 */
	public static List<String> getDocument() {

		List<String> list = new ArrayList<String>();
		try {
			for (int i = 1; i <= CATEGORY_COUNT; i++) {
				String pagePrefix = BASE_URL + "pic_fl_" + i;
				// First page of the category has no letter suffix.
				Document doc = traverse(pagePrefix + ".html", list);

				// Follow the "next page" marker, but never past the last known
				// suffix: indexing beyond 'z' used to throw an unchecked
				// ArrayIndexOutOfBoundsException that escaped the IOException handler.
				for (int k = 0; k < PAGE_SUFFIXES.length() && isExistsNextPage(doc); k++) {
					doc = traverse(pagePrefix + PAGE_SUFFIXES.charAt(k) + ".html", list);
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return list;
	}

	/**
	 * Tells whether the page advertises a following page.
	 *
	 * @param doc the parsed page to inspect
	 * @return {@code true} if some element's own text contains the "next page"
	 *         label, {@code false} otherwise
	 */
	public static boolean isExistsNextPage(Document doc) {
		Elements nextPageMarkers = doc.select(":containsOwn(下一页)");
		return nextPageMarkers.size() > 0;
	}

	/**
	 * Fetches one page and appends the URL of every {@code img} whose
	 * {@code src} ends in {@code .jpg} to {@code list}.
	 *
	 * @param src  URL of the page to fetch
	 * @param list receives the image URLs found on the page
	 * @return the parsed page, so the caller can check for a following page
	 * @throws IOException if the page cannot be fetched
	 */
	public static Document traverse(String src, List<String> list) throws IOException {

		Document doc = Jsoup.connect(src).get();
		Elements images = doc.select("img[src$=.jpg]");

		for (int j = 0; j < images.size(); j++) {
			Element image = images.get(j);
			list.add(BASE_URL + image.attr("src"));
		}
		return doc;
	}
}

3、批量下载到本地：

package com.sxit.jsoup;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Batch downloader: gathers the image URLs via {@code jsoupPic.getDocument()}
 * and hands them to a fixed pool of {@code ImageThread} workers.
 *
 * Author: smile, Nov 11, 2012
 */
public class ThreadPoolManage {

	/**
	 * Number of workers. {@code ImageThread} splits the list into 20 slices
	 * using its own hard-coded 20, so this value must stay in sync with it.
	 */
	private static final int THREAD_COUNT = 20;

	final ExecutorService exec = Executors.newFixedThreadPool(THREAD_COUNT);

	/** Directory the images are saved into. */
	private String filePath;

	/** Image URLs to download. */
	private List<String> list;

	// Completion latch: each worker counts it down once when it finishes.
	final CountDownLatch stop = new CountDownLatch(THREAD_COUNT);

	public static void main(String[] args) {
		new ThreadPoolManage("D://xxooThread");
	}

	/**
	 * Collects the image URLs, starts one worker per slice and blocks until
	 * every worker has reported completion (or this thread is interrupted).
	 *
	 * @param filePath directory the downloaded images are written to
	 */
	public ThreadPoolManage(String filePath) {

		// The field was previously declared but never assigned.
		this.filePath = filePath;
		this.list = jsoupPic.getDocument();

		try {
			// Worker i (1-based) handles the i-th twentieth of the list; the
			// last worker also takes the remainder.
			for (int i = 1; i <= THREAD_COUNT; i++) {
				this.exec.submit(new ImageThread(i, this.filePath, this.list, stop));
			}
			// Wait until the latch reaches zero.
			stop.await();
		} catch (InterruptedException e) {
			// Restore the interrupt flag so the caller can still observe it.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} finally {
			// Always release the pool; otherwise its non-daemon threads keep
			// the JVM alive after a failure.
			exec.shutdown();
		}
	}
}

/**
 * Worker that downloads one slice of the image URL list and saves each image
 * as {@code <filePath>/<listIndex>.jpg}.
 *
 * <p>The list is split into 20 slices; worker {@code index} (1-based) handles
 * slice {@code index}, and worker 20 additionally takes the remainder. The
 * latch passed in is counted down exactly once when {@link #run()} ends,
 * whatever happens inside it.
 */
class ImageThread implements Runnable {

	/** Number of slices the list is divided into (the launcher starts 20 workers). */
	private static final int THREAD_COUNT = 20;

	/** Connect and read timeout, in milliseconds. */
	private static final int TIMEOUT_MILLIS = 100 * 1000;

	private String filePath;

	private List<String> list;

	private int index;

	private final CountDownLatch stop;

	/**
	 * @param index    1-based slice number of this worker (1..20)
	 * @param filePath directory the images are written to
	 * @param ilistst  full list of image URLs shared by all workers
	 * @param stop     latch counted down once when this worker finishes
	 */
	public ImageThread(int index, String filePath, List<String> ilistst, CountDownLatch stop) {
		this.index = index;
		this.filePath = filePath;
		this.list = ilistst;
		this.stop = stop;
	}

	/**
	 * Downloads every image of this worker's slice. A failure on one image is
	 * reported on standard output and does not stop the remaining downloads.
	 */
	@Override
	public void run() {
		try {
			if (list == null) {
				return;
			}

			// Create the target directory (including missing parents) if needed.
			File dir = new File(filePath);
			if (!dir.exists()) {
				dir.mkdirs();
			}

			// Half-open slice [start, end) over zero-based list indices. The
			// previous arithmetic started at (index - 1) * count + 1 and so
			// never downloaded element 0.
			int count = list.size() / THREAD_COUNT;
			int start = (index - 1) * count;
			int end = (index != THREAD_COUNT) ? index * count : list.size();

			for (int i = start; i < end; i++) {
				download(i);
			}
		} finally {
			// Always signal completion; otherwise the thread waiting on the
			// latch would block forever.
			this.stop.countDown();
		}
	}

	/** Downloads the image at list position {@code i}, releasing all resources afterwards. */
	private void download(int i) {
		InputStream is = null;
		OutputStream os = null;
		HttpURLConnection con = null;
		try {
			URL url = new URL(list.get(i));
			con = (HttpURLConnection) url.openConnection();
			con.setConnectTimeout(TIMEOUT_MILLIS);
			con.setReadTimeout(TIMEOUT_MILLIS);
			is = new BufferedInputStream(con.getInputStream());
			os = new BufferedOutputStream(new FileOutputStream(new File(filePath + "/" + i + ".jpg")));
			byte[] b = new byte[1024];
			int length;
			while ((length = is.read(b)) != -1) {
				os.write(b, 0, length);
			}
			os.flush();
			System.out.println(index + "号线程----------------->>>>>>>保存完第" + i + "张");
		} catch (Exception e) {
			// Deliberately broad: any failure on one image is reported (with the
			// index that actually failed) and the worker moves on to the next one.
			System.out.println(index + "号线程跑到第" + i + "张图片+++++++++++++抛出异常,异常信息为：" + e.getMessage());
		} finally {
			// Close per image; closing only once after the loop leaked every
			// stream except the last pair.
			closeQuietly(os);
			closeQuietly(is);
			if (con != null) {
				con.disconnect();
			}
		}
	}

	/** Closes {@code resource} if it is non-null, printing rather than propagating a close failure. */
	private static void closeQuietly(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

4、源码如下：