(官网似乎已改版,此代码没用了)
1、在 pom 文件中配置依赖,或者直接添加 jsoup-1.6.3.jar 包:
<dependencies> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.6.3</version> </dependency> </dependencies>
2、 抓取长颈鹿但丁图片URL:
package com.sxit.jsoup;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Collects the URLs of the "Giraffe Dante" comic images published on www.danding.com.cn.
 *
 * <p>Class: jsoupPic. Author: smile. Created: Nov 11, 2012 2:17:57 PM.
 */
public class jsoupPic {
    /** Site root; it is prepended verbatim to every image {@code src} attribute. */
    private static final String BASE_URL = "http://www.danding.com.cn/";

    /** Number of comic categories that are crawled (pic_fl_1 .. pic_fl_4). */
    private static final int CATEGORY_COUNT = 4;

    /** Follow-up pages of a category are named pic_fl_&lt;n&gt;a.html, pic_fl_&lt;n&gt;b.html, ... */
    private static final String PAGE_SUFFIXES = "abcdefghijklmnopqrstuvwxyz";

    /**
     * Crawls every category page by page and gathers the addresses of all {@code .jpg} images.
     *
     * <p>If a page cannot be fetched, the stack trace is printed, crawling stops, and the URLs
     * gathered up to that point are returned.
     *
     * @return the image URLs found, in crawl order; never {@code null}
     */
    public static List<String> getDocument() {
        List<String> list = new ArrayList<String>();
        try {
            for (int i = 1; i <= CATEGORY_COUNT; i++) {
                // First page of the category.
                Document doc = traverse(BASE_URL + "pic_fl_" + i + ".html", list);
                // Keep following "next page" links; the bound on k prevents running past the
                // last known page suffix if the site ever reports more pages than suffixes exist.
                for (int k = 0; k < PAGE_SUFFIXES.length() && isExistsNextPage(doc); k++) {
                    String url = BASE_URL + "pic_fl_" + i + PAGE_SUFFIXES.charAt(k) + ".html";
                    doc = traverse(url, list);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return list;
    }

    /**
     * Tells whether a page offers a link to a following page.
     *
     * @param doc the parsed page
     * @return {@code true} if some element's own text contains the "next page" label
     */
    public static boolean isExistsNextPage(Document doc) {
        Elements nextLinks = doc.select(":containsOwn(下一页)");
        return nextLinks.size() > 0;
    }

    /**
     * Fetches one page and appends the URL of each {@code .jpg} image on it to {@code list}.
     *
     * @param src  address of the page to fetch
     * @param list accumulator that receives the image URLs
     * @return the parsed page, so the caller can look for a "next page" link
     * @throws IOException if the page cannot be fetched
     */
    public static Document traverse(String src, List<String> list) throws IOException {
        Connection con = Jsoup.connect(src);
        Document doc = con.get();
        // Only <img> elements whose src ends in ".jpg" are of interest.
        Elements images = doc.select("img[src$=.jpg]");
        for (Element image : images) {
            list.add(BASE_URL + image.attr("src"));
        }
        return doc;
    }
}
3、批量下载到本地:
package com.sxit.jsoup;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Downloads, in bulk, every image found by {@code jsoupPic.getDocument()} using a fixed
 * pool of worker threads.
 *
 * <p>Class: ThreadPoolManage. Author: smile. Created: Nov 11, 2012 6:02:49 PM.
 */
public class ThreadPoolManage {
    /**
     * Number of workers. ImageThread splits the list into twentieths and treats worker
     * number 20 as the last one, so this value has to stay 20.
     */
    private static final int THREAD_COUNT = 20;

    final ExecutorService exec = Executors.newFixedThreadPool(THREAD_COUNT);
    private String filePath;
    private List<String> list;
    // Completion latch: each worker is expected to count it down once when it finishes.
    final CountDownLatch stop = new CountDownLatch(THREAD_COUNT);

    public static void main(String[] args) {
        new ThreadPoolManage("D://xxooThread");
    }

    /**
     * Collects the image URLs, hands one slice of them to each worker, and blocks until all
     * workers have reported completion. The executor is shut down before returning, even if
     * the wait is interrupted or task submission fails.
     *
     * @param filePath directory the images are saved into
     */
    public ThreadPoolManage(String filePath) {
        this.filePath = filePath;
        try {
            list = jsoupPic.getDocument();
            // Workers are numbered 1..THREAD_COUNT; each derives its own slice from its number.
            for (int i = 1; i <= THREAD_COUNT; i++) {
                this.exec.submit(new ImageThread(i, filePath, list, stop));
            }
            // Wait until the latch reaches zero.
            stop.await();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            // Always release the pool threads, otherwise the JVM cannot exit.
            exec.shutdown();
        }
    }
}
/**
 * Worker that downloads one contiguous slice of an image-URL list into a directory.
 *
 * <p>The list is divided into {@code THREAD_COUNT} slices of {@code list.size() / THREAD_COUNT}
 * entries; worker {@code index} (1-based) handles slice number {@code index}, and the last
 * worker additionally takes the remainder. Each image is stored as
 * {@code <filePath>/<list index>.jpg}.
 */
class ImageThread implements Runnable {
    /** Number of slices the list is divided into; the last worker has this index. */
    private static final int THREAD_COUNT = 20;

    /** Connect and read timeout for a single download, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100 * 1000;

    private String filePath;
    private List<String> list;
    private int index;
    private final CountDownLatch stop;

    /**
     * @param index    1-based worker number
     * @param filePath directory the images are written to
     * @param ilistst  complete list of image URLs shared by all workers
     * @param stop     latch counted down once when this worker finishes
     */
    public ImageThread(int index, String filePath, List<String> ilistst, CountDownLatch stop) {
        this.index = index;
        this.filePath = filePath;
        this.list = ilistst;
        this.stop = stop;
    }

    /**
     * Downloads this worker's slice. A failure on one image is logged and the remaining
     * images are still attempted. The latch is counted down exactly once on every exit
     * path, so a thread waiting on it cannot be left hanging.
     */
    public void run() {
        try {
            // Create the target directory (including missing parents) if necessary.
            File dir = new File(filePath);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            if (list == null) {
                return;
            }
            // Slice bounds are zero-based list indexes, so element 0 is included.
            int count = list.size() / THREAD_COUNT;
            int start = (index - 1) * count;
            int end = (index != THREAD_COUNT) ? index * count - 1 : list.size() - 1;
            for (int i = start; i <= end; i++) {
                download(i);
            }
        } finally {
            // This worker is done: decrement the latch.
            this.stop.countDown();
        }
    }

    /** Downloads the image at list position {@code i}; logs and swallows any failure. */
    private void download(int i) {
        InputStream is = null;
        OutputStream os = null;
        try {
            URL url = new URL(list.get(i));
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            con.setConnectTimeout(TIMEOUT_MILLIS);
            con.setReadTimeout(TIMEOUT_MILLIS);
            is = new BufferedInputStream(con.getInputStream());
            os = new BufferedOutputStream(new FileOutputStream(new File(filePath + "/" + i + ".jpg")));
            byte[] b = new byte[1024];
            int length;
            while ((length = is.read(b)) != -1) {
                os.write(b, 0, length);
            }
            os.flush();
            System.out.println(index + "号线程----------------->>>>>>>保存完第" + i + "张");
        } catch (Exception e) {
            // Best effort: report the image that failed and carry on with the next one.
            System.out.println(index + "号线程跑到第" + i + "张图片+++++++++++++抛出异常,异常信息为:" + e.getMessage());
        } finally {
            // Close the streams of THIS download; otherwise every image leaks two handles.
            close(is);
            close(os);
        }
    }

    /** Closes {@code resource} if it is non-null, printing (not propagating) any failure. */
    private static void close(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
4、源码如下: