本例子只作为测试,页面个数直接设置了100个,可以可能会少或者多,容易报错,更优化的一种方式是获取“下一页”按钮的地址,然后再访问,当访问不到“下一页”的内容时跳出
多线程只体现在文件提取,也可以在elements循环中再加一个多线程访问页面的
本案例需要jsoup包的支持,可到下方url下载
Test.java==============>主方法
package com.test.main;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Test {
public static void main(String[] args) {
ExecutorService executor = Executors.newFixedThreadPool(5);
Document doc = null;
FileWriter writer = null;
String rui="index";
List<String> alist = new ArrayList<String>();
//int keyword = 4;
for(int keyword=4;keyword<100;keyword++){
try {
//创建页面对象
doc = Jsoup.connect("http://pic.yesky.com/c/6_20491_"+keyword+".shtml").userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36").timeout(10000).get();
//根据标签和class id获取元素
Elements div = doc.select("div.lb_box");
//根据标签获取元素
Elements dl = div.select("dl");
Elements dd = div.select("dd");
Elements pages = dd.select("a");
for(Element e : pages){
System.out.println(e.text());
System.out.println(e.attr("href"));
Document imgdoc = Jsoup.connect(e.attr("href")).userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36").timeout(10000).get();
Elements scroll = imgdoc.select("div.effect_scroll");
Elements li = scroll.select("li");
Elements urls = li.select("a");
int i=0;
for(Element ipage : urls){
Document imgpage = Jsoup.connect(ipage.attr("href")).userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36").timeout(10000).get();
Elements imgediv = imgpage.select("div.l_effect_img_mid");
Element img = imgediv.select("img").first();
FileOutUtils fo =new FileOutUtils(img, e.text());
fo.start();
System.out.println(i);
i++;
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* 单线程下载
* @author lcx
* @param e
* @param filepath
*/
public static void savefile(Element e,String filepath){
String src=e.attr("src");//获取img中的src路径
// System.out.println(src);
//获取后缀名
String imageName = src.substring(src.lastIndexOf("/") + 1,src.length());
//连接url
URL url;
System.out.println(src);
try {
url = new URL(src);
URLConnection uri=url.openConnection();
//获取数据流
InputStream is=uri.getInputStream();
//写入数据流
File file = new File("E://imgs//"+filepath);
if(!file.exists()){
file.mkdirs();
}
OutputStream os = new FileOutputStream(new File("E://imgs//"+filepath+"//", imageName));
byte[] buf = new byte[1024];
int l=0;
while((l=is.read(buf))!=-1){
os.write(buf, 0, l);
}
} catch (MalformedURLException e1) {
e1.printStackTrace();
} catch (FileNotFoundException e1) {
e1.printStackTrace();
} catch (IOException e1) {
e1.printStackTrace();
}
}
}
FileOutUtils.java ==============>多线程保存到本地
package com.test.main;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import org.jsoup.nodes.Element;
public class FileOutUtils extends Thread {
private Element e;
private String filepath;
public FileOutUtils(Element e, String filepath) {
this.e = e;
this.filepath = filepath;
}
/**
* 多线程下载
* @author lcx
* @param e
* @param filepath
*/
public void savefile(Element e,String filepath){
String src=e.attr("src");//获取img中的src路径
// System.out.println(src);
//获取后缀名
String imageName = src.substring(src.lastIndexOf("/") + 1,src.length());
//连接url
URL url;
try {
url = new URL(src);
URLConnection uri=url.openConnection();
//获取数据流
InputStream is=uri.getInputStream();
//写入数据流
File file = new File("E://imgs//"+filepath);
if(!file.exists()){
file.mkdirs();
}
OutputStream os = new FileOutputStream(new File("E://imgs//"+filepath+"//", imageName));
byte[] buf = new byte[1024];
int l=0;
while((l=is.read(buf))!=-1){
os.write(buf, 0, l);
}
} catch (MalformedURLException e1) {
e1.printStackTrace();
} catch (FileNotFoundException e1) {
e1.printStackTrace();
} catch (IOException e1) {
e1.printStackTrace();
}
}
public void run() {
this.savefile(this.e,this.filepath);
}
}