最近发现了一个解析 HTML 的框架(jsoup),拿来试试,以游久(uuu9.com)为例,代码如下:
package com.test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the images of a paginated uuu9.com news gallery into a local directory.
 *
 * <p>The constructor fetches the first page, creates the target directory (named after
 * the page title), builds the list of page URLs and then processes every page except the
 * last one by calling {@link #run()} directly. The last page is processed when the thread
 * is started, so {@code new Test4().start()} covers the whole gallery.
 */
public class Test4 extends Thread {
    /** URL of the first gallery page; page {@code n} is derived by inserting {@code _n} before ".shtml". */
    private static final String FIRST_PAGE = "http://news.uuu9.com/2011/201103/186409.shtml";

    /** URLs of all gallery pages, first page at position 0. */
    List<String> list = new ArrayList<String>();
    /** Position in {@link #list} of the next page that {@link #run()} will process. */
    int index = 0;
    /** Stream of the image currently being downloaded; closed after every image. */
    InputStream inputStream;
    /** Stream of the file currently being written; closed after every image. */
    OutputStream outputStream;
    /** Directory the images are written to. */
    File parent;

    /**
     * Prepares the download and processes all pages except the last one.
     *
     * @throws IOException if the first page cannot be fetched or the target directory
     *         cannot be created
     */
    public Test4() throws IOException {
        list.add(FIRST_PAGE);
        Document document = Jsoup.connect(list.get(0)).get();
        parent = new File("f:/image/uuu9/" + document.title());
        // mkdirs() also creates missing parent directories; fail fast instead of
        // letting every later file write fail.
        if (!parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory " + parent);
        }
        Element element = document.select("div#pagecount").first();
        // No page counter on the page: treat the gallery as a single page.
        int pageCount = (element == null) ? 1 : Integer.parseInt(element.text());
        for (int i = 2; i <= pageCount; i++) {
            // Literal replace: replaceAll would interpret "." as a regex wildcard.
            list.add(FIRST_PAGE.replace(".shtml", "_" + i + ".shtml"));
        }
        // The last page is intentionally left for the started thread.
        while (index < list.size() - 1) {
            run();
        }
    }

    /**
     * Processes the page at {@link #index}: downloads every image matched by the
     * selector and then advances {@link #index}. Does nothing once all pages are done.
     * I/O failures are reported on stderr; a failing image or page is skipped.
     */
    @Override
    public void run() {
        if (index >= list.size()) {
            return;
        }
        try {
            Document document = Jsoup.connect(list.get(index)).get();
            Elements elements = document.select("img[title=点击图片翻页]");
            for (int i = 0; i < elements.size(); i++) {
                String img = elements.get(i).attr("src").replace("_Z", "");
                try {
                    download(img);
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    // One broken image must not abort the rest of the page.
                    e.printStackTrace();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always advance, otherwise a page that keeps failing would make the
            // constructor loop retry it forever.
            index++;
        }
    }

    /**
     * Copies the resource at {@code img} into {@link #parent}, using the part of the URL
     * after the last '/' as the file name. Both streams are closed when done.
     */
    private void download(String img) throws IOException {
        inputStream = new URL(img).openStream();
        try {
            File file = new File(parent, img.substring(img.lastIndexOf("/") + 1));
            System.out.println(file);
            // FileOutputStream creates the file itself; no createNewFile() needed.
            outputStream = new FileOutputStream(file);
            try {
                byte[] bs = new byte[2048];
                int c;
                while ((c = inputStream.read(bs)) != -1) {
                    outputStream.write(bs, 0, c);
                }
                outputStream.flush();
            } finally {
                outputStream.close();
            }
        } finally {
            inputStream.close();
        }
    }

    public static void main(String[] args) throws IOException {
        new Test4().start();
    }
}
package com.test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the images of a paginated uuu9.com news gallery into a local directory.
 *
 * <p>The constructor fetches the first page, creates the target directory (named after
 * the page title), builds the list of page URLs and then processes every page except the
 * last one by calling {@link #run()} directly. The last page is processed when the thread
 * is started, so {@code new Test4().start()} covers the whole gallery.
 */
public class Test4 extends Thread {
    /** URL of the first gallery page; page {@code n} is derived by inserting {@code _n} before ".shtml". */
    private static final String FIRST_PAGE = "http://news.uuu9.com/2011/201103/186409.shtml";

    /** URLs of all gallery pages, first page at position 0. */
    List<String> list = new ArrayList<String>();
    /** Position in {@link #list} of the next page that {@link #run()} will process. */
    int index = 0;
    /** Stream of the image currently being downloaded; closed after every image. */
    InputStream inputStream;
    /** Stream of the file currently being written; closed after every image. */
    OutputStream outputStream;
    /** Directory the images are written to. */
    File parent;

    /**
     * Prepares the download and processes all pages except the last one.
     *
     * @throws IOException if the first page cannot be fetched or the target directory
     *         cannot be created
     */
    public Test4() throws IOException {
        list.add(FIRST_PAGE);
        Document document = Jsoup.connect(list.get(0)).get();
        parent = new File("f:/image/uuu9/" + document.title());
        // mkdirs() also creates missing parent directories; fail fast instead of
        // letting every later file write fail.
        if (!parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory " + parent);
        }
        Element element = document.select("div#pagecount").first();
        // No page counter on the page: treat the gallery as a single page.
        int pageCount = (element == null) ? 1 : Integer.parseInt(element.text());
        for (int i = 2; i <= pageCount; i++) {
            // Literal replace: replaceAll would interpret "." as a regex wildcard.
            list.add(FIRST_PAGE.replace(".shtml", "_" + i + ".shtml"));
        }
        // The last page is intentionally left for the started thread.
        while (index < list.size() - 1) {
            run();
        }
    }

    /**
     * Processes the page at {@link #index}: downloads every image matched by the
     * selector and then advances {@link #index}. Does nothing once all pages are done.
     * I/O failures are reported on stderr; a failing image or page is skipped.
     */
    @Override
    public void run() {
        if (index >= list.size()) {
            return;
        }
        try {
            Document document = Jsoup.connect(list.get(index)).get();
            Elements elements = document.select("img[title=点击图片翻页]");
            for (int i = 0; i < elements.size(); i++) {
                String img = elements.get(i).attr("src").replace("_Z", "");
                try {
                    download(img);
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    // One broken image must not abort the rest of the page.
                    e.printStackTrace();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always advance, otherwise a page that keeps failing would make the
            // constructor loop retry it forever.
            index++;
        }
    }

    /**
     * Copies the resource at {@code img} into {@link #parent}, using the part of the URL
     * after the last '/' as the file name. Both streams are closed when done.
     */
    private void download(String img) throws IOException {
        inputStream = new URL(img).openStream();
        try {
            File file = new File(parent, img.substring(img.lastIndexOf("/") + 1));
            System.out.println(file);
            // FileOutputStream creates the file itself; no createNewFile() needed.
            outputStream = new FileOutputStream(file);
            try {
                byte[] bs = new byte[2048];
                int c;
                while ((c = inputStream.read(bs)) != -1) {
                    outputStream.write(bs, 0, c);
                }
                outputStream.flush();
            } finally {
                outputStream.close();
            }
        } finally {
            inputStream.close();
        }
    }

    public static void main(String[] args) throws IOException {
        new Test4().start();
    }
}