// String html = CrawlHtmlTools.getInstance().getPageContent(
// "http://blog.sina.com.cn/s/blog_63ca10e60102dspm.html?tj=1");
//
// Site root; the image "src" values scraped from the pages are appended to it.
String host = "http://www.google.com.hk";
// Base URL that the scraped next-page links are appended to.
String logos = "http://www.google.com.hk/logos";
// Remains null when the first archive page cannot be fetched.
Document doc = null;
try {
    // Request timeout comes from the "timeout" configuration entry.
    int configuredTimeout = Integer.parseInt(PropertiesConfig.getValue("timeout"));
    doc = Jsoup.connect("http://www.google.com.hk/logos/logos00-1.html")
            .userAgent(RequestHeader.FireFox.valueOf())
            .timeout(configuredTimeout)
            .get();
} catch (IOException fetchError) {
    fetchError.printStackTrace();
}
// Maps a target file name (caption text + ".gif") to the URL of its image.
Map<String, String> imgMap = new HashMap<String, String>();
// Compiled once instead of on every page: matches the anchor that links to
// the next archive page and captures its href.
Pattern nextPagePattern = Pattern.compile("<a\\s+href=\"(.+?)\">下一頁 »</a>");
// Nothing to crawl when the initial page could not be loaded; without this
// guard the first doc.select(...) would throw a NullPointerException.
boolean ok = doc != null;
while (ok) {
    // Each <dd> containing a <p> is one entry: the <p> text becomes the file
    // name and the <img> inside it supplies the image path.
    Elements els = doc.select("dd:has(p)");
    for (Element e : els) {
        Elements caption = e.select("p");
        imgMap.put(caption.text() + ".gif", host
                + caption.select("img").attr("src"));
    }
    Matcher match = nextPagePattern.matcher(doc.toString());
    // No next-page link means the last page has been processed.
    ok = match.find();
    if (ok) {
        String temp = logos + "/" + match.group(1);
        try {
            // Pause before each follow-up request (and only then, so no
            // time is wasted after the final page).
            TimeUnit.SECONDS.sleep(2);
            doc = Jsoup.connect(temp).userAgent(
                    RequestHeader.FireFox.valueOf()).timeout(
                    Integer.parseInt(PropertiesConfig
                            .getValue("timeout"))).get();
        } catch (InterruptedException e2) {
            // Preserve the interrupt for the caller and stop crawling.
            Thread.currentThread().interrupt();
            ok = false;
        } catch (NumberFormatException e1) {
            // Invalid "timeout" setting: retrying can never succeed.
            e1.printStackTrace();
            ok = false;
        } catch (IOException e1) {
            // doc still refers to the previous page; looping again would
            // request the same failing URL forever, so stop here and keep
            // the images collected so far.
            e1.printStackTrace();
            ok = false;
        }
    }
}
// Download every collected image into f:/img/, one request per map entry.
Set<Map.Entry<String,String>> set = imgMap.entrySet();
for (Map.Entry<String, String> entry : set) {
    // The key is text scraped from the page; replace characters that are
    // path separators or not allowed in Windows file names so the download
    // always targets a single file directly inside f:/img/.
    String safeFileName = entry.getKey().replaceAll("[\\\\/:*?\"<>|]", "_");
    DownloadFile df = new DownloadFile();
    df.setDownloadUrl(entry.getValue());
    df.setPath("f:/img/" + safeFileName);
    DefaultDownload dd = new DefaultDownload();
    dd.execute(df);
}