package com.unbank.robotspider.util;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Heuristically collects the URLs of the pages of a paginated document.
 *
 * <p>Anchors whose text looks like a pager control (a fixed set of Chinese
 * labels or a one/two digit number) are compared against the base URL; when
 * the difference between the two URLs boils down to a one/two digit number,
 * that number is taken as the page index. Gaps between the lowest and highest
 * detected page are filled with URLs produced by {@code UrlRuleUtil}.
 */
public class SmartNextPageFecther {

    /** Anchor texts that are accepted as pager controls (compared verbatim). */
    private static final String[] PAGER_LABELS = { "首页", "第一页", "下一页", "末页", "最后一页", "尾页" };

    /** A page number: one or two digits, nothing else. */
    private static final Pattern PAGE_NUMBER = Pattern.compile("\\d{1,2}");

    /**
     * Demo entry point: fetches a hard-coded article URL and prints every page
     * URL that was detected, in ascending page order.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = "http://focus.stockstar.com/SS2014061700001351.shtml";
        Document document = JsoupUtil.readUrl(url);
        Map<Integer, String> pageList = new SmartNextPageFecther()
                .getNextPageUrl(document, url);
        // Page numbers are map keys, not dense indexes, so walk a sorted copy
        // instead of guessing an upper bound from the map size.
        for (String nextUrl : new TreeMap<Integer, String>(pageList).values()) {
            if (nextUrl != null) {
                System.out.println(nextUrl);
            }
        }
    }

    /**
     * Scans all {@code <a>} elements of the document for pagination links.
     *
     * @param doc     the parsed page; it is cloned and the clone is scanned
     * @param baseurl URL of the page, used to resolve relative links and as the
     *                reference for {@link #checkUrl(String, String)}
     * @return a map from page number to page URL; empty when nothing was
     *         detected. When at least two pages were found and both the lowest
     *         page and its successor are present, missing pages between the
     *         lowest and highest page number are filled in from the rule
     *         returned by {@code UrlRuleUtil.getURlRule}.
     */
    public Map<Integer, String> getNextPageUrl(Document doc, String baseurl) {
        Document document = doc.clone();
        Map<Integer, String> map = new HashMap<Integer, String>();
        Elements anchors = document.getElementsByTag("a");
        int firstPage = Integer.MAX_VALUE;
        int lastPage = 0;
        for (Element anchor : anchors) {
            String href = UrlTools.getFullUrl(baseurl, anchor.attr("href"));
            if (href == null || href.trim().isEmpty()) {
                continue;
            }
            // Is this anchor a pagination control?
            if (!checkText(anchor.text())) {
                continue;
            }
            int page = checkUrl(baseurl, href);
            if (page == -1) {
                continue;
            }
            lastPage = Math.max(lastPage, page);
            firstPage = Math.min(firstPage, page);
            map.put(page, href);
        }
        if (map.size() >= 2) {
            // At least two paginated links were found (so roughly 3 pages overall):
            // derive the URL rule from the lowest page and the one right after it.
            String second = map.get(firstPage);
            String third = map.get(firstPage + 1);
            // Without two adjacent samples there is nothing to infer a rule from;
            // never hand null to UrlRuleUtil.
            if (second != null && third != null) {
                String urlRule = UrlRuleUtil.getURlRule(second, third);
                for (int i = firstPage; i <= lastPage; i++) {
                    if (map.get(i) == null) {
                        map.put(i, UrlRuleUtil.getcheckURL(urlRule, i));
                    }
                }
            }
        }
        return map;
    }

    /**
     * Tells whether an anchor text looks like a pager control.
     *
     * @param text the anchor text; {@code null} is treated as "not a pager"
     * @return {@code true} when the text equals one of the known pager labels
     *         or consists of exactly one or two digits
     */
    public boolean checkText(String text) {
        if (text == null) {
            return false;
        }
        for (String label : PAGER_LABELS) {
            if (label.equals(text)) {
                return true;
            }
        }
        return PAGE_NUMBER.matcher(text).matches();
    }

    /**
     * Extracts a page number from the difference between two URLs.
     *
     * <p>The longer URL is walked character by character against the shorter
     * one; characters of the longer URL that do not match the current character
     * of the shorter one are collected. After stripping {@code _}, {@code =},
     * {@code index}, {@code page}, {@code p} and {@code -} (in that order) the
     * collected text must be one or two digits.
     *
     * <p>Limitations of the scan, kept as-is: it stops as soon as the shorter
     * URL is fully matched, it never examines the last character of the longer
     * URL, and two URLs of equal length always yield {@code -1} because the
     * second URL ends up being compared with itself.
     *
     * @param url1 first URL (typically the base URL)
     * @param url2 second URL (typically the candidate link)
     * @return the page number, or {@code -1} when either URL is null or empty
     *         or no page number can be derived
     */
    public int checkUrl(String url1, String url2) {
        if (url1 == null || url2 == null) {
            return -1;
        }
        int l1 = url1.length();
        int l2 = url2.length();
        if (l1 == 0 || l2 == 0) {
            return -1;
        }
        // On equal lengths both sides resolve to url2.
        String longStr = l1 > l2 ? url1 : url2;
        String shortStr = l1 < l2 ? url1 : url2;
        int j = 0;
        StringBuilder diff = new StringBuilder();
        for (int i = 0; i < longStr.length() - 1; i++) {
            char c = longStr.charAt(i);
            if (c != shortStr.charAt(j)) {
                diff.append(c);
            } else {
                j++;
                if (j == shortStr.length()) {
                    break;
                }
            }
        }
        if (diff.length() == 0) {
            return -1;
        }
        String numStr = diff.toString()
                .replace("_", "").replace("=", "")
                .replace("index", "").replace("page", "")
                .replace("p", "").replace("-", "");
        if (PAGE_NUMBER.matcher(numStr).matches()) {
            return Integer.parseInt(numStr);
        }
        return -1;
    }
}