package com.test;
import java.io.IOException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Looks up the home location of a Chinese mobile number by fetching the
 * ip138.com query page and parsing the returned HTML with jsoup.
 *
 * @author ljl
 */
public class Test2 {

    /**
     * Regular expression that extracts the home location from a result page.
     * Capture group 2 is the text of the cell that follows the
     * "卡号归属地" label cell.
     */
    public static final String REGEX_GET_MOBILE = "(?is)(<tr[^>]+>[\\s]*<td[^>]+>[\\s]*卡号归属地[\\s]*</td>[\\s]*<td[^>]+>([^<]+)</td>[\\s]*</tr>)"; // 2:from

    /**
     * Regular expression that validates the number to look up: either a full
     * 11-digit number or only its first 7 digits, starting with 13, 14, 15 or 18.
     * <p>
     * The second digit is a plain character class (a {@code |} inside
     * {@code [...]} would be matched literally), and the optional trailing
     * group allows exactly 7 or exactly 11 digits, nothing in between.
     */
    public static final String REGEX_IS_MOBILE = "(?is)(^1[3458]\\d{5}(?:\\d{4})?$)";

    /** Compiled form of {@link #REGEX_GET_MOBILE}; compiled once and reused. */
    private static final Pattern MOBILE_FROM_PATTERN = Pattern.compile(REGEX_GET_MOBILE);

    /** Compiled form of {@link #REGEX_IS_MOBILE}; compiled once and reused. */
    private static final Pattern MOBILE_NUMBER_PATTERN = Pattern.compile(REGEX_IS_MOBILE);

    /**
     * Extracts the home location from the HTML of a result page.
     * When several rows match, the last one wins.
     *
     * @param htmlSource HTML text of the result page; may be {@code null}
     * @return the location text with non-breaking spaces turned into plain
     *         spaces, or {@code null} when the input is {@code null} or no
     *         matching row is found
     */
    public static String parseMobileFrom(String htmlSource) {
        if (htmlSource == null) {
            return null;
        }
        String location = null;
        Matcher matcher = MOBILE_FROM_PATTERN.matcher(htmlSource);
        while (matcher.find()) {
            // Group 2 always participates in a successful match, so no extra check is needed.
            location = normalizeSpaces(matcher.group(2));
        }
        return location;
    }

    /**
     * Checks whether a string is an acceptable number to look up
     * (see {@link #REGEX_IS_MOBILE}).
     *
     * @param mobileNumber the candidate number; may be {@code null}
     * @return {@code true} if the whole string matches the expected format,
     *         {@code false} otherwise (including for {@code null})
     */
    public static boolean veriyMobile(String mobileNumber) {
        return mobileNumber != null && MOBILE_NUMBER_PATTERN.matcher(mobileNumber).matches();
    }

    /**
     * Runs a sample lookup for a fixed number.
     *
     * @param args ignored
     * @throws Exception if the lookup fails
     */
    public static void main(String[] args) throws Exception {
        String mobile = "13800138000";
        getNetFormMobileInfo(mobile);
    }

    /**
     * Fetches the result page for the given number and prints the cleaned text
     * of every {@code td} found under elements with class {@code tdc2} nested
     * in elements with class {@code tdc}.
     *
     * @param mobileNumber the number (or 7-digit prefix) to look up
     * @throws IllegalArgumentException if the number fails {@link #veriyMobile(String)};
     *         no request is sent in that case
     * @throws IOException if the page cannot be fetched or parsed
     * @throws HttpException declared for callers; not thrown directly by this method
     */
    private static void getNetFormMobileInfo(String mobileNumber) throws IOException, HttpException {
        if (!veriyMobile(mobileNumber)) {
            // Stop here: sending a request with a malformed number is pointless.
            throw new IllegalArgumentException("不是完整的11位手机号或者正确的手机号前七位");
        }

        String basePath = "http://www.ip138.com:8080/search.asp?mobile=" + mobileNumber + "&action=mobile";

        Document doc;
        try {
            doc = Jsoup.parse(new URL(basePath), 3000);
        } catch (IOException e) {
            // Jsoup reports fetch failures by throwing, never by returning null.
            System.err.println("网络异常~~");
            throw e;
        }

        // Look inside elements whose class attribute is "tdc" ...
        Elements tdcs = doc.getElementsByAttributeValue("class", "tdc");
        for (Element td : tdcs) {
            // ... for elements whose class attribute is "tdc2".
            Elements tdc2s = td.getElementsByAttributeValue("class", "tdc2");
            for (Element tdc : tdc2s) {
                String mobileInfo = stripMarkup(tdc.select("td").html());
                System.out.println(mobileInfo);
            }
        }
    }

    /** Replaces non-breaking spaces (entity or character form) with plain spaces. */
    private static String normalizeSpaces(String text) {
        return text.replace("&nbsp;", " ").replace('\u00A0', ' ');
    }

    /** Removes HTML tags, all spaces (plain, non-breaking, and the nbsp entity) and leftover "-->" markers. */
    private static String stripMarkup(String html) {
        return html.replaceAll("<[^>]+>", "")
                .replaceAll("&nbsp;|[ \\u00A0]", "")
                .replaceAll("-->", "");
    }
}