import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Logger;
/**
 * Utility methods for downloading a web page and extracting shop names from its HTML.
 *
 * <ul>
 * <li>Author: <a href="http://blog.youkuaiyun.com/wgzhl2008">wgzhl2008</a></li>
 * <li>E-mail: <a href="mailto:wgzhl2008@gmail.com">wgzhl2008@gmail.com</a></li>
 * <li>Version: 1.0</li>
 * <li>Date: 2012-03-17 13:36:26</li>
 * </ul>
 */
public class FetchShopUtil {
    private static final Logger logger = Logger.getLogger(FetchShopUtil.class);

    /** Pause, in milliseconds, before the single retry of a request that did not return 200. */
    private static final long RETRY_DELAY_MILLIS = 1000L * 3;

    /** Matches a {@code <p class="nick">...</p>} element; group 4 is the element's inner content. */
    private static final Pattern NICK_PARAGRAPH =
            Pattern.compile("<p(\\s*)class=(\\s*)\"nick\"(\\s*)>(.*?)</p>");

    /** Matches one markup tag, used to cut the inner content into plain-text runs. */
    private static final Pattern TAG = Pattern.compile("<[^>]*>");

    /**
     * Fetches the content of the page at the given address with an HTTP GET.
     *
     * <p>If the first response status is not 200 (OK), the method waits three seconds and
     * tries exactly once more. Errors raised while executing the request or reading the body
     * are logged and not propagated.
     *
     * @param url address of the page to download
     * @return the response body, or an empty string when no body could be obtained
     */
    public static String fetchHtmlFromWebPage(String url) {
        HttpClient client = new HttpClient();
        // Constructed outside the try block, as before, so that an address rejected by the
        // GetMethod constructor still surfaces to the caller instead of being swallowed.
        GetMethod getMethod = new GetMethod(url);
        int status = 0;
        String html = "";
        try {
            status = client.executeMethod(getMethod);
            if (status != HttpStatus.SC_OK) {
                // Give the first connection back before waiting, and retry with a fresh
                // method object rather than re-executing the one that was already used.
                getMethod.releaseConnection();
                Thread.sleep(RETRY_DELAY_MILLIS);
                getMethod = new GetMethod(url);
                status = client.executeMethod(getMethod);
            }
            if (status == HttpStatus.SC_OK) {
                String body = getMethod.getResponseBodyAsString();
                if (body != null) {
                    html = body;
                }
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.error("抓取网页内容出错" + e.getMessage(), e);
        } catch (Exception e) {
            logger.error("抓取网页内容出错" + e.getMessage(), e);
        } finally {
            // Single release point for whichever method object was used last.
            getMethod.releaseConnection();
        }
        if (status != HttpStatus.SC_OK) {
            logger.info("抓取网页内容失败");
        }
        return html;
    }

    /**
     * Extracts shop names from HTML by collecting the text found inside every
     * {@code <p class="nick">...</p>} element.
     *
     * <p>Markup nested inside the element is removed; each non-blank run of text between tags
     * becomes one trimmed entry of the result, in document order. Text containing a literal
     * {@code >} is kept intact instead of causing a {@link StringIndexOutOfBoundsException}.
     * The pattern does not match elements whose content spans several lines.
     *
     * <p>The method name keeps its historical spelling so existing callers continue to compile.
     *
     * @param html page source to scan; {@code null} is treated as an empty page
     * @return the names that were found, possibly empty, never {@code null}
     */
    public static List<String> fecthShopName(String html) {
        List<String> names = new ArrayList<String>();
        if (html == null) {
            return names;
        }
        Matcher paragraphs = NICK_PARAGRAPH.matcher(html);
        while (paragraphs.find()) {
            String inner = paragraphs.group(4);
            for (String run : TAG.split(inner)) {
                String name = run.trim();
                // Whitespace between nested tags is not a name.
                if (name.length() > 0) {
                    names.add(name);
                }
            }
        }
        return names;
    }
}