java中利用URL抓取网页内容

最新推荐文章于 2021-03-16 17:10:00 发布

原创 · 最新推荐文章于 2021-03-16 17:10:00 发布 · 1.1k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#url #java #string #exception #html #date

本文介绍一种用 Java 抓取网页内容、并从抓取结果中解析特定信息的方法：先通过 Apache Commons HttpClient 向指定 URL 发起 GET 请求获取网页内容（首次请求未返回 200 状态码时，等待 3 秒后重试一次），再利用正则表达式从 HTML 中提取店铺名称等信息。

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Logger;

/**
 * Static helpers for downloading a web page over HTTP and extracting shop
 * names from the downloaded HTML.
 * <ul>
 * <li>Function:
 * 		<ul>
 * 		<li>{@link #fetchHtmlFromWebPage(String)} downloads a page with Apache
 * 		Commons HttpClient, retrying once on a non-200 status.</li>
 * 		<li>{@link #fecthShopName(String)} pulls the text out of every
 * 		{@code <p class="nick">...</p>} element with a regular expression.</li>
 * 		</ul>
 * </li>
 * <li>author: <a href="http://blog.youkuaiyun.com/wgzhl2008">wgzhl2008</a></li>
 * <li>E-mail: <a href="mailto:wgzhl2008@gmail.com">wgzhl2008@gmail.com</a></li>
 * <li>Version: 1.0</li>
 * <li>Date: 2012-03-17 13:36:26</li>
 * </ul>
 */
public class FetchShopUtil {
	private static final Logger logger = Logger.getLogger(FetchShopUtil.class);

	/** How long to wait before the single retry after a non-200 response. */
	private static final long RETRY_DELAY_MILLIS = 1000L * 3;

	/**
	 * Matches one {@code <p class="nick">...</p>} element on a single line.
	 * Compiled once because {@link Pattern} instances are immutable and
	 * reusable across calls.
	 */
	private static final Pattern NICK_PARAGRAPH = Pattern
			.compile("<p(\\s*)class=(\\s*)\"nick\"(\\s*)>(.*?)</p>");

	/**
	 * Downloads the page at {@code url} with an HTTP GET and returns its body.
	 * <p>
	 * If the first request completes with a status other than 200, the method
	 * waits three seconds and tries exactly once more. A request that fails
	 * with an exception is not retried.
	 * <p>
	 * This method never throws: every failure (I/O error, malformed URL,
	 * interruption while waiting, non-200 status on both attempts) is logged
	 * and reported as an empty string. If the thread is interrupted while
	 * waiting, its interrupt flag is restored before returning.
	 *
	 * @param url absolute URL of the page to download
	 * @return the response body, or an empty string when the page could not
	 *         be fetched; never {@code null}
	 */
	public static String fetchHtmlFromWebPage(String url){
		HttpClient client = new HttpClient();
		try {
			String body = fetchOnce(client, url);
			if (body == null) {
				// First attempt was not successful: pause, then try one more time.
				Thread.sleep(RETRY_DELAY_MILLIS);
				body = fetchOnce(client, url);
			}
			if (body != null) {
				return body;
			}
			logger.info("抓取网页内容失败");
		} catch (InterruptedException e) {
			// Restore the flag so code further up the stack can see the interruption.
			Thread.currentThread().interrupt();
			logger.error("抓取网页内容出错"+e.getMessage(),e);
		} catch (Exception e) {
			// Deliberately broad: this is a best-effort fetch that reports failure as "".
			logger.error("抓取网页内容出错"+e.getMessage(),e);
		}
		return "";
	}

	/**
	 * Performs a single GET request and always releases its connection.
	 * Each attempt uses its own {@link GetMethod}, so a retry never runs on a
	 * method object whose previous response is still holding the connection.
	 *
	 * @param client the client used to execute the request
	 * @param url absolute URL of the page to download
	 * @return the response body (empty string if the response has no body)
	 *         when the status is 200, or {@code null} for any other status
	 * @throws IOException if the request or reading the body fails
	 */
	private static String fetchOnce(HttpClient client, String url) throws IOException {
		GetMethod getMethod = new GetMethod(url);
		try {
			if (client.executeMethod(getMethod) != HttpStatus.SC_OK) {
				return null;
			}
			String body = getMethod.getResponseBodyAsString();
			return body == null ? "" : body;
		} finally {
			getMethod.releaseConnection();
		}
	}

	/**
	 * Extracts shop names from HTML by collecting the text fragments found
	 * inside every {@code <p class="nick">...</p>} element.
	 * <p>
	 * Each matched element is split on {@code '>'}; for every resulting piece
	 * the text in front of the next {@code '<'} is trimmed and added to the
	 * result. Pieces that start with {@code '<'} carry no text and are
	 * skipped. A piece without any {@code '<'} (for example when the text
	 * itself contains a literal {@code '>'}) is treated as plain text.
	 * Whitespace-only fragments are added as empty strings.
	 * <p>
	 * The misspelled method name is kept for compatibility with existing
	 * callers.
	 *
	 * @param html the HTML to search; {@code null} yields an empty list
	 * @return the trimmed text fragments in document order; never {@code null}
	 */
	public static List<String> fecthShopName(String html){
		List<String> names = new ArrayList<String>();
		if (html == null) {
			return names;
		}
		Matcher matcher = NICK_PARAGRAPH.matcher(html);
		while (matcher.find()) {
			for (String piece : matcher.group().split(">")) {
				int tagStart = piece.indexOf('<');
				if (tagStart == 0) {
					// The piece is only the start of a tag; there is no text in front of it.
					continue;
				}
				// tagStart < 0 means no tag follows, so the whole piece is text.
				String text = tagStart < 0 ? piece : piece.substring(0, tagStart);
				names.add(text.trim());
			}
		}
		return names;
	}
}