I studied web page scraping today, and now I just want to share with everyone what I learned and what I used!
AliexCategory.java
// Required jars: commons-httpclient-3.1.jar, commons-logging.jar, commons-codec.jar
// (commons-lang.jar is also needed for StringUtils)
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class AliexCategory {

    private static final Log logger = LogFactory.getLog(AliexCategory.class);

    private static final String categoryUrl = "http://www.iteye.com";
    private static final String CHARSET = "UTF-8";
    public static String getHtml(String url) {
        if (StringUtils.isEmpty(url)) {
            return "";
        }
        HttpClient hc = new HttpClient();
        GetMethod get = null;
        String page = "";
        try {
            // Set cookie compatibility; without this line the server cannot recognize the login state
            hc.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
            get = new GetMethod(url);
            get.getParams().setContentCharset(CHARSET);
            int statusCode = hc.executeMethod(get);
            if (statusCode == HttpStatus.SC_OK) {
                page = get.getResponseBodyAsString();
            }
            // System.out.println("charset from the HTTP response headers: " + get.getResponseCharSet());
        } catch (Exception e) {
            logger.error("getHtml exception, url: " + url, e);
            e.printStackTrace();
            page = "";
        } finally {
            if (get != null) {
                get.releaseConnection(); // always release the connection, whether or not the request succeeded
            }
        }
        return page;
    }
    // To test, simply call getHtml from the main method
    public static void main(String[] args) {
        String html = getHtml(categoryUrl);
        System.out.println(html);
    }
}
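
Once getHtml returns the page source, the next step in scraping is usually to pull the pieces you care about out of the HTML. Below is a minimal sketch of that idea using only java.util.regex from the JDK; the LinkExtractor class, the extractLinks method, and the href pattern are all just illustrative, and for real pages a proper HTML parser would be more robust.

LinkExtractor.java (sketch)

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LinkExtractor {

    // Very rough pattern for absolute href values; illustrative only, not a full HTML parser
    private static final Pattern HREF_PATTERN =
            Pattern.compile("href=\"(http[^\"]+)\"", Pattern.CASE_INSENSITIVE);

    // Collect all absolute links found in the HTML returned by getHtml
    public static List<String> extractLinks(String html) {
        List<String> links = new ArrayList<String>();
        if (html == null || html.length() == 0) {
            return links;
        }
        Matcher m = HREF_PATTERN.matcher(html);
        while (m.find()) {
            links.add(m.group(1));
        }
        return links;
    }

    public static void main(String[] args) {
        String html = AliexCategory.getHtml("http://www.iteye.com");
        for (String link : extractLinks(html)) {
            System.out.println(link);
        }
    }
}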