package com.leiwang.HttpClient;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
/**
 * Static helpers for fetching pages with Apache Commons HttpClient 3.x.
 *
 * <p>Each calling thread gets its own cached {@link HttpClient}, created on
 * first use with 30 second socket/connection timeouts. A request is executed
 * on a helper thread that retries failed attempts (I/O errors, HTTP 500 or an
 * empty body) once a minute, while the caller waits up to 20 minutes overall.
 *
 * <p>The client cache holds strong references to the calling threads; call
 * {@link #clean()} to drop them.
 */
public class httpClient {
    /** Socket, connection and connection-manager timeout, in milliseconds. */
    private static final int IO_TIMEOUT_MS = 30000;
    /** Pause between two attempts of the same request, in milliseconds. */
    private static final long RETRY_DELAY_MS = 60 * 1000;
    /** Upper bound on the total time spent on one request, in milliseconds. */
    private static final long TOTAL_TIMEOUT_MS = 1200000;

    /** One client per calling thread; every access is guarded by the map's own monitor. */
    private static final Map<Thread, HttpClient> clients = new HashMap<Thread, HttpClient>();

    /** Forgets every cached per-thread client. */
    public static void clean() {
        synchronized (clients) {
            clients.clear();
        }
    }

    /**
     * Sends a form-encoded POST request and returns the response body.
     *
     * @param url    target URL
     * @param enc    charset used to encode the form and to decode the response
     * @param params form fields; {@code null} is treated as an empty form
     * @return the decoded, non-empty response body
     * @throws Exception if no attempt succeeded within the overall timeout
     */
    public static String postPage(String url, String enc, Map<String, String> params) throws Exception {
        System.out.println(url);
        PostMethod method = new PostMethod(url);
        method.getParams().setContentCharset(enc);
        setHeaders(method);
        int count = (params == null) ? 0 : params.size();
        NameValuePair[] pairs = new NameValuePair[count];
        if (params != null) {
            int i = 0;
            for (Map.Entry<String, String> entry : params.entrySet()) {
                pairs[i++] = new NameValuePair(entry.getKey(), entry.getValue());
            }
        }
        method.setRequestBody(pairs);
        return tryResponse(method, enc);
    }

    /**
     * Sends a GET request and returns the response body.
     *
     * @param url target URL; anything after the first {@code '?'} is passed on
     *            as the (already escaped) query string
     * @param enc charset used to decode the response
     * @return the decoded, non-empty response body
     * @throws Exception if no attempt succeeded within the overall timeout
     */
    public static String getPage(String url, String enc) throws Exception {
        GetMethod method;
        int idx = url.indexOf('?');
        if (idx == -1) {
            method = new GetMethod(url);
        } else {
            method = new GetMethod(url.substring(0, idx));
            method.setQueryString(url.substring(idx + 1));
        }
        setHeaders(method);
        return tryResponse(method, enc);
    }

    /** Returns the calling thread's client, creating and configuring it on first use. */
    private static HttpClient clientForCurrentThread() {
        Thread current = Thread.currentThread();
        synchronized (clients) {
            HttpClient client = clients.get(current);
            if (client == null) {
                client = new HttpClient();
                client.getHttpConnectionManager().getParams().setSoTimeout(IO_TIMEOUT_MS);
                client.getHttpConnectionManager().getParams().setConnectionTimeout(IO_TIMEOUT_MS);
                client.getParams().setSoTimeout(IO_TIMEOUT_MS);
                client.getParams().setConnectionManagerTimeout(IO_TIMEOUT_MS);
                clients.put(current, client);
            }
            return client;
        }
    }

    /** Thread-safe hand-off of the worker's result (or its latest failure) to the caller. */
    private static final class Outcome {
        private String body;
        private Exception lastError;

        synchronized void succeed(String text) {
            body = text;
        }

        synchronized void fail(Exception e) {
            lastError = e;
        }

        synchronized String body() {
            return body;
        }

        synchronized Exception lastError() {
            return lastError;
        }
    }

    /**
     * Executes {@code method} on a helper thread, retrying until it yields a
     * non-empty body, and waits for it for at most {@link #TOTAL_TIMEOUT_MS}.
     *
     * @param method fully prepared request
     * @param enc    charset used to decode the response
     * @return the decoded, non-empty response body
     * @throws Exception if no attempt succeeded in time (the last failure is
     *                   attached as the cause) or the caller was interrupted
     */
    private static String tryResponse(final HttpMethod method, final String enc) throws Exception {
        final HttpClient client = clientForCurrentThread();
        final Outcome outcome = new Outcome();
        Thread worker = new Thread() {
            @Override
            public void run() {
                while (!Thread.currentThread().isInterrupted()) {
                    try {
                        client.executeMethod(method);
                        if (method.getStatusCode() == 500) {
                            throw new Exception("status code: 500");
                        }
                        String text = getResponse(method, enc);
                        if (text.length() == 0) {
                            throw new Exception("zero length response text");
                        }
                        outcome.succeed(text);
                        return;
                    } catch (Exception e) {
                        outcome.fail(e);
                    } finally {
                        // Release before backing off so the connection is not held while sleeping.
                        method.releaseConnection();
                    }
                    try {
                        Thread.sleep(RETRY_DELAY_MS);
                    } catch (InterruptedException cancelled) {
                        // The caller gave up; stop retrying.
                        return;
                    }
                }
            }
        };
        // A worker stuck in I/O must never keep the JVM alive on its own.
        worker.setDaemon(true);
        worker.start();
        try {
            // join() returns as soon as the worker ends, so there is no notify to miss.
            worker.join(TOTAL_TIMEOUT_MS);
        } finally {
            if (worker.isAlive()) {
                // Cooperative cancellation instead of Thread.stop(): interrupt ends the
                // retry loop, abort() unblocks a worker waiting on the socket.
                worker.interrupt();
                method.abort();
            }
        }
        String body = outcome.body();
        if (body == null) {
            throw new Exception("fail at last: " + method.getURI(), outcome.lastError());
        }
        return body;
    }

    /** Applies the cookie policy and the browser-like request headers used for every request. */
    private static void setHeaders(HttpMethod method) {
        method.getParams().setCookiePolicy(CookiePolicy.RFC_2965);
        method.setRequestHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14");
        method.setRequestHeader("Accept", "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5");
        method.setRequestHeader("Accept-Language", "zh-cn,zh;q=0.5");
        // Only advertise what getResponse can decode; "deflate" bodies would come back as garbage.
        method.setRequestHeader("Accept-Encoding", "gzip");
        method.setRequestHeader("Accept-Charset", "gb2312,utf-8;q=0.7,*;q=0.7");
        method.setRequestHeader("Keep-Alive", "300");
        method.setRequestHeader("Connection", "Keep-Alive");
    }

    /** Tells whether a Content-Encoding header value denotes gzip compression. */
    private static boolean isGzip(String contentEncoding) {
        if (contentEncoding == null) {
            return false;
        }
        String value = contentEncoding.trim();
        return value.equalsIgnoreCase("gzip") || value.equalsIgnoreCase("x-gzip");
    }

    /**
     * Reads the whole response body of an executed method as text.
     *
     * @param method executed request
     * @param enc    charset used to decode the body
     * @return the decoded body; empty when the response carries no body
     * @throws IOException if reading or decompressing the body fails
     */
    private static String getResponse(HttpMethod method, String enc) throws IOException {
        InputStream raw = method.getResponseBodyAsStream();
        if (raw == null) {
            return "";
        }
        Header contentEncoding = method.getResponseHeader("Content-Encoding");
        InputStream in = raw;
        if (contentEncoding != null && isGzip(contentEncoding.getValue())) {
            in = new GZIPInputStream(raw);
        }
        Reader reader = new InputStreamReader(in, enc);
        try {
            char[] buf = new char[1024];
            StringBuilder sb = new StringBuilder();
            int rd;
            while ((rd = reader.read(buf)) != -1) {
                sb.append(buf, 0, rd);
            }
            return sb.toString();
        } finally {
            reader.close();
        }
    }

    /** Manual smoke test: fetches a local page ten times and prints each body. */
    public static void main(String[] args) {
        try {
            for (int i = 0; i < 10; i++) {
                String str = getPage("http://localhost:8080/wleing/myhttpclient.jsp", "GBk");
                System.out.println(".................." + str);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
本文介绍了一种利用Apache HttpClient库实现网页抓取的方法。通过示例代码展示了如何配置HttpClient对象,设置请求头,发送GET和POST请求,并解析响应内容。特别关注了处理HTTP状态码及异常情况,确保了请求的稳定性和准确性。
257

被折叠的 条评论
为什么被折叠?



