工作间隙,看到一个问题:要用 HttpClient 抓取 A 网页的数据,可是必须登录后才能访问 A,问怎么传递登录时用到的 cookie 值。
记得同一个 HttpClient 实例在请求过程中会自动记录 cookie 等数据,所以先用它登录、再用同一个实例请求目标页面即可。
以下代码用的是 org.apache.http 包下的工具类:
package com.test.demo1;
import java.util.List;
import java.util.ArrayList;
import java.io.IOException;
import java.net.SocketTimeoutException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.*;
import org.apache.http.util.EntityUtils;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.conn.params.ConnRoutePNames;
/**模拟登录后抓取网页数据
* 分页等待拓展,未做
* @author xxx
* 用的包是 http://hc.apache.org/downloads.cgi 的HttpClient 4.3.6 (GA)
*/
public class HttpClientDataDemo {
@SuppressWarnings("deprecation")
private static final HttpClient client = new DefaultHttpClient(); // 保证目标地址的请求 和 登录的请求是同一个句柄
private static final String username = "im_user"; // 登录的账号
private static final String password = "im_password"; // 密码
private static final String aimUrl = "http://pindao.xxx.com/diannao.htm"; // 目标地址
private static final String login = "https://passport.xxx.com/ids/login"; // 登录地址
private static final String html_code = "gbk";
// 有些网页会做一些客户端限制
private static final String USER_AGENT = "User-Agent";
private static final String AGENT = "Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30";
/*** 全局变量类实例*/
private HttpContext localContext = new BasicHttpContext();
// 如果不为空,则强制设置cookie
private String cookie;
// 代理服务器
private HttpHost proxy;
public static void main(String[] args) throws IOException {
// 登录
login(username, password);
// 其它操作
getData(aimUrl);
}
/**
* 其它操作
* @param url
* @throws IOException
*/
static void getData(String url) throws IOException {
HttpGet get = new HttpGet(url);
HttpResponse response = client.execute(get);
System.out.println("getData status:" + response.getStatusLine());
HttpEntity entity = response.getEntity();
print(entity);
}
/**
* 执行登录
*
* @throws IOException
*/
static void login(String user, String pwd) throws IOException {
HttpPost post = new HttpPost(login);
// 登录表单的信息 "username" 对应 form 里面用户名的属性 name "password" 同理
List<NameValuePair> qparams = new ArrayList<NameValuePair>();
qparams.add(new BasicNameValuePair("username", user));
qparams.add(new BasicNameValuePair("password", pwd));
qparams.add(new BasicNameValuePair("url", aimUrl));
UrlEncodedFormEntity params = new UrlEncodedFormEntity(qparams, html_code); // 网页编码
post.setEntity(params);
HttpResponse response = client.execute(post);
System.out.println(response.getAllHeaders());
System.out.println("login status:"+response.getStatusLine());
post.abort();
}
/**
* 打印页面
*
* @param entity
* @throws IOException
*/
private static void print(HttpEntity entity) throws IOException {
if (entity != null) {
// 打印响应内容长度
System.out.println("Response content length: " + entity.getContentLength());
// 打印响应内容
System.out.println("Response content: " + EntityUtils.toString(entity));
}
}
// 代理等等情况 参看 caller
private String post(HttpEntity entity, String url)
{
String result = null;
HttpPost request = new HttpPost(url);
request.addHeader(USER_AGENT, AGENT);
request.addHeader("Accept-Encoding", "gzip");
if(cookie != null){
request.setHeader("Cookie", cookie);
}
request.setEntity(entity);
try
{
if (proxy != null) {
result = client.execute(proxy, request, new RedirectionResponsehandler(), localContext);
} else {
result = client.execute(request, new RedirectionResponsehandler(), localContext);
}
}
catch (SocketTimeoutException e)
{
e.printStackTrace();
}
catch (ClientProtocolException e)
{
e.printStackTrace();
}
catch (IOException e)
{
e.printStackTrace();
}
catch (NullPointerException e)
{
e.printStackTrace();
}
return result;
}
/**
* 使用ResponseHandler接口处理响应,HttpClient使用ResponseHandler会自动管理连接的释放,解决了对连接的释放管理
*/
private class RedirectionResponsehandler implements ResponseHandler<String>
{
@Override
public String handleResponse(HttpResponse response)
throws ClientProtocolException, IOException {
// TODO Auto-generated method stub
return null;
}
}
public String getCookie() {
return cookie;
}
public void setCookie(String cookie) {
this.cookie = cookie;
}
public void setProxy(String proxyUrl, int port) {
proxy = new HttpHost(proxyUrl, port);
client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
}
}
下面附上参考代码(上文注释中提到的 Caller 类):
package com.suning.ebuy.ott.util;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.SSLHandshakeException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.slf4j.LoggerFactory;
/**
*/
public class Caller
{
/** Charset used to encode request bodies and query strings, and as the fallback response charset. */
public static final String ENCODE = HTTP.UTF_8;
private static final String USER_AGENT = "User-Agent";
private static final String HTTP_GET = "GET";
private static final String HTTP_POST = "POST";
private static final String AGENT = "Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30";
/** A url containing this fragment is treated as already absolute by the private get(...). */
private static final String HTTP_STR = "xxx.com";
private org.slf4j.Logger logger = LoggerFactory.getLogger(getClass());
/**
 * Whether the next internal request should be treated as an https request.
 */
private boolean isHttpsRequest;
/**
 * The DefaultHttpClient instance used to execute requests.
 */
private final DefaultHttpClient mClient;
/**
 * Execution context shared by all requests issued by this instance.
 */
private HttpContext localContext = new BasicHttpContext();
/**
 * When non-null, sent verbatim as the Cookie header of every request.
 */
private String cookie;
/**
 * Proxy server; when non-null, requests are executed against it.
 */
private HttpHost proxy;
/**
 * Obtains a client from {@code CommonHttpClient.createHttpClient()} and installs
 * this instance's retry handler on it.
 */
public Caller()
{
    this.mClient = CommonHttpClient.createHttpClient();
    this.mClient.setHttpRequestRetryHandler(this.mRequestRetryHandler);
}
/**
 * Replaces the cookie store of the underlying client.
 *
 * @param cookieStore store to install on the client
 */
public void setCookieStore(CookieStore cookieStore)
{
    this.mClient.setCookieStore(cookieStore);
}
/**
 * Returns the cookie store currently held by the underlying client.
 *
 * @return the client's cookie store
 */
public CookieStore getCookieStore()
{
    final CookieStore current = this.mClient.getCookieStore();
    return current;
}
/**
 * Flags the next internal request as an https request. The flag is cleared
 * again by the next call to {@code getTempUrl()}.
 */
public void setHttpsRequestOpen()
{
    this.isHttpsRequest = true;
}
/**
 * Returns the base url for internal requests and clears the one-shot https flag.
 * Both the https and the plain base are empty strings in this listing, so the
 * result is always the empty string.
 */
private String getTempUrl()
{
    final String base = isHttpsRequest ? "" : "";
    // The https flag applies to a single request only.
    isHttpsRequest = false;
    return base;
}
/**
 * Posts a url-encoded form to {@code url + action}.
 *
 * @param url    base url
 * @param list   form parameters, may be {@code null}
 * @param action path appended to {@code url}
 * @return the value produced by the private entity-based post
 * @throws UnsupportedEncodingException if the form cannot be encoded with {@code ENCODE}
 */
public String post(String url, List<NameValuePair> list, String action)
        throws UnsupportedEncodingException
{
    final List<NameValuePair> params = addBasicParams(list);
    final HttpEntity form = new UrlEncodedFormEntity(params, ENCODE);
    final String target = url + action;
    return post(form, target);
}
/**
 * Posts a url-encoded form to the internal base url followed by {@code action}.
 *
 * @param olist  form parameters, may be {@code null}
 * @param action path appended to the base url from {@code getTempUrl()}
 * @return the value produced by the private entity-based post
 */
public String post(List<NameValuePair> olist, String action)
        throws ClientProtocolException, IOException
{
    final HttpEntity form = new UrlEncodedFormEntity(addBasicParams(olist), ENCODE);
    final String target = getTempUrl() + action;
    return post(form, target);
}
/**
 * Common POST implementation: sends {@code entity} to {@code url} with the
 * configured agent, the optional forced cookie and the optional proxy.
 *
 * <p>Failures are logged at WARN and not propagated.
 *
 * @param entity request body
 * @param url    absolute target url
 * @return the value returned by the response handler, or {@code null} if the
 *         request failed
 */
private String post(HttpEntity entity, String url)
{
    HttpPost request = new HttpPost(url);
    request.addHeader(USER_AGENT, AGENT);
    // slf4j only substitutes arguments where the message contains a {} placeholder.
    logger.debug("http post url {}", url);
    request.addHeader("Accept-Encoding", "gzip");
    if (cookie != null)
    {
        request.setHeader("Cookie", cookie);
    }
    request.setEntity(entity);

    String result = null;
    try
    {
        ResponseHandler<String> handler = new RedirectionResponsehandler();
        if (proxy != null)
        {
            result = mClient.execute(proxy, request, handler, localContext);
        }
        else
        {
            result = mClient.execute(request, handler, localContext);
        }
        logger.debug("http post result {}", result);
    }
    catch (IOException e)
    {
        // Also covers SocketTimeoutException and ClientProtocolException.
        logger.warn(e.getMessage(), e);
    }
    catch (NullPointerException e)
    {
        // Kept as best-effort: the response handler may dereference a missing
        // header or request attribute.
        logger.warn(e.getMessage(), e);
    }
    return result;
}
/**
 * Posts {@code jsonData} as an {@code application/json} body to {@code url + action}.
 *
 * @param url      base url
 * @param action   path appended to {@code url}
 * @param jsonData JSON text, encoded with {@code ENCODE}
 * @return the value returned by the response handler
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException if the request fails
 */
public String postJson(String url, String action, String jsonData) throws ClientProtocolException, IOException
{
    String target = url + action;
    HttpPost request = new HttpPost(target);
    request.addHeader(USER_AGENT, AGENT);
    request.addHeader("Content-Type", "application/json");
    request.addHeader("Accept-Encoding", "gzip");
    if (cookie != null)
    {
        request.setHeader("Cookie", cookie);
    }
    request.setEntity(new ByteArrayEntity(jsonData.getBytes(ENCODE)));

    ResponseHandler<String> handler = new RedirectionResponsehandler();
    String result;
    if (proxy != null)
    {
        result = mClient.execute(proxy, request, handler, localContext);
    }
    else
    {
        result = mClient.execute(request, handler, localContext);
    }
    // Placeholders are required, otherwise slf4j drops the arguments.
    logger.debug("request url and response result:: {}\n{}", target, result);
    return result;
}
/**
 * Returns a new mutable list holding the given parameters. No common
 * parameters are added in this listing; the method only copies.
 *
 * @param olist caller's parameters, may be {@code null}
 * @return a fresh list, empty when {@code olist} is {@code null}
 */
private List<NameValuePair> addBasicParams(List<NameValuePair> olist)
{
    if (olist == null)
    {
        return new ArrayList<NameValuePair>();
    }
    return new ArrayList<NameValuePair>(olist);
}
/**
 * GET against an explicit url, requesting the non-cached variant.
 *
 * @param url    external base url
 * @param list   query parameters, may be {@code null}
 * @param action path appended to {@code url}
 */
public String getWithoutCache(String url, List<NameValuePair> list,
        String action)
{
    final String target = url + action;
    return get(list, target, false);
}
/**
 * GET against an explicit url, requesting the cached variant.
 *
 * @param url    external base url
 * @param list   query parameters, may be {@code null}
 * @param action path appended to {@code url}
 */
public String get(String url, List<NameValuePair> list, String action)
{
    final String target = url + action;
    return get(list, target, true);
}
/**
 * GET against the internal base url, requesting the non-cached variant.
 *
 * @param list   query parameters, may be {@code null}
 * @param action path appended to the base url from {@code getTempUrl()}
 */
public String getWithoutCache(List<NameValuePair> list, String action)
{
    final String target = getTempUrl() + action;
    return get(list, target, false);
}
/**
 * GET against the internal base url, requesting the cached variant.
 *
 * @param list   query parameters, may be {@code null}
 * @param action path appended to the base url from {@code getTempUrl()}
 */
public String get(List<NameValuePair> list, String action)
{
    final String target = getTempUrl() + action;
    return get(list, target, true);
}
/**
 * Issues a GET for exactly {@code url}, with the configured agent, optional
 * forced cookie and optional proxy.
 *
 * @param url absolute url to fetch
 * @return the value returned by the response handler; the empty string if the
 *         request failed with an I/O error (which is logged at WARN)
 */
public String get(String url){
    final HttpGet request = new HttpGet(url);
    request.addHeader(USER_AGENT, AGENT);
    if (cookie != null) {
        request.setHeader("Cookie", cookie);
    }

    String body = "";
    try {
        final ResponseHandler<String> handler = new RedirectionResponsehandler();
        body = (proxy == null)
                ? mClient.execute(request, handler, localContext)
                : mClient.execute(proxy, request, handler, localContext);
    } catch (IOException e) {
        // ClientProtocolException is an IOException and is handled the same way.
        logger.warn(e.getMessage(), e);
    }
    return body;
}
/**
 * Common GET implementation used by the public get/getWithoutCache overloads.
 *
 * <p>Appends {@code list} as a query string, prefixes the configured
 * {@code B2C_URL_ROOT} when the url does not look absolute, and executes the
 * request. Failures are logged at WARN and not propagated.
 *
 * @param list      query parameters, may be {@code null}
 * @param url       target url, possibly relative or already carrying a query string
 * @param withCache currently unused: no cache is consulted in this method
 * @return the value returned by the response handler, or {@code null} if the
 *         request failed
 */
private String get(List<NameValuePair> list, String url, boolean withCache)
{
    List<NameValuePair> params = addBasicParams(list);
    if (!params.isEmpty())
    {
        // A redirected url may already contain a query string; do not add a second '?'.
        String separator;
        if (!url.contains("?"))
        {
            separator = "?";
        }
        else if (url.endsWith("?") || url.endsWith("&"))
        {
            separator = "";
        }
        else
        {
            separator = "&";
        }
        url = url + separator + URLEncodedUtils.format(params, ENCODE);
    }
    // A redirect may yield a path without a host. The ip literal exempts direct
    // access to the search system in the pre environment.
    if (!url.contains(HTTP_STR) && !url.contains("192.168.33.2"))
    {
        url = HttpMtsPropertiesUtil.getValue("B2C_URL_ROOT") + url;
    }

    HttpGet request = new HttpGet(url);
    // request.addHeader("Accept-Encoding", "gzip");
    request.addHeader(USER_AGENT, AGENT);
    if (cookie != null)
    {
        request.setHeader("Cookie", cookie);
    }
    // Routine tracing belongs at debug level, matching the post methods.
    logger.debug("http url {}", request.getURI().toString());

    String result = null;
    try
    {
        ResponseHandler<String> handler = new RedirectionResponsehandler();
        if (proxy != null)
        {
            result = mClient.execute(proxy, request, handler, localContext);
        }
        else
        {
            result = mClient.execute(request, handler, localContext);
        }
    }
    catch (IOException e)
    {
        // Also covers ClientProtocolException.
        logger.warn(e.getMessage(), e);
    }
    catch (NullPointerException e)
    {
        // Kept as best-effort: the response handler may dereference a missing
        // header or request attribute.
        logger.warn(e.getMessage(), e);
    }
    if (result != null)
    {
        logger.debug("http result {}", result);
    }
    return result;
}
/**
 * Retry policy installed on the client through HttpRequestRetryHandler.
 *
 * <p>Decision order: no retry once the execution count has reached 3; retry
 * when the server sent no response; no retry after an SSL handshake failure;
 * retry requests that carry no entity; otherwise retry whenever an exception
 * is present.
 */
private HttpRequestRetryHandler mRequestRetryHandler = new HttpRequestRetryHandler()
{
    public boolean retryRequest(IOException exception, int executionCount,
            HttpContext context)
    {
        final int maxExecutions = 3;
        if (executionCount >= maxExecutions)
        {
            return false;
        }
        if (exception instanceof NoHttpResponseException)
        {
            // The server dropped the connection.
            return true;
        }
        if (exception instanceof SSLHandshakeException)
        {
            return false;
        }
        final HttpRequest sent = (HttpRequest) context
                .getAttribute(ExecutionContext.HTTP_REQUEST);
        final boolean carriesEntity = sent instanceof HttpEntityEnclosingRequest;
        if (!carriesEntity)
        {
            // Requests without a body are considered safe to repeat.
            return true;
        }
        // NOTE(review): this also retries entity-enclosing requests (e.g. POST)
        // on any exception - confirm that is intended.
        return exception != null;
    }
};
/**
 * Response handler that returns the body as text and follows redirects itself.
 * Passing a ResponseHandler to {@code execute} lets HttpClient manage releasing
 * the connection.
 *
 * <ul>
 * <li>200: returns a fixed "not logged in" JSON when the response carries a
 *     {@code passport.login.flag} header; otherwise decodes the body (gunzipping
 *     when needed) and follows a {@code <meta http-equiv="refresh">} target if
 *     one is found.</li>
 * <li>301/302: re-issues the request against the {@code Location} header with
 *     the same method (POST or GET).</li>
 * <li>anything else: returns {@code null}.</li>
 * </ul>
 */
private class RedirectionResponsehandler implements ResponseHandler<String>
{
    @Override
    public String handleResponse(HttpResponse response)
            throws ClientProtocolException, IOException
    {
        HttpUriRequest currentReq = (HttpUriRequest) localContext
                .getAttribute(ExecutionContext.HTTP_REQUEST);
        int status = response.getStatusLine().getStatusCode();
        if (status == HttpStatus.SC_OK)
        {
            return handleOk(response, currentReq);
        }
        if (status == HttpStatus.SC_MOVED_TEMPORARILY
                || status == HttpStatus.SC_MOVED_PERMANENTLY)
        {
            return followLocation(response, currentReq);
        }
        return null;
    }

    /** Handles a 200 response; see the class comment for the rules. */
    private String handleOk(HttpResponse response, HttpUriRequest currentReq)
            throws ClientProtocolException, IOException
    {
        HttpEntity entity = response.getEntity();
        if (entity == null)
        {
            return null;
        }
        Header encodingHeader = entity.getContentEncoding();
        if (encodingHeader != null)
        {
            String contentEncoding = encodingHeader.getValue();
            if (contentEncoding != null && contentEncoding.contains("gzip"))
            {
                entity = new GzipDecompressingEntity(entity);
            }
        }
        String declaredCharset = EntityUtils.getContentCharSet(entity);
        String charset = declaredCharset == null ? ENCODE : declaredCharset;

        Header[] loginFlags = response.getHeaders("passport.login.flag");
        if (loginFlags.length > 0)
        {
            return "{\"@code\":\"1\",\"@desc\":\"用户未登录\",\"xml\":{\"@errorCode\":\"common.2.userNotLoggedIn\"}}";
        }

        String responseBody = new String(EntityUtils.toByteArray(entity), charset);
        if (currentReq == null)
        {
            // Without the original request neither the method nor a base url is
            // known, so a meta refresh cannot be followed.
            return responseBody;
        }

        // Follow a meta refresh if the page asks for one.
        Pattern redirPtn = Pattern.compile(
                "<meta http-equiv=(?:'|\")refresh(?:'|\").*url=(.*)(?:'|\") />",
                Pattern.CASE_INSENSITIVE);
        Matcher matcher = redirPtn.matcher(responseBody);
        while (matcher.find())
        {
            String refreshUrl = matcher.group(1);
            // https targets are absolute too; only prefix genuinely relative ones.
            boolean absolute = refreshUrl.startsWith("http:")
                    || refreshUrl.startsWith("https:");
            if (!absolute)
            {
                HttpHost currentHost = (HttpHost) localContext
                        .getAttribute(ExecutionContext.HTTP_TARGET_HOST);
                String base = currentReq.getURI().isAbsolute()
                        ? currentReq.getURI().toString()
                        : (currentHost.toURI() + currentReq.getURI());
                refreshUrl = String.format("%s%s", base, refreshUrl);
            }
            if (currentReq.getMethod().equalsIgnoreCase(HTTP_POST))
            {
                return post(refreshUrl, null, "");
            }
            else if (currentReq.getMethod().equalsIgnoreCase(HTTP_GET))
            {
                // Do not switch this to the cached variant (note from wujj).
                return getWithoutCache(refreshUrl, null, "");
            }
        }
        return responseBody;
    }

    /** Handles a 301/302 response by re-issuing the request against the Location header. */
    private String followLocation(HttpResponse response, HttpUriRequest currentReq)
            throws ClientProtocolException, IOException
    {
        Header locationHeader = response.getLastHeader("location");
        if (locationHeader == null || currentReq == null)
        {
            // Nothing to follow, or no way to tell which method to use.
            return null;
        }
        String location = locationHeader.getValue();
        if (currentReq.getMethod().equalsIgnoreCase(HTTP_POST))
        {
            return post(location, null, "");
        }
        else if (currentReq.getMethod().equalsIgnoreCase(HTTP_GET))
        {
            return getWithoutCache(location, null, "");
        }
        return null;
    }
}
/**
 * @return the cookie string that is forced onto every request, or {@code null} if none is set
 */
public String getCookie() {
    return this.cookie;
}
/**
 * Sets the value sent verbatim as the {@code Cookie} header of every request.
 *
 * @param cookie header value, or {@code null} to stop forcing a cookie
 */
public void setCookie(String cookie) {
    this.cookie = cookie;
}
/**
 * Stores the proxy on this instance and registers it as the client's default proxy.
 *
 * @param proxyUrl proxy host name
 * @param port     proxy port
 */
public void setProxy(String proxyUrl, int port) {
    final HttpHost target = new HttpHost(proxyUrl, port);
    this.proxy = target;
    mClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, target);
}
}