HttpClient 抓取网页

工作间隙,看到一个问题:要用 HttpClient 抓取 A 网页的数据,但必须登录后才能访问 A,问怎么传递登录时用到的 cookie 值。

记得同一个 HttpClient 实例在请求过程中会自动记录 cookie 等数据,所以登录请求和后续抓取请求只要共用同一个实例即可。

以下代码 用的是 org.apache.http 下的工具类

package com.test.demo1;

import java.util.List;
import java.util.ArrayList;
import java.io.IOException;
import java.net.SocketTimeoutException;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.*;
import org.apache.http.util.EntityUtils;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.conn.params.ConnRoutePNames;



/**模拟登录后抓取网页数据
 * 分页等待拓展,未做
 * @author xxx
 * 用的包是  http://hc.apache.org/downloads.cgi 的HttpClient 4.3.6 (GA) 
 */
public class HttpClientDataDemo {
	
	@SuppressWarnings("deprecation")
	private static final HttpClient client = new DefaultHttpClient(); // Single shared client so the login request and the target request go through the same instance
	
	private static final String username = "im_user";	// Login account
	private static final String password = "im_password";	// Login password
	private static final String aimUrl = "http://pindao.xxx.com/diannao.htm"; // Target page URL
	private static final String login = "https://passport.xxx.com/ids/login"; // Login URL
	// Charset name passed to UrlEncodedFormEntity when the login form is encoded
	private static final String html_code = "gbk";
	
	// Some pages apply client-side restrictions, so a browser-like User-Agent is sent
	private static final String USER_AGENT = "User-Agent";
    private static final String AGENT = "Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30";

    /** Execution context handed to client.execute(...) by the instance-level post method. */
    private HttpContext localContext = new BasicHttpContext();
    
    // When non-null, this value is forced onto requests as the Cookie header
    private String cookie;
    // Proxy server
    private HttpHost proxy;
    
	/**
	 * Demo entry point: logs in with the configured credentials, then requests the target page.
	 *
	 * @param args unused
	 * @throws IOException propagated from {@code login} or {@code getData}
	 */
	public static void main(String[] args) throws IOException {
		// Log in first
		login(username, password);
		// Follow-up request made with the same client
		getData(aimUrl);
	}

	/**
	 * Issues a GET for {@code url} through the shared client, then prints the
	 * status line and hands the response entity to {@code print}.
	 *
	 * @param url address to request
	 * @throws IOException if executing the request or reading the entity fails
	 */
	static void getData(String url) throws IOException {
		HttpResponse response = client.execute(new HttpGet(url));
		System.out.println("getData status:" + response.getStatusLine());
		print(response.getEntity());
	}

	/**
	 * Posts the login form through the shared client and prints the response
	 * headers and status line.
	 *
	 * @param user value sent as the {@code username} form field
	 * @param pwd value sent as the {@code password} form field
	 * @throws IOException if the form cannot be encoded or the request fails
	 */
	static void login(String user, String pwd) throws IOException {
		HttpPost post = new HttpPost(login);

		// Form fields: each key must match the "name" attribute of the corresponding input in the login form.
		List<NameValuePair> qparams = new ArrayList<NameValuePair>();
		qparams.add(new BasicNameValuePair("username", user));
		qparams.add(new BasicNameValuePair("password", pwd));
		qparams.add(new BasicNameValuePair("url", aimUrl));

		// Encode the form with the page charset.
		post.setEntity(new UrlEncodedFormEntity(qparams, html_code));

		HttpResponse response = client.execute(post);
		// Printing the Header[] directly would only show the array identity; render each header instead.
		System.out.println(java.util.Arrays.toString(response.getAllHeaders()));
		System.out.println("login status:" + response.getStatusLine());
		// The body is not needed, so abort the request to give up the connection.
		post.abort();
	}

	/**
	 * Prints the content length and the body of a response entity.
	 *
	 * @param entity response entity; nothing is printed when it is {@code null}
	 * @throws IOException if the entity content cannot be read
	 */
	private static void print(HttpEntity entity) throws IOException {
		if (entity == null) {
			return;
		}
		// Content length as reported by the entity
		System.out.println("Response content length: " + entity.getContentLength());
		// Decode with the page charset when the response does not declare one;
		// the single-argument overload would fall back to ISO-8859-1 and garble GBK text.
		System.out.println("Response content: " + EntityUtils.toString(entity, html_code));
	}
	
	// For proxy handling and similar cases, see the Caller class.
	/**
	 * Posts {@code entity} to {@code url} with the browser User-Agent, a gzip
	 * Accept-Encoding header and, when set, the forced Cookie header.
	 *
	 * @param entity request body
	 * @param url address to post to
	 * @return whatever the response handler returns, or {@code null} when an
	 *         IOException or NullPointerException was caught
	 */
	private String post(HttpEntity entity, String url)
	{
		HttpPost httpPost = new HttpPost(url);
		httpPost.addHeader(USER_AGENT, AGENT);
		httpPost.addHeader("Accept-Encoding", "gzip");
		if (cookie != null) {
			httpPost.setHeader("Cookie", cookie);
		}
		httpPost.setEntity(entity);

		try
		{
			RedirectionResponsehandler handler = new RedirectionResponsehandler();
			return proxy == null
					? client.execute(httpPost, handler, localContext)
					: client.execute(proxy, httpPost, handler, localContext);
		}
		catch (IOException e)
		{
			// Also covers SocketTimeoutException and ClientProtocolException, which are IOException subclasses.
			e.printStackTrace();
		}
		catch (NullPointerException e)
		{
			e.printStackTrace();
		}
		return null;
	}
	
	/**
     * Handles responses through the ResponseHandler interface. Per the original author's note,
     * HttpClient manages connection release automatically when a ResponseHandler is used.
     * This demo version is a placeholder: handleResponse ignores the response and returns null.
     */
    private class RedirectionResponsehandler implements ResponseHandler<String>
    {

		@Override
		public String handleResponse(HttpResponse response)
				throws ClientProtocolException, IOException {
			// TODO Auto-generated method stub
			// Not implemented yet: always yields null, so post(...) returns null as well.
			return null;
		}
    	
    }
	
	/**
	 * @return the Cookie header value forced onto requests, or null when none has been set
	 */
	public String getCookie() {
        return cookie;
    }

    /**
     * Sets the value that post(...) sends as the Cookie header when it is non-null.
     *
     * @param cookie raw Cookie header value
     */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }
    
    /**
     * Stores a proxy host on this instance and registers it as the default
     * proxy parameter of the shared client.
     *
     * @param proxyUrl proxy host name
     * @param port proxy port
     */
    public void setProxy(String proxyUrl, int port) {
        HttpHost proxyHost = new HttpHost(proxyUrl, port);
        this.proxy = proxyHost;
        client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
    }

}

下面附上参考代码(Caller 类):

package com.suning.ebuy.ott.util;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.net.ssl.SSLHandshakeException;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HTTP;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.slf4j.LoggerFactory;


/**
 */
public class Caller
{
    /** Charset used to encode request bodies and as the fallback when decoding responses. */
    public static final String ENCODE = HTTP.UTF_8;
    private static final String USER_AGENT = "User-Agent";
    private static final String HTTP_GET = "GET";
    private static final String HTTP_POST = "POST";
    private static final String AGENT = "Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30";

    /** URLs that do not contain this fragment get the configured root URL prepended by the private get method. */
    private static final String HTTP_STR = "xxx.com";

    private org.slf4j.Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Whether the next request is an HTTPS request.
     */
    private boolean isHttpsRequest;

    /**
     * DefaultHttpClient instance used to execute requests.
     */
    private final DefaultHttpClient mClient;

    /**
     * Execution context shared by all requests made through this instance.
     */
    private HttpContext localContext = new BasicHttpContext();

    /**
     * When non-null, this value is forced onto requests as the Cookie header.
     */
    private String cookie;
    /**
     * Proxy server.
     */
    private HttpHost proxy;

    /**
     * Creates the underlying client via CommonHttpClient and installs this
     * instance's retry handler on it.
     */
    public Caller()
    {
        mClient = CommonHttpClient.createHttpClient();
        mClient.setHttpRequestRetryHandler(mRequestRetryHandler);
    }

    /**
     * Replaces the cookie store of the underlying client.
     *
     * @param cookieStore cookie store to install
     */
    public void setCookieStore(CookieStore cookieStore)
    {
        mClient.setCookieStore(cookieStore);
    }

    /**
     * @return the cookie store currently used by the underlying client
     */
    public CookieStore getCookieStore()
    {
        return mClient.getCookieStore();
    }

    /**
     * Marks the next request as an HTTPS request. The flag is cleared again by getTempUrl().
     */
    public void setHttpsRequestOpen()
    {
        isHttpsRequest = true;
    }

    /**
     * Picks the base URL for an internal request according to the HTTPS flag
     * and clears the flag, so it applies to a single request only. Both
     * alternatives are currently the empty string.
     *
     * @return the base URL to prepend to an action
     */
    private String getTempUrl()
    {
        final boolean https = isHttpsRequest;
        // One-shot flag: always ends up false once the base URL has been chosen.
        isHttpsRequest = false;
        return https ? "" : "";
    }

    /**
     * General POST helper: form-encodes the parameters and posts them to
     * {@code url + action}.
     *
     * @param url base address
     * @param list form parameters, may be {@code null}
     * @param action path appended to {@code url}
     * @return the handled response text, or {@code null} on failure
     * @throws UnsupportedEncodingException if the form cannot be encoded
     */
    public String post(String url, List<NameValuePair> list, String action)
            throws UnsupportedEncodingException
    {
        List<NameValuePair> params = addBasicParams(list);
        HttpEntity form = new UrlEncodedFormEntity(params, ENCODE);
        String target = url + action;
        return post(form, target);
    }

    /**
     * POST helper for internal requests: form-encodes the parameters and posts
     * them to the base URL from getTempUrl() followed by {@code action}.
     *
     * @param olist form parameters, may be {@code null}
     * @param action path appended to the base URL
     * @return the handled response text, or {@code null} on failure
     */
    public String post(List<NameValuePair> olist, String action)
            throws ClientProtocolException, IOException
    {
        List<NameValuePair> params = addBasicParams(olist);
        HttpEntity form = new UrlEncodedFormEntity(params, ENCODE);
        String target = getTempUrl() + action;
        return post(form, target);
    }

    /**
     * Core POST routine: sends {@code entity} to {@code url} with the browser
     * User-Agent, a gzip Accept-Encoding header and, when set, the forced
     * Cookie header.
     *
     * @param entity request body
     * @param url absolute address to post to
     * @return the text produced by the response handler, or {@code null} when
     *         the request failed (the failure is logged at WARN level)
     */
    private String post(HttpEntity entity, String url)
    {
        String result = null;
        HttpPost request = new HttpPost(url);
        request.addHeader(USER_AGENT, AGENT);
        // The "{}" placeholder is required; without it SLF4J drops the argument.
        logger.debug("http post url {}", url);
        request.addHeader("Accept-Encoding", "gzip");
        if (cookie != null)
        {
            request.setHeader("Cookie", cookie);
        }
        request.setEntity(entity);
        try
        {
            RedirectionResponsehandler handler = new RedirectionResponsehandler();
            if (proxy != null)
            {
                result = mClient.execute(proxy, request, handler, localContext);
            }
            else
            {
                result = mClient.execute(request, handler, localContext);
            }
            logger.debug("http post result {}", result);
        }
        catch (IOException e)
        {
            // Covers SocketTimeoutException and ClientProtocolException as well (both are IOExceptions).
            logger.warn(e.getMessage(), e);
        }
        catch (NullPointerException e)
        {
            // Kept for backward compatibility: callers receive null instead of an NPE raised while handling the response.
            logger.warn(e.getMessage(), e);
        }
        return result;
    }
    
    /**
     * Posts a JSON document to {@code url + action}.
     *
     * @param url base address
     * @param action path appended to {@code url}
     * @param jsonData JSON text, sent as bytes encoded with {@link #ENCODE}
     * @return the text produced by the response handler
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException if the request fails
     */
    public String postJson(String url, String action, String jsonData) throws ClientProtocolException, IOException
    {
        String target = url + action;
        HttpPost request = new HttpPost(target);
        request.addHeader(USER_AGENT, AGENT);
        request.addHeader("Content-Type", "application/json");
        request.addHeader("Accept-Encoding", "gzip");
        if (cookie != null)
        {
            request.setHeader("Cookie", cookie);
        }
        request.setEntity(new ByteArrayEntity(jsonData.getBytes(ENCODE)));

        String result;
        if (proxy != null)
        {
            result = mClient.execute(proxy, request, new RedirectionResponsehandler(), localContext);
        }
        else
        {
            result = mClient.execute(request, new RedirectionResponsehandler(), localContext);
        }

        // Placeholders are required; without them SLF4J silently drops the arguments.
        logger.debug("request url and response result:: {}\n{}", target, result);
        return result;
    }

    /**
     * Builds the parameter list for a request. At present this only copies the
     * caller's parameters into a fresh list.
     *
     * @param olist caller-supplied parameters, may be {@code null}
     * @return a new mutable list; empty when {@code olist} is {@code null}
     */
    private List<NameValuePair> addBasicParams(List<NameValuePair> olist)
    {
        if (olist == null)
        {
            return new ArrayList<NameValuePair>();
        }
        return new ArrayList<NameValuePair>(olist);
    }

    /**
     * GET on an explicit URL, requesting the non-cached variant
     * ({@code withCache = false} is passed to the private get method).
     *
     * @param url external base link
     * @param list query parameters, may be {@code null}
     * @param action path appended to {@code url}
     * @return the handled response text, or {@code null} on failure
     */
    public String getWithoutCache(String url, List<NameValuePair> list,
            String action)
    {
        final String target = url + action;
        return get(list, target, false);
    }

    /**
     * GET on an explicit URL, requesting the cached variant
     * ({@code withCache = true} is passed to the private get method).
     *
     * @param url external base link
     * @param list query parameters, may be {@code null}
     * @param action path appended to {@code url}
     * @return the handled response text, or {@code null} on failure
     */
    public String get(String url, List<NameValuePair> list, String action)
    {
        final String target = url + action;
        return get(list, target, true);
    }

    /**
     * GET for internal requests, requesting the non-cached variant. The target
     * is the base URL from getTempUrl() followed by {@code action}.
     *
     * @param list query parameters, may be {@code null}
     * @param action path appended to the base URL
     * @return the handled response text, or {@code null} on failure
     */
    public String getWithoutCache(List<NameValuePair> list, String action)
    {
        final String target = getTempUrl() + action;
        return get(list, target, false);
    }

    /**
     * GET for internal requests, requesting the cached variant. The target is
     * the base URL from getTempUrl() followed by {@code action}.
     *
     * @param list query parameters, may be {@code null}
     * @param action path appended to the base URL
     * @return the handled response text, or {@code null} on failure
     */
    public String get(List<NameValuePair> list, String action)
    {
        final String target = getTempUrl() + action;
        return get(list, target, true);
    }

    /**
     * Plain GET of {@code url} with the browser User-Agent and, when set, the
     * forced Cookie header.
     *
     * @param url absolute address to fetch
     * @return whatever the response handler returns, or an empty string when
     *         an IOException was caught (it is logged at WARN level)
     */
    public String get(String url){
        HttpGet httpGet = new HttpGet(url);
        httpGet.addHeader(USER_AGENT, AGENT);
        try {
            if (cookie != null) {
                httpGet.setHeader("Cookie", cookie);
            }
            RedirectionResponsehandler handler = new RedirectionResponsehandler();
            return proxy != null
                    ? mClient.execute(proxy, httpGet, handler, localContext)
                    : mClient.execute(httpGet, handler, localContext);
        } catch (IOException e) {
            // ClientProtocolException is an IOException and was handled the same way.
            logger.warn(e.getMessage(), e);
            return "";
        }
    }
    /**
     * Core GET routine: appends the query parameters to {@code url}, prefixes
     * the configured root when the URL is not recognised as complete, and
     * executes the request.
     *
     * @param list query parameters, may be {@code null}
     * @param url target address; may already contain a query string
     * @param withCache currently ignored — no cache lookup is implemented here
     * @return the text produced by the response handler, or {@code null} when
     *         the request failed (the failure is logged at WARN level)
     */
    private String get(List<NameValuePair> list, String url, boolean withCache)
    {
        List<NameValuePair> params = addBasicParams(list);

        // Redirect targets may already carry a query string; join with '&' instead of adding a second '?'.
        if (!params.isEmpty())
        {
            String separator = url.contains("?") ? "&" : "?";
            url = url + separator + URLEncodedUtils.format(params, ENCODE);
        }

        // Incomplete URL (e.g. a relative redirect): prepend the configured root,
        // except for addresses that already name the known domain or the listed IP.
        if (!url.contains(HTTP_STR) && !url.contains("192.168.33.2"))
        {
            url = HttpMtsPropertiesUtil.getValue("B2C_URL_ROOT") + url;
        }

        HttpGet request = new HttpGet(url);
        // request.addHeader("Accept-Encoding", "gzip");
        request.addHeader(USER_AGENT, AGENT);
        if (cookie != null)
        {
            request.setHeader("Cookie", cookie);
        }
        // Routine tracing belongs at DEBUG level, matching the POST path.
        logger.debug("http url {}", request.getURI());

        String result = null;
        try
        {
            RedirectionResponsehandler handler = new RedirectionResponsehandler();
            if (proxy != null)
            {
                result = mClient.execute(proxy, request, handler, localContext);
            }
            else
            {
                result = mClient.execute(request, handler, localContext);
            }
        }
        catch (IOException e)
        {
            // Covers ClientProtocolException as well.
            logger.warn(e.getMessage(), e);
        }
        catch (NullPointerException e)
        {
            // Kept for backward compatibility: callers receive null instead of an NPE raised while handling the response.
            logger.warn(e.getMessage(), e);
        }

        if (result != null)
        {
            logger.debug("http result {}", result);
        }
        return result;
    }

    /**
     * Automatic recovery policy, implemented through HttpRequestRetryHandler.
     * NOTE(review): because any non-null exception ends in a retry, requests
     * that carry a body are retried too until the execution limit — confirm
     * this is intended.
     */
    private HttpRequestRetryHandler mRequestRetryHandler = new HttpRequestRetryHandler()
    {
        // Custom recovery strategy
        @Override
        public boolean retryRequest(IOException exception, int executionCount,
                HttpContext context)
        {
            // Give up once the request has been executed three times.
            if (executionCount >= 3)
            {
                return false;
            }
            // The server dropped the connection on us: retry.
            if (exception instanceof NoHttpResponseException)
            {
                return true;
            }
            // A failed SSL handshake is not retried.
            if (exception instanceof SSLHandshakeException)
            {
                return false;
            }
            Object current = context.getAttribute(ExecutionContext.HTTP_REQUEST);
            HttpRequest request = (HttpRequest) current;
            // Requests without an entity are treated as idempotent and always retried.
            boolean carriesBody = request instanceof HttpEntityEnclosingRequest;
            return !carriesBody || exception != null;
        }
    };

    /**
     * Response handler that converts a response into its body text and follows
     * redirects itself, both HTTP 301/302 and HTML meta-refresh. Per the
     * original author's note, HttpClient manages connection release
     * automatically when a ResponseHandler is used.
     */
    private class RedirectionResponsehandler implements ResponseHandler<String>
    {
        /**
         * @param response response to convert
         * @return the body text, the result of the followed redirect, a fixed
         *         "not logged in" JSON document when the passport.login.flag
         *         header is present, or {@code null} for any other status or
         *         an empty response
         */
        @Override
        public String handleResponse(HttpResponse response)
                throws ClientProtocolException, IOException
        {
            // The request that produced this response, as recorded in the shared context.
            HttpUriRequest currentReq = (HttpUriRequest) localContext
                    .getAttribute(ExecutionContext.HTTP_REQUEST);
            int status = response.getStatusLine().getStatusCode();
            if (status == HttpStatus.SC_OK)
            {
                return handleOk(response, currentReq);
            }
            if (status == HttpStatus.SC_MOVED_TEMPORARILY
                    || status == HttpStatus.SC_MOVED_PERMANENTLY)
            {
                return followLocation(response, currentReq);
            }
            return null;
        }

        /** Decodes a 200 response and follows an HTML meta-refresh if the body contains one. */
        private String handleOk(HttpResponse response, HttpUriRequest currentReq)
                throws IOException
        {
            HttpEntity entity = response.getEntity();
            if (entity == null)
            {
                return null;
            }
            Header encodingHeader = entity.getContentEncoding();
            if (encodingHeader != null)
            {
                String contentEncoding = encodingHeader.getValue();
                if (contentEncoding != null && contentEncoding.contains("gzip"))
                {
                    entity = new GzipDecompressingEntity(entity);
                }
            }
            String declaredCharset = EntityUtils.getContentCharSet(entity);
            String charset = declaredCharset == null ? ENCODE : declaredCharset;

            // The server flags "not logged in" with this header; answer with a fixed JSON document.
            if (response.getHeaders("passport.login.flag").length > 0)
            {
                return "{\"@code\":\"1\",\"@desc\":\"用户未登录\",\"xml\":{\"@errorCode\":\"common.2.userNotLoggedIn\"}}";
            }

            String responseBody = new String(EntityUtils.toByteArray(entity), charset);

            // Follow an HTML meta-refresh when the page asks for one.
            Pattern redirPtn = Pattern.compile(
                    "<meta http-equiv=(?:'|\")refresh(?:'|\").*url=(.*)(?:'|\") />",
                    Pattern.CASE_INSENSITIVE);
            Matcher matcher = redirPtn.matcher(responseBody);
            while (matcher.find())
            {
                String refreshUrl = matcher.group(1);
                // https targets are absolute too; only prefix genuinely relative URLs.
                if (!refreshUrl.startsWith("http:") && !refreshUrl.startsWith("https:"))
                {
                    HttpHost currentHost = (HttpHost) localContext
                            .getAttribute(ExecutionContext.HTTP_TARGET_HOST);
                    String rdu = currentReq.getURI().isAbsolute()
                            ? currentReq.getURI().toString()
                            : (currentHost.toURI() + currentReq.getURI());
                    refreshUrl = String.format("%s%s", rdu, refreshUrl);
                }
                if (currentReq.getMethod().equalsIgnoreCase(HTTP_POST))
                {
                    return post(refreshUrl, null, "");
                }
                else if (currentReq.getMethod().equalsIgnoreCase(HTTP_GET))
                {
                    // Do not switch this to the cached variant (original author's warning).
                    return getWithoutCache(refreshUrl, null, "");
                }
            }
            return responseBody;
        }

        /** Re-issues the request against the Location header of a 301/302 response. */
        private String followLocation(HttpResponse response, HttpUriRequest currentReq)
                throws IOException
        {
            // Take the redirect target from the headers; without it (or without
            // the originating request) there is nothing to follow.
            Header locationHeader = response.getLastHeader("location");
            if (locationHeader == null || currentReq == null)
            {
                return null;
            }
            String location = locationHeader.getValue();
            // Drain the redirect response before issuing the follow-up request on the same client.
            EntityUtils.consume(response.getEntity());
            if (currentReq.getMethod().equalsIgnoreCase(HTTP_POST))
            {
                return post(location, null, "");
            }
            if (currentReq.getMethod().equalsIgnoreCase(HTTP_GET))
            {
                return getWithoutCache(location, null, "");
            }
            return null;
        }
    }

    /**
     * @return the Cookie header value forced onto requests, or null when none has been set
     */
    public String getCookie() {
        return cookie;
    }

    /**
     * Sets the value that the request methods send as the Cookie header when it is non-null.
     *
     * @param cookie raw Cookie header value
     */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }
    
    /**
     * Stores a proxy host on this instance and registers it as the default
     * proxy parameter of the underlying client.
     *
     * @param proxyUrl proxy host name
     * @param port proxy port
     */
    public void setProxy(String proxyUrl, int port) {
        HttpHost proxyHost = new HttpHost(proxyUrl, port);
        this.proxy = proxyHost;
        mClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyHost);
    }
    
    
}




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值