【爬虫】——模拟请求

需求

我们将要爬取人民网某些新闻中的数据,那我们该如何实现呢?

首先实现第一步:根据给定的 URL 模拟发送 HTTP 请求,并获取该网页的内容。

模拟请求

/**
 * Small HTTP helper built on Apache HttpClient 4.x that issues simulated browser
 * requests (GET / form-encoded POST) and returns the response body as text.
 *
 * <p>HTTPS certificate and hostname verification are deliberately bypassed (see
 * {@code SSLClient.createIgnoreVerifySSL()} and the accept-all hostname verifier), so
 * these methods must not be used where the identity of the remote server matters.
 *
 * <p>Every call builds its own client and connection pool and closes them before
 * returning, so no connections are kept alive between calls.
 */
public class HttpClientUtil {

    // Connect and socket-read timeout in milliseconds: 30 * 10000 = 300,000 ms (5 minutes).
    // NOTE(review): the previous comment said "3 minutes", which does not match this value;
    // the value itself is left unchanged.
    static final int timeOut = 30 * 10000;

    // Headers sent with every request so that it resembles a form submission from a browser.
    private static final String CONTENT_TYPE_FORM = "application/x-www-form-urlencoded";
    private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)";

    // Pause before the single POST retry, in milliseconds.
    private static final long RETRY_DELAY_MILLIS = 3000;

    /**
     * Sends a form-encoded POST request (HTTP or HTTPS) and returns the response body.
     *
     * <p>If the first response status is not 200, the request is retried once after a
     * 3 second pause. When the retry itself fails, the body of the first response is
     * returned instead. The body is returned whatever the final status code is.
     *
     * @param url       target address
     * @param map       form parameters; may be {@code null} to send an empty form
     * @param encoding  charset name used both to encode the form and as the fallback
     *                  charset when decoding the response body
     * @return the response body, or an empty string when the response has no entity
     * @throws NoSuchAlgorithmException  if the permissive SSL context cannot be created
     * @throws KeyManagementException    if the permissive SSL context cannot be initialised
     * @throws ClientProtocolException   on an HTTP protocol error
     * @throws IOException               on a connection problem or while reading the body
     */
    public static String post(String url, Map<String,String> map,String encoding) throws KeyManagementException, NoSuchAlgorithmException, ClientProtocolException, IOException {
        CloseableHttpClient client = createClient(true);
        try {
            HttpPost httpPost = new HttpPost(url);

            // Copy the caller's parameters into the form body.
            List<NameValuePair> nvps = new ArrayList<NameValuePair>();
            if (map != null) {
                for (Entry<String, String> entry : map.entrySet()) {
                    nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
                }
            }
            httpPost.setEntity(new UrlEncodedFormEntity(nvps, encoding));

            System.out.println("请求地址:"+url);
            System.out.println("请求参数:"+nvps.toString());

            httpPost.setHeader("Content-type", CONTENT_TYPE_FORM);
            httpPost.setHeader("User-Agent", USER_AGENT);

            // Synchronous, blocking call.
            CloseableHttpResponse response = client.execute(httpPost);
            try {
                if (response.getStatusLine().getStatusCode() != 200) {
                    System.out.println("*******response.getStatusLine().getStatusCode()!=200*****");
                    CloseableHttpResponse retried = retryAfterDelay(client, httpPost);
                    if (retried != null) {
                        // Swap to the retried response first so the finally block below
                        // always closes it, then release the superseded first response.
                        CloseableHttpResponse first = response;
                        response = retried;
                        first.close();
                    }
                }
                return readBody(response, encoding);
            } finally {
                response.close();
            }
        } finally {
            // Closing the client also shuts down the connection pool it owns.
            client.close();
        }
    }

    /**
     * Sends a GET request (HTTP or HTTPS) and returns the response body.
     *
     * <p>This method never throws: any failure is printed to standard error and an
     * empty string is returned. The body is returned whatever the status code is;
     * no retry is attempted.
     *
     * @param url    target address
     * @param params optional; when present, the first element is the charset name used
     *               as the fallback when decoding the response body, otherwise
     *               {@code CommonVariable.encoding_default} is used
     * @return the response body, or an empty string on failure or when the response
     *         has no entity
     */
    public static String get(String url,String ... params) {
        String results = "";
        try {
            String encoding = CommonVariable.encoding_default;
            if (params != null && params.length > 0) {
                encoding = params[0];
            }

            CloseableHttpClient client = createClient(false);
            try {
                HttpGet httpGet = new HttpGet(url);
                httpGet.setHeader("Content-type", CONTENT_TYPE_FORM);
                httpGet.setHeader("User-Agent", USER_AGENT);

                // Synchronous, blocking call.
                CloseableHttpResponse response = client.execute(httpGet);
                try {
                    results = readBody(response, encoding);
                } finally {
                    response.close();
                }
            } finally {
                // Closing the client also shuts down the connection pool it owns.
                client.close();
            }
        } catch (Exception ex) {
            // Best effort by contract: report the failure and fall through to return "".
            ex.printStackTrace();
        }
        return results;
    }

    /**
     * Builds a client that accepts any HTTPS certificate and host name, ignores cookies
     * and applies {@link #timeOut} to both connecting and reading.
     *
     * @param logHostNames whether the hostname verifier prints the requested host and
     *                     the peer host for every HTTPS handshake
     */
    private static CloseableHttpClient createClient(final boolean logHostNames) throws KeyManagementException, NoSuchAlgorithmException, IOException {
        // Certificate validation is bypassed on purpose.
        SSLContext sslcontext = SSLClient.createIgnoreVerifySSL();

        // Accept every host name, optionally reporting what was accepted.
        HostnameVerifier acceptAll = new HostnameVerifier() {
            public boolean verify(String urlHostName, SSLSession session) {
                if (logHostNames) {
                    System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                            + session.getPeerHost());
                }
                return true;
            }
        };
        LayeredConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslcontext, acceptAll);

        // Socket factories used for each URL scheme.
        Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", sslsf)
                .build();
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);

        // A RequestConfig set on an individual request replaces the client default instead
        // of being merged with it, so the cookie policy and the timeouts are kept together
        // in one default config; otherwise the cookie policy would be silently dropped.
        RequestConfig config = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.IGNORE_COOKIES)
                .setSocketTimeout(timeOut)
                .setConnectTimeout(timeOut)
                .build();

        return HttpClients.custom()
                .setConnectionManager(connManager)
                .setDefaultRequestConfig(config)
                .build();
    }

    /**
     * Waits {@link #RETRY_DELAY_MILLIS} and executes the request once more.
     *
     * @return the new response, or {@code null} when the wait was interrupted or the
     *         retry failed (the failure is printed, never thrown)
     */
    private static CloseableHttpResponse retryAfterDelay(CloseableHttpClient client, HttpPost httpPost) {
        try {
            Thread.sleep(RETRY_DELAY_MILLIS);
            return client.execute(httpPost);
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            // The retry is best effort; the caller falls back to the first response.
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Reads the response entity as text; {@code encoding} is only the fallback charset
     * used when the entity does not declare one. Returns an empty string when the
     * response carries no entity.
     */
    private static String readBody(CloseableHttpResponse response, String encoding) throws IOException {
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return "";
        }
        // EntityUtils.toString fully consumes and closes the content stream.
        return EntityUtils.toString(entity, encoding);
    }

}

当然,这一步也可以借助许多优秀的现成框架来实现,这样就不必自己编写工具类了!

下一节会具体介绍,怎么获取想要的具体内容

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值