Requirements
We are going to crawl data from certain news pages on 人民网 (People's Daily Online). How do we go about it?
The first step is: given a URL, issue the HTTP request ourselves and retrieve the content of the page.
Simulating the request
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class HttpClientUtil {

    static final int timeOut = 5 * 60 * 1000; // connect/socket timeout in milliseconds (5 minutes)

    /**
     * Simulate an HTTP POST request (HTTPS supported).
     *
     * @param url      resource address
     * @param map      request parameters
     * @param encoding response encoding
     * @return response body as a String
     * @throws NoSuchAlgorithmException
     * @throws KeyManagementException
     * @throws IOException
     * @throws ClientProtocolException
     */
    public static String post(String url, Map<String, String> map, String encoding)
            throws KeyManagementException, NoSuchAlgorithmException, ClientProtocolException, IOException {
        String results = "";
        // Handle HTTPS by skipping certificate verification
        SSLContext sslcontext = SSLClient.createIgnoreVerifySSL();
        // Hostname verifier that accepts any host (logs the mismatch and returns true)
        HostnameVerifier hv = new HostnameVerifier() {
            public boolean verify(String urlHostName, SSLSession session) {
                System.out.println("Warning: URL Host: " + urlHostName + " vs. " + session.getPeerHost());
                return true;
            }
        };
        // Register socket factories for both http and https
        LayeredConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslcontext, hv);
        Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", sslsf)
                .build();
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
        // Global cookie policy: ignore cookies
        RequestConfig config = RequestConfig.custom().setCookieSpec(CookieSpecs.IGNORE_COOKIES).build();
        // Build the customized HttpClient
        CloseableHttpClient client = HttpClients.custom()
                .setConnectionManager(connManager)
                .setDefaultRequestConfig(config)
                .build();
        // Create the POST request
        HttpPost httpPost = new HttpPost(url);
        // Connect and socket timeouts, in milliseconds
        RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(timeOut).setConnectTimeout(timeOut).build();
        httpPost.setConfig(requestConfig);
        // Fill in the form parameters
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        if (map != null) {
            for (Entry<String, String> entry : map.entrySet()) {
                nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        // Attach the parameters to the request
        httpPost.setEntity(new UrlEncodedFormEntity(nvps, encoding));
        System.out.println("Request URL: " + url);
        System.out.println("Request parameters: " + nvps.toString());
        // Set the Content-type and User-Agent headers
        httpPost.setHeader("Content-type", "application/x-www-form-urlencoded");
        httpPost.setHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // Execute the request (synchronous, blocking)
        CloseableHttpResponse response = client.execute(httpPost);
        if (response.getStatusLine().getStatusCode() != 200) {
            // Non-200 status: wait 3 seconds and retry once
            System.out.println("*** response status code != 200, retrying ***");
            try {
                response.close();
                Thread.sleep(3000);
                response = client.execute(httpPost);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        // Read the response entity as a String using the given encoding
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            results = EntityUtils.toString(entity, encoding);
        }
        EntityUtils.consume(entity);
        // Release the connection
        response.close();
        return results;
    }
    /**
     * Simulate an HTTP GET request (HTTPS supported).
     *
     * @param url    resource address
     * @param params optional varargs; the first element, if present, overrides the page encoding
     * @return response body as a String (empty on failure)
     */
    public static String get(String url, String... params) {
        String results = "";
        try {
            String encoding = CommonVariable.encoding_default;
            if (params.length > 0) {
                encoding = params[0];
            }
            // Handle HTTPS by skipping certificate verification
            SSLContext sslcontext = SSLClient.createIgnoreVerifySSL();
            // Hostname verifier that accepts any host
            HostnameVerifier hv = new HostnameVerifier() {
                public boolean verify(String urlHostName, SSLSession session) {
                    return true;
                }
            };
            // Register socket factories for both http and https
            LayeredConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslcontext, hv);
            Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
                    .register("http", PlainConnectionSocketFactory.INSTANCE)
                    .register("https", sslsf)
                    .build();
            PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
            // Global cookie policy: ignore cookies
            RequestConfig config = RequestConfig.custom().setCookieSpec(CookieSpecs.IGNORE_COOKIES).build();
            // Build the customized HttpClient
            CloseableHttpClient client = HttpClients.custom()
                    .setConnectionManager(connManager)
                    .setDefaultRequestConfig(config)
                    .build();
            // Create the GET request
            HttpGet httpGet = new HttpGet(url);
            // Connect and socket timeouts, in milliseconds
            RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(timeOut).setConnectTimeout(timeOut).build();
            httpGet.setConfig(requestConfig);
            // Set the Content-type and User-Agent headers
            httpGet.setHeader("Content-type", "application/x-www-form-urlencoded");
            httpGet.setHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            // Execute the request (synchronous, blocking)
            CloseableHttpResponse response = client.execute(httpGet);
            if (response.getStatusLine().getStatusCode() != 200) {
                // Non-200 status: log a warning and fall through (a retry could be added here)
                System.out.println("Warning: response status code = " + response.getStatusLine().getStatusCode());
            }
            // Read the response entity as a String using the given encoding
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                results = EntityUtils.toString(entity, encoding);
            }
            EntityUtils.consume(entity);
            // Release the connection
            response.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return results;
    }
}
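The class above references two helpers that are not shown in this section: SSLClient.createIgnoreVerifySSL() and the constant CommonVariable.encoding_default. The sketch below shows what such helpers might look like, assuming a standard trust-everything SSLContext and UTF-8 as the default page encoding; the actual implementations in the project may differ.

// Illustrative sketch of the helpers assumed by HttpClientUtil
// (names taken from the code above; bodies are assumptions, not the original implementations).
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

public class SSLClient {
    /**
     * Build an SSLContext whose trust manager accepts every certificate,
     * so HTTPS pages can be fetched without certificate verification.
     */
    public static SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
        X509TrustManager trustAll = new X509TrustManager() {
            public void checkClientTrusted(X509Certificate[] chain, String authType) { }
            public void checkServerTrusted(X509Certificate[] chain, String authType) { }
            public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; }
        };
        SSLContext sslContext = SSLContext.getInstance("TLS");
        sslContext.init(null, new TrustManager[] { trustAll }, null);
        return sslContext;
    }
}

class CommonVariable {
    // Assumed default page encoding; switch to "GBK"/"GB2312" for pages encoded that way
    static final String encoding_default = "UTF-8";
}

With these in place, fetching a page is a single call, for example: String html = HttpClientUtil.get("http://www.people.com.cn/");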
Of course, this step can also be handled by one of the many mature libraries instead of a hand-rolled utility class; a small example is sketched below.
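For instance, Jsoup (https://jsoup.org) can fetch and parse a page in a couple of lines. The snippet is only a minimal sketch of that alternative; the URL, user agent and timeout are placeholder values.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupFetchDemo {
    public static void main(String[] args) throws Exception {
        // Jsoup handles the connection, encoding detection and HTML parsing in one call
        Document doc = Jsoup.connect("http://www.people.com.cn/")
                .userAgent("Mozilla/5.0")
                .timeout(30 * 1000)   // 30-second timeout
                .get();
        System.out.println(doc.title());
    }
}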
The next section will explain in detail how to extract the specific content we want from the fetched pages.