在抓取https开头的网页时,如果报错:unable to find valid certification path to requested target,或者出现peer not authenticated异常,原因可能是你使用的是jdk1.6,可以换成jdk1.7试试;如果还是报错,那就重新包装抓取时用到的HttpClient类(注意:这种做法会跳过服务器证书校验,只适合测试或抓取公开页面),代码如下:
新建HttpsClient类
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.http.client.HttpClient;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
public class HttpsClient {
public static DefaultHttpClient getNewHttpsClient(HttpClient httpClient){
try {
SSLContext ctx = SSLContext.getInstance("TLS");
X509TrustManager tm = new X509TrustManager() {
public X509Certificate[] getAcceptedIssuers() {
return null;
}
public void checkClientTrusted(X509Certificate[] arg0,
String arg1) throws CertificateException {
}
public void checkServerTrusted(X509Certificate[] arg0,
String arg1) throws CertificateException {
}
};
ctx.init(null, new TrustManager[] { tm }, null);
SSLSocketFactory ssf = new SSLSocketFactory(ctx,SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
SchemeRegistry registry = new SchemeRegistry();
registry.register(new Scheme("https", 443, ssf));
ThreadSafeClientConnManager mgr = new ThreadSafeClientConnManager(registry);
return new DefaultHttpClient(mgr, httpClient.getParams());
} catch (Exception ex) {
ex.printStackTrace();
return null;
}
}
}
在抓取之前,先用上面的类重新获取httpClient对象(httpClient = HttpsClient.getNewHttpsClient(httpClient);),完整的测试代码如下:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
/**
 * Small demo that downloads one HTTPS page through the certificate-agnostic
 * client produced by {@code HttpsClient} and prints its HTML.
 */
public class Test {

    /**
     * Fetches a fixed sample URL and prints the returned HTML to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url ="https://crs.edqm.eu/db/4DCGI/search?vSelectName=2&vContains=1&vtUserName=a&OK=Search&vTypeCRS=";
        String html = getPageHtml(url);
        System.out.println(html);
    }

    /**
     * Fetches the HTML of a web page.
     *
     * @param currentUrl URL of the page to download
     * @return the response body as text when the server answers 200 OK;
     *         an empty string for any other status, for a response without a
     *         body, or when an I/O error occurs (the error is printed)
     */
    public static String getPageHtml(String currentUrl) {
        HttpClient httpClient = HttpsClient.getNewHttpsClient(new DefaultHttpClient());
        if (httpClient == null) {
            // Defensive: without a client there is nothing to execute the request with.
            System.err.println("Could not create an HTTPS client for " + currentUrl);
            return "";
        }

        String html = "";
        try {
            HttpResponse response = httpClient.execute(new HttpGet(currentUrl));
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                HttpEntity entity = response.getEntity();
                // A response may legitimately carry no entity; toString() rejects null.
                if (entity != null) {
                    html = EntityUtils.toString(entity);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release pooled connections; the client is created per call and not reused.
            httpClient.getConnectionManager().shutdown();
        }
        return html;
    }
}
使用的jar:
commons-httpclient-3.1.jar
commons-logging.jar
httpclient-4.2.5.jar
httpcore-4.2.4.jar
以上代码使用jdk1.7测试通过。
源码和jar已上传http://download.youkuaiyun.com/detail/itjavaer/8172293,导入eclipse中就能运行。