使用"HttpClient"爬取网页的代码示例 - 快速上手
依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.14</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
示例
使用JDK原生API
发送HTTP请求
文件01
HttpURLConnectionDemo
代码01
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
/**
 * Fetches a web page with the JDK's built-in {@link HttpURLConnection} API
 * and prints the response body line by line.
 */
public class HttpURLConnectionDemo {
    public static void main(String[] args) throws IOException {
        HttpURLConnection connection =
                (HttpURLConnection) new URL("https://www.baidu.com/").openConnection();

        // An HTTP request consists of: request line, headers, a blank line, then the body.
        // Request method (part of the request line).
        connection.setRequestMethod("GET");
        // Request header.
        connection.setRequestProperty("Accept-Charset", "utf-8");

        // Read the response body as UTF-8 text; both resources are closed automatically.
        try (InputStream body = connection.getInputStream();
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(body, StandardCharsets.UTF_8))) {
            for (String row = reader.readLine(); row != null; row = reader.readLine()) {
                System.out.println(row);
            }
        }
    }
}
结果01
使用HttpClient
发送"无参GET"请求
文件01
HttpClientDemoA
代码01
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
 * Sends a GET request without query parameters using Apache HttpClient
 * and prints the response body.
 */
public class HttpClientDemoA {
    public static void main(String[] args) {
        String urlStr = "https://www.baidu.com/";
        // Build the GET request object.
        HttpGet httpGet = new HttpGet(urlStr);
        // Browser-like User-Agent so the server does not treat the client as a bot.
        // The header name must not contain the ':' separator.
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47");
        // Referer header to get past hotlink protection; the value is the URL of the protected site.
        httpGet.addHeader("Referer", "https://www.baidu.com/");

        // The closeable client plays the role of an opened browser. try-with-resources closes
        // the response first and the client last (reverse declaration order).
        try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
             CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
            // HttpEntity represents the response body here; it is also used for request bodies.
            HttpEntity entity = response.getEntity();
            // EntityUtils is the helper class for working with HttpEntity.
            String toStringResult = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            System.out.println(toStringResult);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
结果01
使用HttpClient
发送"有参GET"请求
文件01
HttpClientDemoB
代码01
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
/**
 * Sends a GET request with a URL-encoded query parameter using Apache HttpClient
 * and prints the response body.
 */
public class HttpClientDemoB {
    public static void main(String[] args) throws UnsupportedEncodingException {
        String passwordParam = "123+abc 456|789";
        // A browser URL-encodes query values automatically; here it has to be done by hand.
        // The encoded form of the value above is 123%2Babc+456%7C789.
        String encodedPassword = URLEncoder.encode(passwordParam, StandardCharsets.UTF_8.name());
        // Append the encoded value as a query parameter so it is actually sent.
        String urlStr = "https://www.baidu.com/?password=" + encodedPassword;
        // Build the GET request object.
        HttpGet httpGet = new HttpGet(urlStr);
        // Browser-like User-Agent so the server does not treat the client as a bot.
        // The header name must not contain the ':' separator.
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47");
        // Referer header to get past hotlink protection; the value is the URL of the protected site.
        httpGet.addHeader("Referer", "https://www.baidu.com/");

        // try-with-resources closes the response first and the client last.
        try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
             CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
            // HttpEntity represents the response body here; it is also used for request bodies.
            HttpEntity entity = response.getEntity();
            // EntityUtils is the helper class for working with HttpEntity.
            String toStringResult = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            System.out.println(toStringResult);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
保存网络上的图片到本地
文件01
GetPicture
代码01
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.FileOutputStream;
import java.io.IOException;
/**
 * Downloads an image over HTTP and saves it to a local file whose extension
 * is derived from the response's Content-Type.
 */
public class GetPicture {
    public static void main(String[] args) {
        String urlStr = "https://www.baidu.com/img/pc_675fe66eab33abff35a2669768c43d95.png";
        HttpGet httpGet = new HttpGet(urlStr);
        // The header name must not contain the ':' separator.
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47");

        // try-with-resources closes the response first and the client last.
        try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
             CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                System.err.println("No response body returned for " + urlStr);
                return;
            }
            // Content-Type looks like image/<format>, e.g. image/jpg, image/jpeg, image/png.
            // The header may be absent, in which case the default suffix is used.
            String contentType = entity.getContentType() == null
                    ? null
                    : entity.getContentType().getValue();
            // Read the whole image into memory.
            byte[] bytes = EntityUtils.toByteArray(entity);
            String localAbsPath = "e:\\abc" + suffixFor(contentType);
            // Close the file even when the write fails.
            try (FileOutputStream fos = new FileOutputStream(localAbsPath)) {
                fos.write(bytes);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Maps a Content-Type value to a file suffix.
     *
     * @param contentType the Content-Type header value, may be {@code null}
     * @return ".jpg", ".bmp", ".png" or ".gif"; ".jpg" when the type is missing or unrecognized
     */
    private static String suffixFor(String contentType) {
        if (contentType == null) {
            return ".jpg";
        }
        if (contentType.contains("jpg") || contentType.contains("jpeg")) {
            return ".jpg";
        }
        if (contentType.contains("bmp") || contentType.contains("bitmap")) {
            return ".bmp";
        }
        if (contentType.contains("png")) {
            return ".png";
        }
        // "image/gif" contains no dot, so match on "gif" rather than ".gif".
        if (contentType.contains("gif")) {
            return ".gif";
        }
        return ".jpg";
    }
}
结果01
设置代理实现访问
文件01
HttpClientDemoC
代码01
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
 * Sends a GET request through an HTTP proxy and prints the response body.
 */
public class HttpClientDemoC {
    public static void main(String[] args) {
        String urlStr = "https://www.baidu.com/";
        HttpGet httpGet = new HttpGet(urlStr);
        // Create the proxy host.
        String ip = "221.5.80.66";
        int port = 3128;
        HttpHost proxy = new HttpHost(ip, port);
        // Per-request configuration; it overrides the client's default request configuration.
        RequestConfig requestConfig = RequestConfig.custom()
                .setProxy(proxy)
                .build();
        httpGet.setConfig(requestConfig);

        // try-with-resources closes the response first and the client last.
        try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
             CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            String toStringResult = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            System.out.println(toStringResult);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
设置连接超时和请求超时
文件01
HttpClientDemoD
代码01
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
 * Sends a GET request with explicit connect, socket (read) and
 * connection-request timeouts, and prints the response body.
 */
public class HttpClientDemoD {
    public static void main(String[] args) {
        String urlStr = "https://www.baidu.com/";
        HttpGet httpGet = new HttpGet(urlStr);
        // Per-request configuration; it overrides the client's default request configuration.
        RequestConfig requestConfig = RequestConfig.custom()
                // Connect timeout in ms: upper bound for completing the TCP three-way handshake.
                .setConnectTimeout(5000)
                // Socket (read) timeout in ms: maximum wait for response data from the requested URL.
                .setSocketTimeout(3000)
                // Timeout in ms for obtaining a connection from the connection pool.
                .setConnectionRequestTimeout(5000)
                .build();
        httpGet.setConfig(requestConfig);

        // try-with-resources closes the response first and the client last.
        try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
             CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            String toStringResult = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            System.out.println(toStringResult);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
结果01
使用HttpClient
发送"表单类型的POST"请求
文件01
HttpClientDemoE
代码01
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
 * Sends a form POST request ("application/x-www-form-urlencoded")
 * and prints the response body.
 */
public class HttpClientDemoE {
    public static void main(String[] args) {
        String urlStr = "https://www.baidu.com/";
        // Create the POST request object.
        HttpPost httpPost = new HttpPost(urlStr);
        // Request header: the standard MIME type for URL-encoded form bodies.
        httpPost.addHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
        /*
         * A NameValuePair corresponds to one form field, e.g.
         * <input id="user-name-label" type="text" name="userName"/>:
         * the "name" attribute (userName) and the value typed into the input
         * together form one NameValuePair.
         */
        List<NameValuePair> list = new ArrayList<>();
        list.add(new BasicNameValuePair("userName", "乐知者java"));
        list.add(new BasicNameValuePair("password", "123456"));
        // Wrap the parameter list in the form entity so the fields are sent as the body.
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(list, StandardCharsets.UTF_8);
        httpPost.setEntity(formEntity);

        // try-with-resources closes the response first and the client last.
        try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
             CloseableHttpResponse response = closeableHttpClient.execute(httpPost)) {
            HttpEntity entity = response.getEntity();
            String toStringResult = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            System.out.println(toStringResult);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
使用HttpClient
发送"Json类型的POST"请求
文件01
HttpClientDemoF
代码01
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.BasicHttpEntity;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
 * Sends a JSON POST request ("application/json") and prints the response body.
 */
public class HttpClientDemoF {
    public static void main(String[] args) {
        String urlStr = "https://www.baidu.com/";
        // Create the POST request object.
        HttpPost httpPost = new HttpPost(urlStr);
        // The request body is a JSON string. It is written out by hand because this file
        // imports no JSON library; the values are fixed and need no escaping.
        String jsonBody = "{\"username\":\"java乐之者\",\"password\":\"123456\"}";
        StringEntity jsonEntity = new StringEntity(jsonBody, Consts.UTF_8);
        // The entity also needs its content type set; the charset is declared here.
        // Content-Encoding is deliberately not set: that header names a content coding
        // such as gzip, not a character set.
        jsonEntity.setContentType(new BasicHeader("Content-Type", "application/json; charset=UTF-8"));
        httpPost.setEntity(jsonEntity);

        // try-with-resources closes the response first and the client last.
        try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
             CloseableHttpResponse response = closeableHttpClient.execute(httpPost)) {
            HttpEntity entity = response.getEntity();
            String toStringResult = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            System.out.println(toStringResult);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
使用HttpClient
发送"上传文件的POST"请求
文件01
HttpClientDemoG
代码01
import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.FileBody;
import org.apache.http.entity.mime.content.StringBody;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.util.EntityUtils;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
 * Sends a file-upload POST request ("multipart/form-data") and prints the response body.
 *
 * <p>The {@code org.apache.http.entity.mime} classes used here ship in the separate
 * {@code org.apache.httpcomponents:httpmime} artifact.
 */
public class HttpClientDemoG {
    public static void main(String[] args) {
        String urlStr = "https://www.baidu.com/";
        // Create the POST request object.
        HttpPost httpPost = new HttpPost(urlStr);
        // A ContentBody implementation wrapping the file to upload.
        FileBody fileBody = new FileBody(new File("e:\\a.png"));
        // Builder for the multipart entity used for the upload.
        MultipartEntityBuilder builder = MultipartEntityBuilder.create();
        builder.setCharset(Consts.UTF_8); // charset
        builder.setContentType(ContentType.create(
                "multipart/form-data", Consts.UTF_8));
        builder.setMode(HttpMultipartMode.BROWSER_COMPATIBLE); // browser-compatible mode
        // Plain form fields containing Chinese text must not go through addTextBody,
        // otherwise the text is garbled; build a StringBody with an explicit charset instead.
        StringBody userNameTextBody = new StringBody("小明",
                ContentType.create("text/plain", Consts.UTF_8));
        HttpEntity httpEntity = builder.addPart("fileName", fileBody)
                // Files can also be uploaded from a File, byte[] or InputStream.
                .addBinaryBody("fileName", new File("e:\\b.png"))
                .addPart("username", userNameTextBody)
                .addTextBody("password", "123456")
                .build();
        httpPost.setEntity(httpEntity);

        // try-with-resources closes the response first and the client last.
        try (CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
             CloseableHttpResponse response = closeableHttpClient.execute(httpPost)) {
            HttpEntity entity = response.getEntity();
            String toStringResult = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            System.out.println(toStringResult);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
配置HttpClient
绕过"https"安全认证
文件01
HttpClientDemoH
代码01
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;
import org.apache.http.util.EntityUtils;
import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
/**
 * Configures an HttpClient that bypasses HTTPS certificate validation, then
 * sends a GET request and prints the response body.
 *
 * <p>WARNING: the socket factory built here trusts every certificate and skips
 * host name verification, so it offers no protection against impersonation.
 * Use it for testing only.
 */
public class HttpClientDemoH {
    /**
     * Creates the socket factory used for "https" connections.
     *
     * @return a factory whose SSL context accepts any certificate chain and
     *         which performs no host name verification
     * @throws Exception if the SSL context cannot be built
     */
    private ConnectionSocketFactory trustHttpsCertificates() throws Exception {
        SSLContextBuilder sslContextBuilder = new SSLContextBuilder();
        sslContextBuilder.loadTrustMaterial(null, new TrustStrategy() {
            // Decides whether the presented certificate chain is trusted; always yes here.
            @Override
            public boolean isTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
                return true;
            }
        });
        SSLContext sslContext = sslContextBuilder.build();
        return new SSLConnectionSocketFactory(
                sslContext,
                new String[]{"SSLv2Hello", "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"},
                null,
                NoopHostnameVerifier.INSTANCE);
    }

    /**
     * Builds the customized client, performs the GET request and prints the body.
     *
     * @throws Exception if the trust-all socket factory cannot be created
     */
    public void start() throws Exception {
        Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", trustHttpsCertificates())
                .build();
        // Connection manager that uses the socket factories registered above.
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager(registry);
        // Customize the client; build() returns the configured CloseableHttpClient.
        HttpClientBuilder httpClientBuilder = HttpClients.custom().setConnectionManager(pool);
        String urlStr = "https://zcdsade.cfd//";
        HttpGet httpGet = new HttpGet(urlStr);

        // try-with-resources closes the response first and the client last.
        try (CloseableHttpClient closeableHttpClient = httpClientBuilder.build();
             CloseableHttpResponse response = closeableHttpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            String toStringResult = EntityUtils.toString(entity, StandardCharsets.UTF_8);
            System.out.println(toStringResult);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws Exception {
        HttpClientDemoH httpClientDemo = new HttpClientDemoH();
        httpClientDemo.start();
    }
}
【功能实现】
[获取响应头以及相应的"Content-Type"]
sdf
CloseableHttpResponse response = closeableHttpClient.execute(httpGet);
aa
StatusLine statusLine = response.getStatusLine();
sdf
// 代表本次请求的成功、失败的状态
int statusCode = statusLine.getStatusCode();
afa
// 获取所有的响应头
Header[] allHeaders = response.getAllHeaders();
for (Header header : allHeaders) {
    System.out.println("响应头" + header.getName() + "的值:" + header.getValue());
}
ewwsr
// 获取响应头"Content-Type"的值
HttpEntity entity = response.getEntity();
System.out.println("ContentType的值:" + entity.getContentType());