HttpClient之Java爬虫工具
工具协议
:HTTP/HTTPS 协议
引入依赖
<dependencies>
<!-- Apache HttpClient: supplies the org.apache.http.* classes used to send the GET request. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<!-- Fastjson: supplies com.alibaba.fastjson.JSONObject / JSONArray, which the utility class imports. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.83</version>
</dependency>
</dependencies>
编写工具类
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
/**
 * Crawler utility that requests one page of comments for a product from the
 * {@code club.jd.com} comment endpoint and prints selected fields of every
 * comment to standard output.
 */
public class HttpClientUtils {

    /** Matches the numeric product id in a path segment such as {@code /100034304878.html}. */
    private static final Pattern PRODUCT_ID_PATTERN = Pattern.compile("/(\\d+)\\.html");

    /** Number of leading characters of {@code creationTime} that are printed. */
    private static final int DATE_PREFIX_LENGTH = 10;

    /**
     * Fetches the comments for the product referenced by {@code url1} and prints them.
     *
     * <p>The method never propagates an exception: every failure is reported as a
     * message on standard output, so existing callers need no error handling.
     *
     * @param url1 product page URL whose path contains {@code /<digits>.html},
     *             for example {@code https://item.jd.com/100034304878.html}
     */
    public void spaderUtil(String url1) {
        String productId = extractProductId(url1);
        if (productId == null) {
            // Previously a URL of unexpected length silently produced "productId=null" in the request.
            System.out.println("无法从链接中解析商品编号:" + url1);
            return;
        }

        String url = "https://club.jd.com/comment/productPageComments.action?productId=" + productId + "&score=0&sortType=5&page=0&pageSize=10";

        HttpGet httpGet = new HttpGet(url);
        // Send a browser-like user-agent header with the request.
        httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36");

        // try-with-resources closes the response first and then the client, even on failure.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            JSONObject object = JSONObject.parseObject(EntityUtils.toString(response.getEntity()));
            // parseObject returns null for an empty body, and "comments" may be absent.
            JSONArray rawComment = object == null ? null : object.getJSONArray("comments");
            if (rawComment == null) {
                System.out.println("响应中没有评论数据");
                return;
            }
            for (int i = 0; i < rawComment.size(); i++) {
                JSONObject comment = rawComment.getJSONObject(i);
                if (comment != null) {
                    printComment(comment);
                }
            }
        } catch (IOException e) {
            // Transport-level failure while sending the request or reading the body.
            System.out.println("发送请求失败,请检查您的网络");
        } catch (RuntimeException e) {
            // Malformed JSON or an unusable entity: not a network problem, so report it separately.
            System.out.println("解析评论数据失败:" + e.getMessage());
        }
    }

    /**
     * Extracts the numeric product id from a product page URL.
     *
     * @return the digits preceding {@code .html}, or {@code null} when {@code pageUrl}
     *         is {@code null} or contains no such segment
     */
    private static String extractProductId(String pageUrl) {
        if (pageUrl == null) {
            return null;
        }
        Matcher matcher = PRODUCT_ID_PATTERN.matcher(pageUrl);
        return matcher.find() ? matcher.group(1) : null;
    }

    /** Prints the selected fields of one comment; a missing field is printed as {@code null}. */
    private static void printComment(JSONObject comment) {
        String creationTime = comment.getString("creationTime");
        if (creationTime != null && creationTime.length() > DATE_PREFIX_LENGTH) {
            // Keep only the leading date part, as the original substring(0, 10) did.
            creationTime = creationTime.substring(0, DATE_PREFIX_LENGTH);
        }
        System.out.println("创建评论日期:" + creationTime);
        System.out.println("打分:" + comment.getString("score"));
        System.out.println("评论内容:" + comment.getString("content"));
        System.out.println("手机版本号:" + comment.getString("mobileVersion"));
        System.out.println("会员等级:" + comment.getString("userClient"));
        System.out.println("收货后" + comment.getString("days") + "天后点击确认收获");
    }
}
测试效果
/** Manual driver for the crawler utility. */
public class Test {

    /**
     * Calls {@code HttpClientUtils.spaderUtil} once with a sample product page URL.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        final String productPage = "https://item.jd.com/100034304878.html";
        HttpClientUtils crawler = new HttpClientUtils();
        crawler.spaderUtil(productPage);
    }
}
测试结果: