Experiment Environment Setup
IDE: IntelliJ IDEA 2022.1
JDK version: JDK 15.0.1
Maven version: apache-maven-3.5.4
Browser: Microsoft Edge
Target site: Lagou (拉勾网)
IDEA Setup
1. Maven settings
2. One-click generation of serialVersionUID for Serializable classes
3. One-click package imports
4. Generate constructors, setters, and getters with Alt+Ins
5. Quick loops: type "it" and pick one of the suggested live templates; each produces a different loop style (illustrated, together with item 6, in the sketch after this list)
6. Ctrl+Alt+V: introduce a local variable for an expression's return value
7. Ctrl+Alt+L: reformat code
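As a rough illustration of items 5 and 6, this is roughly what those shortcuts expand to. It is a throwaway sketch; ShortcutDemo and its sample list are invented for this example and are not part of the crawler:

package com.position.reptile;

import java.util.Arrays;
import java.util.List;

public class ShortcutDemo {
    public static void main(String[] args) {
        // Ctrl+Alt+V on the Arrays.asList(...) expression introduces the
        // local variable "pages" for its return value.
        List<String> pages = Arrays.asList("page1", "page2", "page3");

        // Typing "it" and picking the "iter" template expands to a for-each loop.
        for (String page : pages) {
            System.out.println(page);
        }
    }
}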
I. Analyzing the Page's Data Structure
1. Open Lagou and search for 大数据 (big data).
2. Press F12 to open the developer tools and find the request positionAjax.json?needAddtionalResult=false.
3. Copy the request and response headers (browsers differ slightly here; use the values your own browser shows).
II. Writing the Data Collection Program in IDEA
1. The response JavaBean class (HttpClientResp.java)
package com.position.reptile;

import java.io.Serial;
import java.io.Serializable;

public class HttpClientResp implements Serializable {

    @Serial
    private static final long serialVersionUID = -2224539827395038194L;

    private int code;
    private String content;

    public HttpClientResp() {
    }

    public HttpClientResp(int code) {
        this.code = code;
    }

    public HttpClientResp(String content) {
        this.content = content;
    }

    public HttpClientResp(int code, String content) {
        this.code = code;
        this.content = content;
    }

    public int getCode() {
        return code;
    }

    public void setCode(int code) {
        this.code = code;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    @Override
    public String toString() {
        return "[code=" + code + ", content=" + content + "]";
    }
}
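Since the bean implements Serializable with a fixed serialVersionUID, a quick in-memory round trip can confirm it serializes and deserializes cleanly. This is a minimal sketch for checking the class; SerializationCheck is a throwaway name and not part of the collection pipeline:

package com.position.reptile;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

public class SerializationCheck {
    public static void main(String[] args) throws Exception {
        HttpClientResp original = new HttpClientResp(200, "{\"success\":true}");

        // Serialize the bean into an in-memory buffer.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
            oos.writeObject(original);
        }

        // Deserialize it again; serialVersionUID guards this round trip
        // across recompilations of the class.
        try (ObjectInputStream ois = new ObjectInputStream(
                new ByteArrayInputStream(bos.toByteArray()))) {
            HttpClientResp copy = (HttpClientResp) ois.readObject();
            System.out.println(copy); // [code=200, content={"success":true}]
        }
    }
}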
2. The HTTP request utility class (HttpClientUtils.java)
package com.position.reptile;

import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpEntityEnclosingRequestBase;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class HttpClientUtils {

    private static final String ENCODING = "UTF-8";
    // Connection and socket timeouts, in milliseconds.
    private static final int CONNECT_TIMEOUT = 6000;
    private static final int SOCKET_TIMEOUT = 6000;

    // Copy the header map onto the request.
    public static void packageHeader(Map<String, String> params, HttpRequestBase httpMethod) {
        if (params != null) {
            Set<Map.Entry<String, String>> entrySet = params.entrySet();
            for (Map.Entry<String, String> entry : entrySet) {
                httpMethod.setHeader(entry.getKey(), entry.getValue());
            }
        }
    }

    // Encode the parameter map as a URL-encoded form entity.
    public static void packageParam(Map<String, String> params,
            HttpEntityEnclosingRequestBase httpMethod) throws UnsupportedEncodingException {
        if (params != null) {
            List<NameValuePair> nvps = new ArrayList<NameValuePair>();
            Set<Map.Entry<String, String>> entrySet = params.entrySet();
            for (Map.Entry<String, String> entry : entrySet) {
                nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
            httpMethod.setEntity(new UrlEncodedFormEntity(nvps, ENCODING));
        }
    }

    // Wrap an already-executed response into an HttpClientResp.
    public static HttpClientResp getHttpClientResult(CloseableHttpResponse httpResponse)
            throws IOException {
        if (httpResponse != null && httpResponse.getStatusLine() != null) {
            String content = "";
            if (httpResponse.getEntity() != null) {
                content = EntityUtils.toString(httpResponse.getEntity(), ENCODING);
            }
            return new HttpClientResp(httpResponse.getStatusLine().getStatusCode(), content);
        }
        return new HttpClientResp(HttpStatus.SC_INTERNAL_SERVER_ERROR);
    }

    public static HttpClientResp doPost(String url, Map<String, String> headers,
            Map<String, String> params) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpPost httpPost = new HttpPost(url);
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(CONNECT_TIMEOUT)
                .setSocketTimeout(SOCKET_TIMEOUT)
                .build();
        httpPost.setConfig(requestConfig);
        packageHeader(headers, httpPost);
        packageParam(params, httpPost);
        // Execute the request here so that both the response and the client
        // are reliably closed in the finally block.
        CloseableHttpResponse httpResponse = null;
        try {
            httpResponse = httpClient.execute(httpPost);
            return getHttpClientResult(httpResponse);
        } finally {
            release(httpResponse, httpClient);
        }
    }

    // Close the response and the client, in that order.
    public static void release(CloseableHttpResponse httpResponse,
            CloseableHttpClient httpClient) throws IOException {
        if (httpResponse != null) {
            httpResponse.close();
        }
        if (httpClient != null) {
            httpClient.close();
        }
    }
}
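To smoke-test the utility class on its own, something like the following sketch should work. Here https://httpbin.org/post is just a public echo endpoint used as a stand-in target; it is not part of the original program:

package com.position.reptile;

import java.util.HashMap;
import java.util.Map;

public class HttpClientUtilsDemo {
    public static void main(String[] args) throws Exception {
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("User-Agent", "Mozilla/5.0");

        Map<String, String> params = new HashMap<String, String>();
        params.put("kd", "大数据");

        // Expect code=200 and the form parameters echoed back in the body.
        HttpClientResp resp = HttpClientUtils.doPost("https://httpbin.org/post",
                headers, params);
        System.out.println(resp.getCode());
        System.out.println(resp.getContent());
    }
}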
3. The HDFS storage utility class (HttpClientHdfsUtils.java)
package com.position.reptile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

public class HttpClientHdfsUtils {

    public static void createFileBySysTime(String url, String fileName, String data) {
        // Act as the HDFS superuser so the target directory can be created.
        System.setProperty("HADOOP_USER_NAME", "root");
        // Name the directory after the current date, e.g. /JobData/20221010.
        Calendar calendar = Calendar.getInstance();
        Date time = calendar.getTime();
        SimpleDateFormat format = new SimpleDateFormat("yyyyMMdd");
        String filePath = format.format(time);

        Configuration conf = new Configuration();
        URI uri = URI.create(url);
        try {
            FileSystem fileSystem = FileSystem.get(uri, conf);
            Path path = new Path("/JobData/" + filePath);
            if (!fileSystem.exists(path)) {
                fileSystem.mkdirs(path);
            }
            FSDataOutputStream fsDataOutputStream =
                    fileSystem.create(new Path(path.toString() + "/" + fileName));
            // copyBytes closes both streams when the last argument is true.
            IOUtils.copyBytes(new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8)),
                    fsDataOutputStream, conf, true);
            fileSystem.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
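A minimal smoke test for the class, assuming the NameNode really is reachable at hdfs://hadoop1:9000 as in the collection program below; the file name and payload here are placeholders:

package com.position.reptile;

public class HdfsWriteDemo {
    public static void main(String[] args) {
        // Writes /JobData/<today's date>/test-page containing a small sample string.
        HttpClientHdfsUtils.createFileBySysTime("hdfs://hadoop1:9000",
                "test-page", "[code=200, content=hello hdfs]");
    }
}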
4. The data collection program (HttpClientData.java)
package com.position.reptile;

import java.util.HashMap;
import java.util.Map;

public class HttpClientData {
    public static void main(String[] args) throws Exception {
        // Request headers copied from the browser's developer tools.
        // The Cookie value is session-specific; replace it with your own.
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Cookie", "privacyPolicyPopup=false; RECOMMEND_TIP=true; user_trace_token=20221009161207-8045057e-be73-4aeb-a832-fc14567d0259; LGUID=20221009161207-1db47cc9-1f90-48a4-bbca-8420c56d4b6f; index_location_city=%E5%85%A8%E5%9B%BD; __lg_stoken__=738691b2e0a0e7d60646b7e4b6c53d7910f4008d35d8a5a9b73a61fa21fbf88840341987781c7c37830616cb5bae7a9fadf5fe47775903926795b2d5be4811dfe3a8c56a62d5; JSESSIONID=ABAAAECAAEBABII4063E574A8EEDC78E88DDFBF08118DE9; WEBTJ-ID=20221010164350-183c1108d405f3-07024fa4ea1a1c-7b555471-1327104-183c1108d41beb; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22183bbcd2511fa5-0019654d0c4d5-7b555471-1327104-183bbcd251255c%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%22106.0.0.0%22%2C%22%24latest_referrer_host%22%3A%22%22%7D%2C%22%24device_id%22%3A%22183bbcd2511fa5-0019654d0c4d5-7b555471-1327104-183bbcd251255c%22%7D; LGSID=20221010164351-1f255952-2a26-469f-b32f-433689c0d3b5; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist%5F%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; X_MIDDLE_TOKEN=a98b387ae06fd2cae4734034ee2f7efd; _ga=GA1.2.2035100235.1665391681; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1665391681; _gid=GA1.2.671406612.1665391681; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1665392144; LGRID=20221010165543-2bbb81d1-7b4c-4b1b-98c2-5032984165e1; SEARCH_ID=c35f92fb4a9a494ebaacecf284bcbc38; X_HTTP_TOKEN=4d2cc8354ecc1d4d6412935661820db0420c5f5053");
        headers.put("Connection", "keep-alive");
        headers.put("Accept", "application/json, text/javascript, */*; q=0.01");
        headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6");
        headers.put("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.37");
        headers.put("Content-Type",
                "application/x-www-form-urlencoded; charset=UTF-8");
        headers.put("Referer",
                "https://www.lagou.com/jobs/list_%E5%A4%A7%E6%95%B0%E6%8D%AE/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=");
        headers.put("Origin", "https://www.lagou.com");
        headers.put("X-Requested-With", "XMLHttpRequest");
        headers.put("X-Anit-Forge-Token", "None");
        headers.put("Cache-Control", "no-cache");
        headers.put("X-Anit-Forge-Code", "0");
        headers.put("Host", "www.lagou.com");

        // Request parameters: search keyword, city, and page number.
        Map<String, String> params = new HashMap<String, String>();
        params.put("kd", "大数据");
        params.put("city", "全国");

        // Fetch result pages 1 through 30 and store each one in HDFS.
        for (int i = 1; i < 31; i++) {
            params.put("pn", String.valueOf(i));
            HttpClientResp result = HttpClientUtils.doPost(
                    "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&first=true&px=default",
                    headers, params);
            HttpClientHdfsUtils.createFileBySysTime("hdfs://hadoop1:9000",
                    "page" + i, result.toString());
            // Pause briefly between requests to avoid triggering anti-crawling checks.
            Thread.sleep(500);
        }
    }
}
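Because the site starts returning non-JSON error bodies once it decides a session is crawling, it is worth checking each response before writing it to HDFS. The helper below sketches that idea as a replacement for the unconditional write inside the loop; the "success" flag it looks for is an assumption about this endpoint's JSON, so verify the real response shape in the developer tools first:

// Hypothetical helper for HttpClientData: store a page only when the
// request clearly succeeded. The "success" check is an assumption about
// the endpoint's JSON, not something the original program confirms.
public static void storeIfOk(int pageNo, HttpClientResp result) {
    if (result.getCode() == 200 && result.getContent().contains("\"success\":true")) {
        HttpClientHdfsUtils.createFileBySysTime("hdfs://hadoop1:9000",
                "page" + pageNo, result.toString());
    } else {
        System.err.println("page " + pageNo + " failed with HTTP " + result.getCode());
    }
}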
5. pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>jobcase-reptile</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>15</maven.compiler.source>
        <maven.compiler.target>15</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- Apache HttpClient, for the HTTP requests -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.4</version>
        </dependency>
        <!-- Hadoop libraries, for writing to HDFS -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.1</version>
        </dependency>
    </dependencies>
</project>
III. Running the Data Collection
1. Start the Hadoop cluster:
[root@hadoop1 software]# start-all.sh
2. Run HttpClientData.java.
3. Inspect the collected data (the scraped pages):
# List the collected files with the hdfs command
[root@hadoop1 software]# hdfs dfs -ls /JobData/20221010
# View one page of data
[root@hadoop1 ~]# hdfs dfs -cat /JobData/20221010/page1