一、加入依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.9</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.27</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
二、建立HttpClientDownPage这个类,请求方法分为get和post两种,代码如下:
// User-Agent header value that mimics a desktop Chrome browser on Windows.
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36";
/**
 * Fetches {@code url} with an HTTP GET and returns the response body as text.
 *
 * <p>The request carries the browser-like {@code USER_AGENT} header and uses
 * 2-second connect and socket timeouts. A 200 response body is decoded with
 * "GBK" passed as the default charset. For any other status the body (if any)
 * is printed to standard output and {@code null} is returned.
 *
 * @param url the address to request
 * @return the response body for a 200 response; {@code null} for a non-200
 *         status, a response without a body, or an I/O failure
 */
public static String sendGet(String url) {
    // Build the request before opening the client: new HttpGet(url) can throw on a
    // malformed URL, and at this point there is nothing yet that would need closing.
    HttpGet request = new HttpGet(url);
    request.setHeader("User-Agent", USER_AGENT);
    RequestConfig requestConfig = RequestConfig.custom()
            .setSocketTimeout(2000)
            .setConnectTimeout(2000)
            .build();
    request.setConfig(requestConfig);

    String html = null;
    // try-with-resources closes the response first, then the client, on every path.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(request)) {
        // Responses such as 204/304 carry no entity; guard so EntityUtils is never
        // handed null (it would throw an unchecked exception the catch below misses).
        HttpEntity httpEntity = response.getEntity();
        if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
            if (httpEntity != null) {
                html = EntityUtils.toString(httpEntity, "GBK");
            }
        } else {
            // Non-200 (e.g. 404): report it and dump whatever body the server sent.
            System.out.println("返回状态不是200");
            if (httpEntity != null) {
                System.out.println(EntityUtils.toString(httpEntity, "utf-8"));
            }
        }
    } catch (IOException e) {
        // Best-effort download: report the failure and fall through to return
        // whatever was read (null if nothing). Covers ClientProtocolException too.
        e.printStackTrace();
    }
    return html;
}
/**
 * Sends an HTTP POST to {@code url} with a URL-encoded form and returns the
 * response body as text.
 *
 * <p>The form has two fields: {@code currentpage} set to {@code param} and
 * {@code pagesize} fixed at "20". The request carries the browser-like
 * {@code USER_AGENT} header and uses 100-second connect and socket timeouts.
 * A 200 response body is decoded with "GBK" passed as the default charset.
 * For any other status the body (if any) is printed to standard output and
 * {@code null} is returned.
 *
 * @param url   the address to post to
 * @param param value of the {@code currentpage} form field
 * @return the response body for a 200 response; {@code null} for a non-200
 *         status, a response without a body, or an I/O failure
 */
public static String sendPost(String url, String param) {
    // Build the request before opening the client: new HttpPost(url) can throw on a
    // malformed URL, and at this point there is nothing yet that would need closing.
    HttpPost httpPost = new HttpPost(url);
    httpPost.setHeader("User-Agent", USER_AGENT);
    RequestConfig requestConfig = RequestConfig.custom()
            .setSocketTimeout(100000)
            .setConnectTimeout(100000)
            .build();
    // The timeouts only take effect once the config is attached to the request.
    httpPost.setConfig(requestConfig);

    List<NameValuePair> form = new LinkedList<>();
    form.add(new BasicNameValuePair("currentpage", param));
    form.add(new BasicNameValuePair("pagesize", "20"));

    String html = null;
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        // Encoding the form can throw a checked exception, so it stays inside the try.
        httpPost.setEntity(new UrlEncodedFormEntity(form, "UTF-8"));
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // Some responses carry no entity; guard so EntityUtils is never handed
            // null (it would throw an unchecked exception the catch below misses).
            HttpEntity httpEntity = response.getEntity();
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                if (httpEntity != null) {
                    html = EntityUtils.toString(httpEntity, "GBK");
                }
            } else {
                // Non-200 (e.g. 404): report it and dump whatever body the server sent.
                System.out.println("返回状态不是200");
                if (httpEntity != null) {
                    System.out.println(EntityUtils.toString(httpEntity, "utf-8"));
                }
            }
        }
    } catch (IOException e) {
        // Best-effort download: report the failure and fall through to return
        // whatever was read (null if nothing). Covers ClientProtocolException too.
        e.printStackTrace();
    }
    return html;
}
三、处理返回的网页数据,我这边是将网页上分页数据中的id进行提取。
/**
 * Collects the {@code id} attribute of every {@code <a>} element selected
 * from the elements with CSS class {@code search_div}.
 *
 * @param document the parsed page to scan
 * @return the distinct, non-empty ids found; an empty set when nothing matches
 */
public static HashSet<String> paraseList(Document document) {
    HashSet<String> ids = new HashSet<>();
    for (Element link : document.select(".search_div").select("a")) {
        String id = link.attr("id");
        // attr() yields "" for a link without an id attribute; skip those so an
        // empty string is never handed back as if it were a real record id.
        if (!id.isEmpty()) {
            ids.add(id);
        }
    }
    return ids;
}
四、最后将页面保存下来方便快速提取信息
public static void main(String[] args) throws IOException {
/*String detail = HttpClientDownPage.getDetail("http://www.chinadrugtrials.org.cn/eap/clinicaltrials.searchlistdetail", null);
System.out.println(detail);*/
// 解析样本获取id
for (int i = 1; i <= 510; i++) {
String s = ReadFile.readFile("E:\\pa\\" + i + ".txt");
Document parse = Jsoup.parse(s);
HashSet<String> hashSet = HttpClientDownPage.paraseList(parse);
for (String s1 : hashSet) {
String detail1 = HttpClientDownPage.getDetail("http://www.chinadrugtrials.org.cn/eap/clinicaltrials.searchlistdetail", s1);
PrintWriter pw = new PrintWriter(new FileWriter("E:\\pa\\padetails\\" + s1 + ".txt"));
pw.write(detail1);
pw.close();
}
}