springboot之爬虫抓取数据

最新推荐文章于 2023-06-09 13:39:53 发布

404 Not Bug

最新推荐文章于 2023-06-09 13:39:53 发布

阅读量2.2k

点赞数 1

文章标签： java 数据挖掘 爬虫 后端 spring boot

本文链接：https://blog.youkuaiyun.com/weixin_43933478/article/details/105092803

版权

一、加入依赖

   <!-- Apache HttpClient: supplies the HTTP client classes used by sendGet/sendPost below. -->
   <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.9</version>
    </dependency>

    <!-- HtmlUnit: declared here but not referenced by the code shown in this article. -->
    <dependency>
        <groupId>net.sourceforge.htmlunit</groupId>
        <artifactId>htmlunit</artifactId>
        <version>2.27</version>
    </dependency>

    <!-- jsoup: HTML parsing (Jsoup.parse, Document, Elements) used to extract ids from saved pages. -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>

二、建立HttpClientDownPage这个类，请求方法分为get和post两种，代码如下：

  // User-Agent header value sent with each request so the crawler identifies itself as a desktop Chrome browser.
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36";

/**
 * Downloads a page with an HTTP GET and returns its body as text.
 *
 * <p>The request carries the browser-like {@code USER_AGENT} header and uses 2000 ms
 * connect and socket timeouts. Only a 200 response produces a result; GBK is passed to
 * {@code EntityUtils.toString} as the fallback charset for the body. For any other
 * status a notice and the response body (if there is one) are printed to standard
 * output.
 *
 * @param url address of the page to fetch
 * @return the page content, or {@code null} when the status is not 200, the response
 *         has no body, or an I/O error occurs
 */
public static String sendGet(String url) {
    // Connect and socket (read) timeouts, in milliseconds.
    RequestConfig requestConfig = RequestConfig.custom()
            .setSocketTimeout(2000)
            .setConnectTimeout(2000)
            .build();

    // Build the request before opening the client so that a malformed URL cannot leak it.
    HttpGet request = new HttpGet(url);
    request.setHeader("User-Agent", USER_AGENT);
    request.setConfig(requestConfig);

    String html = null;
    // try-with-resources closes the response first and then the client, on every path.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(request)) {
        // The entity is null for body-less responses; EntityUtils.toString rejects null.
        HttpEntity httpEntity = response.getEntity();
        if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
            if (httpEntity != null) {
                html = EntityUtils.toString(httpEntity, "GBK");
            }
        } else {
            // Non-200 (e.g. 404): report it and dump whatever body the server sent.
            System.out.println("返回状态不是200");
            if (httpEntity != null) {
                System.out.println(EntityUtils.toString(httpEntity, "utf-8"));
            }
        }
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this single handler covers both.
        e.printStackTrace();
    }
    return html;
}

/**
 * Requests one page of a listing with an HTTP POST and returns the response body as text.
 *
 * <p>The form body contains {@code currentpage=<param>} and {@code pagesize=20},
 * URL-encoded as UTF-8. The request carries the browser-like {@code USER_AGENT} header
 * and uses 100000 ms connect and socket timeouts. Only a 200 response produces a
 * result; GBK is passed to {@code EntityUtils.toString} as the fallback charset for
 * the body. For any other status a notice and the response body (if there is one) are
 * printed to standard output.
 *
 * @param url   address the form is posted to
 * @param param value sent as the {@code currentpage} form field
 * @return the response content, or {@code null} when the status is not 200, the
 *         response has no body, or an I/O error occurs
 */
public static String sendPost(String url, String param) {
    // Connect and socket (read) timeouts, in milliseconds.
    RequestConfig requestConfig = RequestConfig.custom()
            .setSocketTimeout(100000)
            .setConnectTimeout(100000)
            .build();

    // Build the POST request before opening the client so a malformed URL cannot leak it.
    HttpPost httpPost = new HttpPost(url);
    httpPost.setHeader("User-Agent", USER_AGENT);
    // Apply the timeouts; without this call the configuration above is never used.
    httpPost.setConfig(requestConfig);

    // Paging parameters submitted as form fields.
    List<NameValuePair> list = new LinkedList<>();
    list.add(new BasicNameValuePair("currentpage", param));
    list.add(new BasicNameValuePair("pagesize", "20"));

    String html = null;
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        // URL-encode the form fields into the request body.
        httpPost.setEntity(new UrlEncodedFormEntity(list, "UTF-8"));
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // The entity is null for body-less responses; EntityUtils.toString rejects null.
            HttpEntity httpEntity = response.getEntity();
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                if (httpEntity != null) {
                    html = EntityUtils.toString(httpEntity, "GBK");
                }
            } else {
                // Non-200 (e.g. 404): report it and dump whatever body the server sent.
                System.out.println("返回状态不是200");
                if (httpEntity != null) {
                    System.out.println(EntityUtils.toString(httpEntity, "utf-8"));
                }
            }
        }
    } catch (IOException e) {
        // ClientProtocolException and UnsupportedEncodingException are both IOExceptions.
        e.printStackTrace();
    }
    return html;
}

三、处理返回的网页数据，我这边是将网页上分页数据中的id进行提取。

/**
 * Extracts the ids of the links found inside the {@code .search_div} elements of a page.
 *
 * <p>Anchors that carry no {@code id} attribute (or an empty one) are skipped, so the
 * returned set never contains the empty string.
 *
 * @param document parsed HTML of a listing page
 * @return the distinct, non-empty {@code id} attribute values of the matching anchors;
 *         empty when nothing matches
 */
public static HashSet<String> paraseList(Document document) {
    HashSet<String> ids = new HashSet<>();
    // Narrow to the result container(s), then to the anchors inside them.
    Elements links = document.select(".search_div").select("a");
    for (Element link : links) {
        String id = link.attr("id");
        // attr() yields "" for a missing attribute; such anchors are not result links.
        if (!id.isEmpty()) {
            ids.add(id);
        }
    }
    return ids;
}

四、最后将页面保存下来，方便快速提取信息。

 public static void main(String[] args) throws IOException {
        /*String detail = HttpClientDownPage.getDetail("http://www.chinadrugtrials.org.cn/eap/clinicaltrials.searchlistdetail", null);
        System.out.println(detail);*/


        // 解析样本获取id
        for (int i = 1; i <= 510; i++) {
            String s = ReadFile.readFile("E:\\pa\\" + i + ".txt");
            Document parse = Jsoup.parse(s);
            HashSet<String> hashSet = HttpClientDownPage.paraseList(parse);
            for (String s1 : hashSet) {
                String detail1 = HttpClientDownPage.getDetail("http://www.chinadrugtrials.org.cn/eap/clinicaltrials.searchlistdetail", s1);
                PrintWriter pw = new PrintWriter(new FileWriter("E:\\pa\\padetails\\" + s1 + ".txt"));
                pw.write(detail1);
                pw.close();
            }
        }