高考小数据篇--1(爬虫)

本文介绍了一个使用Java实现的高考数据爬虫程序,通过HttpClients和Jsoup解析网页,抓取大学列表信息,并存储到数据库中。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

用Java开发爬虫程序,在便捷程度上很难赶得上主流的Python爬虫,但是Java的用户量还是挺大的,下面介绍一下Java爬虫的主要过程:

上菜

 

package com.shaoyayu.html;

import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
import com.shaoyayu.Databaseutil.DbUtil;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.FileWriter;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawler for the Sina education college list.
 *
 * <p>Each result page is downloaded with Apache HttpClient, parsed with jsoup, and every
 * {@code div.college_info} entry is inserted as one row into the {@code tb_schools} table
 * through {@code DbUtil}.
 *
 * <p>Crawled addresses:
 * <ul>
 *   <li>{@code http://kaoshi.edu.sina.com.cn/college/collegelist/view?provid=&typeid=&pro=&tab=&page=1}</li>
 *   <li>{@code http://kaoshi.edu.sina.com.cn/college/collegelist/view?provid=&typeid=&pro=&tab=contact&page=1}</li>
 * </ul>
 *
 * Created by shaoyayu on 2019/4/19.
 */
public class Collegelist {

    /** {@link #jiexi(String)} result: the page carried no usable data. */
    private static final int NO_DATA = 0;
    /** {@link #jiexi(String)} result: the page just parsed was the last one. */
    private static final int LAST_PAGE = 1;
    /** {@link #jiexi(String)} result: more pages follow the one just parsed. */
    private static final int MORE_PAGES = 2;

    /** Number of {@code ?} placeholders in the INSERT statement of {@link #inputData(List)}. */
    private static final int COLUMN_COUNT = 10;

    /**
     * Entry point: crawls the college list starting at page 228.
     *
     * @param ages command-line arguments (unused)
     */
    public static void main(String[] ages) {
        aaaa("http://kaoshi.edu.sina.com.cn/college/collegelist/view?provid=&typeid=&pro=&tab=&page=", 228);
    }

    /**
     * Crawls consecutive pages, starting at page {@code i}, until a page reports that it is the
     * last one or yields no data.
     *
     * @param url page URL without the trailing page number
     * @param i   number of the first page to fetch
     */
    public static void aaaa(String url, int i) {
        // Iterative on purpose: the recursive form added one stack frame per page and never
        // terminated on a "no data" result.
        int page = i;
        while (jiexi(getHtml(url + page)) == MORE_PAGES) {
            page++;
        }
    }

    /**
     * Downloads {@code url} and returns the response body decoded as UTF-8.
     *
     * @param url address to fetch
     * @return the body on HTTP 200; an empty string on any other status or on an I/O error
     */
    public static String getHtml(String url) {
        HttpGet get = new HttpGet(url);
        // Both the client and the response hold pooled connections and must be closed.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse httpResponse = client.execute(get)) {
            if (httpResponse.getStatusLine().getStatusCode() == 200) {
                System.err.println(url);
                System.err.println("请求成功");
                return EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Parses one list page and stores every college entry found on it.
     *
     * @param html page markup as returned by {@link #getHtml(String)}
     * @return {@code 0} if the page has no data (empty markup, no {@code div.pageNumWrap}, or
     *         {@code totalNum} is {@code "0"}), {@code 1} if this was the last page,
     *         {@code 2} if more pages follow
     */
    public static int jiexi(String html) {
        if (html == null || html.isEmpty()) {
            return NO_DATA;
        }
        Document document = Jsoup.parse(html);
        Element pageNumWrap = document.select("div.pageNumWrap").first();
        if (pageNumWrap == null) {
            return NO_DATA;
        }

        // Element.attr is used instead of attributes().get(): the HTML parser may normalise
        // attribute names to lower case, and attributes().get() is a case-sensitive lookup.
        String totalNum = pageNumWrap.attr("totalNum");
        String totalPage = pageNumWrap.attr("totalPage");
        String page = pageNumWrap.attr("page");
        if ("0".equals(totalNum)) {
            return NO_DATA;
        }

        int stored = 0;
        for (Element college : document.select("div.college_info")) {
            try {
                List<String> fields = extractFields(college);
                for (int idx = 0; idx < fields.size(); idx++) {
                    System.err.println(fields.get(idx) + ":" + idx);
                }
                inputData(fields);
                System.err.println(stored++);
            } catch (RuntimeException e) {
                // A single malformed entry must not abort the rest of the page.
                e.printStackTrace();
            }
        }

        return totalPage.equals(page) ? LAST_PAGE : MORE_PAGES;
    }

    /**
     * Collects the column values of one {@code div.college_info} entry, in the column order
     * used by {@link #inputData(List)}.
     *
     * <p>Every {@code div.gray} contributes exactly two values (its first two children); a
     * missing child or a missing value after the colon is stored as an empty string so that
     * later values do not shift into the wrong column.
     */
    private static List<String> extractFields(Element college) {
        List<String> fields = new ArrayList<>();
        Elements clearfix = college.select("div.clearfix");

        Element link = clearfix.select("[target='_blank']").first();
        fields.add(link == null ? "" : link.attr("href").trim());
        fields.add(clearfix.select("h4.college_name").text().trim());

        StringBuilder officialMicroblog = new StringBuilder();
        for (Element weibo : clearfix.select("a[target='_blank'].weibo")) {
            officialMicroblog.append(weibo.attr("href")).append(":")
                    .append(weibo.text()).append(":");
        }
        fields.add(officialMicroblog.toString());

        for (Element gray : clearfix.select("div.gray")) {
            Elements paragraphs = gray.children();
            for (int idx = 0; idx < 2; idx++) {
                fields.add(idx < paragraphs.size() ? valueAfterColon(paragraphs.get(idx).text()) : "");
            }
        }

        fields.add(clearfix.select("span.orange").text());
        return fields;
    }

    /**
     * Returns the trimmed second {@code ":"}-separated segment of {@code text}, or an empty
     * string when there is no such segment.
     */
    private static String valueAfterColon(String text) {
        String[] parts = text.trim().split(":");
        return parts.length > 1 ? parts[1].trim() : "";
    }

    /**
     * Inserts one row into {@code tb_schools}.
     *
     * <p>Values are bound positionally to the columns presentationUrl, schoolName,
     * officialMicroblog, location, keyDiscipline, category, master, subjection, doctor, rests.
     * Missing trailing values are bound as empty strings; values beyond the tenth are ignored
     * (a warning is printed). The list passed in is not modified. Failures are printed and
     * otherwise ignored so that the crawl continues.
     *
     * @param Strs column values in the order given above
     */
    private static void inputData(List<String> Strs) {
        if (Strs.size() > COLUMN_COUNT) {
            System.err.println("inputData: ignoring " + (Strs.size() - COLUMN_COUNT) + " surplus value(s)");
        }
        DbUtil dbUtil = new DbUtil();
        String sql = "INSERT INTO tb_schools(" +
                "presentationUrl,schoolName,officialMicroblog,location,keyDiscipline,category," +
                "master,subjection,doctor,rests) " +
                "values(?,?,?,?,?,?,?,?,?,?)";
        // The java.sql interfaces are spelled out because the file imports the driver classes
        // of the same simple names; using the interfaces avoids a downcast to the driver type.
        try (java.sql.Connection connection = dbUtil.getConnection();
             java.sql.PreparedStatement ps = connection.prepareStatement(sql)) {
            for (int column = 1; column <= COLUMN_COUNT; column++) {
                int idx = column - 1;
                ps.setString(column, idx < Strs.size() ? Strs.get(idx) : "");
            }
            ps.executeUpdate();
        } catch (Exception e) {
            // Broad catch kept from the original: the exceptions declared by
            // DbUtil.getConnection() are not visible from this file.
            e.printStackTrace();
        }
    }
}

这是我写的一个测试类。

最后我会再写一个用线程池配合递归来请求分页的版本……

依赖只有 HttpClient 和 jsoup,配置一个 Maven 工程就可以跑了。

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值