java_webmagic_小红书_社区精选

本文介绍了一种针对小红书网站的爬虫实现方法,通过解析网页获取文章标题、作者信息、点赞数等,并使用Java封装数据,最终将信息存入MySQL数据库。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >


封装类




/**
 * Plain data holder for one scraped post: the post link, its title, the
 * author's nickname, avatar image and profile link, and the like count.
 * Every value is kept as a {@code String} and may be {@code null}.
 */
public class PC {

    /** Link to the post. */
    private String href;
    /** Title of the post. */
    private String title;
    /** Author nickname. */
    private String nickname;
    /** Author avatar image address. */
    private String img;
    /** Like count, stored as text. */
    private String likes;
    /** Link to the author's profile. */
    private String userhref;

    /** Creates an instance with every property left {@code null}. */
    public PC() {
    }

    /**
     * Creates a fully populated instance.
     *
     * @param href     link to the post
     * @param title    title of the post
     * @param nickname author nickname
     * @param img      author avatar image address
     * @param likes    like count as text
     * @param userhref link to the author's profile
     */
    public PC(String href, String title, String nickname, String img, String likes, String userhref) {
        this.href = href;
        this.title = title;
        this.nickname = nickname;
        this.img = img;
        this.likes = likes;
        this.userhref = userhref;
    }

    // ---- href ----

    public String getHref() {
        return this.href;
    }

    public void setHref(String href) {
        this.href = href;
    }

    // ---- title ----

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    // ---- nickname ----

    public String getNickname() {
        return this.nickname;
    }

    public void setNickname(String nickname) {
        this.nickname = nickname;
    }

    // ---- img ----

    public String getImg() {
        return this.img;
    }

    public void setImg(String img) {
        this.img = img;
    }

    // ---- likes ----

    public String getLikes() {
        return this.likes;
    }

    public void setLikes(String likes) {
        this.likes = likes;
    }

    // ---- userhref ----

    public String getUserhref() {
        return this.userhref;
    }

    public void setUserhref(String userhref) {
        this.userhref = userhref;
    }

    /**
     * Renders all six properties as {@code PC{name='value', ...}}.
     * A {@code null} property is rendered as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("PC{");
        text.append("href='").append(href).append('\'');
        text.append(", title='").append(title).append('\'');
        text.append(", nickname='").append(nickname).append('\'');
        text.append(", img='").append(img).append('\'');
        text.append(", like='").append(likes).append('\'');
        text.append(", userhref='").append(userhref).append('\'');
        text.append('}');
        return text.toString();
    }
}

数据库存储——dao层


/**
 * Holds the single JDBC connection used to store scraped posts.
 *
 * <p>A connection is attempted once when the class is loaded. If that attempt
 * failed, or the connection has since been closed, {@link #getConnection()}
 * tries again instead of handing out {@code null}.
 */
public class dao extends JdbcTemplate {
    private static final String URL = "jdbc:mysql://localhost:3306/test?serverTimezone=UTC";
    private static final String UNAME = "root";
    private static final String PWD = "root";


    private static Connection conn = null;

    static
    {
        try
        {
            // 1. Load the JDBC driver class.
            Class.forName("com.mysql.cj.jdbc.Driver");
            // 2. Open the database connection.
            conn = DriverManager.getConnection(URL, UNAME, PWD);
        }
        catch (ClassNotFoundException | SQLException e)
        {
            // Class loading must not fail here; getConnection() retries and
            // reports the problem to the caller if it persists.
            e.printStackTrace();
        }
    }

    /**
     * Returns the shared connection, opening it first when the handle is
     * missing or already closed.
     *
     * @return an open connection, never {@code null}
     * @throws IllegalStateException if no connection can be established; the
     *                               underlying {@link SQLException} is the cause
     */
    public static synchronized Connection getConnection()
    {
        try
        {
            if (conn == null || conn.isClosed())
            {
                conn = DriverManager.getConnection(URL, UNAME, PWD);
            }
        }
        catch (SQLException e)
        {
            throw new IllegalStateException("Cannot open a database connection to " + URL, e);
        }
        return conn;
    }
}

**main方法**

/**
 * WebMagic page processor for the Xiaohongshu explore page.
 *
 * <p>Original author's note: the page data looks encrypted and incomplete when
 * inspected in the browser, so the page is downloaded and parsed here instead.
 *
 * <p>Each post card found in the page becomes one {@link PC} and is inserted
 * into the {@code test} table.
 */
public class MyXHS implements PageProcessor {

    /** Prefix prepended to the site-relative links found in the page. */
    private static final String BASE_URL = "https://www.xiaohongshu.com";

    // Crawl settings: retry count and pause between requests (milliseconds).
    private Site site= Site.me().setRetryTimes(3).setSleepTime(100);

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Parses the downloaded page and stores every post card it contains.
     *
     * <p>Fields are read card by card, so a card that lacks a title, avatar or
     * like count yields an empty string for that field. It can neither shift
     * values onto a neighbouring post nor cause an index error.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {

        // Work on the jsoup document behind the WebMagic page.
        Document document = page.getHtml().getDocument();

        // One element per post card.
        Elements notes = document.select("div[class=note-list dual-column-layout] div[class=note-item note]");

        ArrayList<PC> posts = new ArrayList<>();
        for (Element note : notes) {
            PC post = toPost(note);
            if (post != null) {
                posts.add(post);
            }
        }

        // Insert row by row; a failed row is reported and the rest still run.
        for (PC post : posts) {
            try {
                addGoddess(post);
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Builds a {@link PC} from a single post card.
     *
     * @param note the card element
     * @return the populated object, or {@code null} when the card has no post link
     */
    private static PC toPost(Element note) {
        Element link = note.select("a[class=image-wrapper]").first();
        if (link == null) {
            // Without a link there is nothing to identify the post by.
            return null;
        }
        Element author = note.select("a[class=note-author-nickname]").first();

        PC post = new PC();
        post.setHref(BASE_URL + link.attr("href"));
        post.setTitle(firstText(note, "div[class=note-content] h3[class=note-title]"));
        post.setNickname(author == null ? "" : author.text());
        post.setUserhref(author == null ? "" : BASE_URL + author.attr("href"));
        post.setImg(firstAttr(note, "div[class=avatar-img cube-image normal-image] img", "src"));
        post.setLikes(firstText(note, "span[class=note-likes]"));
        return post;
    }

    /** Text of the first match of {@code css} inside {@code scope}, or "" when nothing matches. */
    private static String firstText(Element scope, String css) {
        Element hit = scope.select(css).first();
        return hit == null ? "" : hit.text();
    }

    /** Attribute {@code attr} of the first match of {@code css} inside {@code scope}, or "" when nothing matches. */
    private static String firstAttr(Element scope, String css, String attr) {
        Element hit = scope.select(css).first();
        return hit == null ? "" : hit.attr(attr);
    }

    /**
     * Entry point: crawls the explore page with five threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new MyXHS())
                .addUrl("https://www.xiaohongshu.com/explore") // page to crawl
                .thread(5)
                .run();                                 // start the crawl
    }

    /**
     * Inserts one post into the {@code test} table.
     *
     * <p>NOTE(review): the connection comes from {@code dao.getConnection()} and
     * appears to be shared by all callers; confirm that concurrent use from
     * several spider threads is acceptable.
     *
     * @param pc the post to store
     * @throws SQLException if no connection is available or a statement fails
     */
    public void addGoddess(PC pc) throws SQLException
    {
        // Obtain the database connection (not closed here; this method does not own it).
        Connection conn = dao.getConnection();
        if (conn == null) {
            throw new SQLException("No database connection available");
        }

        // Switch the session to utf8mb4. The original author added this to fix
        // "Incorrect string value: '\xF0\x9F\x92\xB019...'" errors.
        // The statement returns no result set, hence execute() rather than executeQuery().
        try (Statement charset = conn.createStatement()) {
            charset.execute("set names utf8mb4");
        }

        String sql = "insert into test(Title,Nickname,Likes,Userhref,Href,Img) " +
                "   values(?,?,?,?,?,?)";
        try (PreparedStatement insert = conn.prepareStatement(sql)) {
            insert.setString(1, pc.getTitle());
            insert.setString(2, pc.getNickname());
            insert.setString(3, pc.getLikes());
            insert.setString(4, pc.getUserhref());
            insert.setString(5, pc.getHref());
            insert.setString(6, pc.getImg());
            insert.execute();
        }
    }
}


效果展示


在这里插入图片描述

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值