Java使用Jsoup爬取贴吧指定词条帖子信息

添加jsoup的依赖:
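(原文此处为依赖截图,图片已缺失。)使用 Maven 时可在 pom.xml 中加入如下依赖,版本号请按需调整:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>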
普通类:

package com.company.bean;

/**
 * Holds the paging information of one Tieba forum list page: the forum name,
 * the URL of the current page, and the first/previous/next/last page links.
 *
 * <p>The four navigation setters accept the raw {@code href} value taken from
 * the page and store it as an absolute URL. A missing link ({@code null} or
 * blank) is stored as {@code null} rather than as a bare host name.
 *
 * @author sago
 * @date 2021/03/10
 */
public class TieBaInfo {

    /** Scheme and host prepended to site-relative links. */
    private static final String BASE_URL = "http://tieba.baidu.com";

    /** Forum name. */
    private String tbName;
    /** URL of the current page. */
    private String tbUrl;
    /** URL of the next page, or {@code null} when there is none. */
    private String tbNextUrl;
    /** URL of the previous page, or {@code null} when there is none. */
    private String tbPreUrl;
    /** URL of the first page, or {@code null} when there is none. */
    private String firstUrl;
    /** URL of the last page, or {@code null} when there is none. */
    private String lastUrl;

    public String getTbName() {
        return tbName;
    }

    public void setTbName(String tbName) {
        this.tbName = tbName;
    }

    public String getTbUrl() {
        return tbUrl;
    }

    /**
     * Stores the current page URL exactly as given (no host is prepended).
     *
     * @param tbUrl URL of the current page
     */
    public void setTbUrl(String tbUrl) {
        this.tbUrl = tbUrl;
    }

    public String getTbNextUrl() {
        return tbNextUrl;
    }

    /**
     * @param tbNextUrl raw {@code href} of the "next page" link; may be
     *                  site-relative, protocol-relative, absolute, blank or {@code null}
     */
    public void setTbNextUrl(String tbNextUrl) {
        this.tbNextUrl = toAbsoluteUrl(tbNextUrl);
    }

    public String getTbPreUrl() {
        return tbPreUrl;
    }

    /**
     * @param tbPreUrl raw {@code href} of the "previous page" link; may be
     *                 site-relative, protocol-relative, absolute, blank or {@code null}
     */
    public void setTbPreUrl(String tbPreUrl) {
        this.tbPreUrl = toAbsoluteUrl(tbPreUrl);
    }

    public String getFirstUrl() {
        return firstUrl;
    }

    /**
     * @param firstUrl raw {@code href} of the "first page" link; may be
     *                 site-relative, protocol-relative, absolute, blank or {@code null}
     */
    public void setFirstUrl(String firstUrl) {
        this.firstUrl = toAbsoluteUrl(firstUrl);
    }

    public String getLastUrl() {
        return lastUrl;
    }

    /**
     * @param lastUrl raw {@code href} of the "last page" link; may be
     *                site-relative, protocol-relative, absolute, blank or {@code null}
     */
    public void setLastUrl(String lastUrl) {
        this.lastUrl = toAbsoluteUrl(lastUrl);
    }

    /**
     * Turns a raw {@code href} into an absolute URL.
     *
     * @param href link value as found in the page
     * @return {@code null} for a {@code null} or blank input; the input itself when it
     *         already starts with {@code http://} or {@code https://}; {@code "http:" + href}
     *         for a protocol-relative link ({@code //host/...}); otherwise the link
     *         resolved against {@link #BASE_URL}
     */
    private static String toAbsoluteUrl(String href) {
        if (href == null) {
            return null;
        }
        String link = href.trim();
        if (link.isEmpty()) {
            // A missing link must not masquerade as a URL to the site root.
            return null;
        }
        if (link.startsWith("http://") || link.startsWith("https://")) {
            return link;
        }
        if (link.startsWith("//")) {
            return "http:" + link;
        }
        if (link.startsWith("/")) {
            return BASE_URL + link;
        }
        return BASE_URL + "/" + link;
    }

    @Override
    public String toString() {
        return "TieBa [tbName=" + tbName + ", tbUrl=" + tbUrl + ", tbNextUrl="
                + tbNextUrl + ", tbPreUrl=" + tbPreUrl + ", firstUrl="
                + firstUrl + ", lastUrl=" + lastUrl + "]";
    }
}
package com.company.bean;

/**
 * Holds the data extracted for a single post (thread) in a forum list page:
 * title, author, reply count and the post URL.
 *
 * <p>{@link #setUrl(String)} accepts the raw {@code href} of the title link and
 * stores it as an absolute URL; a missing link ({@code null} or blank) is stored
 * as {@code null} rather than as a bare host name.
 *
 * @author sago
 * @date 2021/03/10
 */
public class TieZiInfo {

    /** Scheme and host prepended to site-relative links. */
    private static final String BASE_URL = "http://tieba.baidu.com";

    /**
     * Post title.
     */
    private String title;
    /**
     * Post author.
     */
    private String author;
    /**
     * Reply count, kept as the text shown on the page.
     */
    private String num;
    /**
     * Absolute URL of the post, or {@code null} when unknown.
     */
    private String url;

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getNum() {
        return num;
    }

    public void setNum(String num) {
        this.num = num;
    }

    public String getUrl() {
        return url;
    }

    /**
     * @param url raw {@code href} of the post link; may be site-relative
     *            (e.g. {@code /p/2949701560}), protocol-relative, absolute,
     *            blank or {@code null}
     */
    public void setUrl(String url) {
        this.url = toAbsoluteUrl(url);
    }

    /**
     * Turns a raw {@code href} into an absolute URL.
     *
     * @param href link value as found in the page
     * @return {@code null} for a {@code null} or blank input; the input itself when it
     *         already starts with {@code http://} or {@code https://}; {@code "http:" + href}
     *         for a protocol-relative link ({@code //host/...}); otherwise the link
     *         resolved against {@link #BASE_URL}
     */
    private static String toAbsoluteUrl(String href) {
        if (href == null) {
            return null;
        }
        String link = href.trim();
        if (link.isEmpty()) {
            // A missing link must not masquerade as a URL to the site root.
            return null;
        }
        if (link.startsWith("http://") || link.startsWith("https://")) {
            return link;
        }
        if (link.startsWith("//")) {
            return "http:" + link;
        }
        if (link.startsWith("/")) {
            return BASE_URL + link;
        }
        return BASE_URL + "/" + link;
    }

    @Override
    public String toString() {
        return "TieZi [title=" + title + ", author=" + author + ", num=" + num
                + ", url=" + url + "]";
    }
}

工具类:

package com.company;


import com.company.bean.TieBaInfo;
import com.company.bean.TieZiInfo;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Static helpers that fetch pages with Jsoup and extract forum and post
 * information from the parsed documents.
 *
 * @author sago
 * @date 2021/03/10
 */
public class UtilMethod {

    /** Timeout passed to Jsoup for each page request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Fetches and parses the page at {@code url} with an HTTP GET.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} when the request fails with an
     *         {@link IOException} (the failure is reported on standard error)
     */
    public static Document getDocument(String url) {
        try {
            return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
        } catch (IOException e) {
            // Keep the "null on failure" contract, but say which URL failed.
            System.err.println("Failed to fetch " + url + ": " + e);
            return null;
        }
    }

    /**
     * Reads the forum name and the paging links out of a forum list page.
     *
     * @param doc parsed forum list page; must not be {@code null}
     * @return a new {@link TieBaInfo} filled from the {@code a.first}, {@code a.last},
     *         {@code a.next}, {@code a.pre} and {@code a.card_title_fname} elements and
     *         from the document's base URI
     */
    public static TieBaInfo getTieBaInfo(Document doc) {
        TieBaInfo tieBaInfo = new TieBaInfo();
        tieBaInfo.setFirstUrl(doc.select("a.first").attr("href"));
        tieBaInfo.setLastUrl(doc.select("a.last").attr("href"));
        tieBaInfo.setTbName(doc.select("a.card_title_fname").text());
        tieBaInfo.setTbNextUrl(doc.select("a.next").attr("href"));
        tieBaInfo.setTbPreUrl(doc.select("a.pre").attr("href"));
        tieBaInfo.setTbUrl(doc.baseUri());
        return tieBaInfo;
    }

    /**
     * Selects the post entries of a forum list page.
     *
     * @param doc parsed forum list page; must not be {@code null}
     * @return all {@code li.j_thread_list} elements of the page
     */
    public static Elements getElements(Document doc) {
        return doc.select("li.j_thread_list");
    }

    /**
     * Extracts reply count, URL, title and author from one post entry.
     *
     * <p>Parts that cannot be found in the entry are skipped instead of raising a
     * {@link NullPointerException}: a missing reply-count element yields an empty
     * reply count, and a missing title link leaves URL and title unset.
     *
     * @param element one {@code li.j_thread_list} entry; must not be {@code null}
     * @return a new {@link TieZiInfo} filled from the entry
     */
    public static TieZiInfo getTieZiInfo(Element element) {
        TieZiInfo tieZi = new TieZiInfo();

        // The reply count lives in span.threadlist_rep_num (older page layouts used a div).
        Element numElement = element.select("span.threadlist_rep_num").first();
        tieZi.setNum(numElement == null ? "" : numElement.text());

        Element titleElement = element.select("a.j_th_tit").first();
        if (titleElement != null) {
            tieZi.setUrl(titleElement.attr("href"));
            tieZi.setTitle(titleElement.text());
        }

        String author = element.select("span.tb_icon_author").text();
        if ("----".equals(author) && titleElement != null) {
            // The list page shows a placeholder; look the author up on the post page.
            tieZi.setAuthor(getAuthor(tieZi.getUrl()));
        } else {
            tieZi.setAuthor(author);
        }

        return tieZi;
    }

    /**
     * Looks up the author of a post by fetching the post page.
     *
     * @param url address of the post page
     * @return the {@code author} attribute of the first {@code div.louzhubiaoshi}
     *         element that has one, or an empty string when no such element exists
     *         or the page could not be fetched
     */
    public static String getAuthor(String url) {
        Document doc = getDocument(url);
        if (doc == null) {
            return "";
        }
        return doc.select("div.louzhubiaoshi").attr("author");
    }
}

main:

package com.company;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

/**
 * Command-line entry point: for a search keyword and a maximum page count,
 * prints the information of every post found on the keyword's forum list pages
 * (reply count, title, author and URL).
 *
 * <p>List pages are requested as
 * {@code http://tieba.baidu.com/f?ie=utf-8&kw=<keyword>&pn=<offset>}, with the
 * offset advancing by {@link #PAGE_SIZE} per page. The post URL is built from
 * the title link's {@code href}, e.g. {@code http://tieba.baidu.com/p/2949701560}.
 *
 * @author sago
 * @date 2021/03/10
 */
public class GetTiezi {

    /** Amount by which the {@code pn} query parameter advances per list page. */
    private static final int PAGE_SIZE = 50;

    /**
     * Crawls the list pages one after another and prints each post.
     * A page that cannot be fetched is reported on standard error and skipped,
     * so one failed request does not abort the whole run.
     *
     * @param args unused
     * @throws UnsupportedEncodingException if the "utf-8" encoding is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        // Maximum number of list pages to visit.
        int p = 100;
        // Search keyword (the forum name).
        String word = "图拉丁";
        String tbName = URLEncoder.encode(word, "utf-8");

        int maxOffset = p * PAGE_SIZE;
        for (int offset = 0; offset < maxOffset; offset += PAGE_SIZE) {
            String url = "http://tieba.baidu.com/f?ie=utf-8&kw=" + tbName + "&pn=" + offset;

            Document doc = UtilMethod.getDocument(url);
            System.out.println("第" + (offset / PAGE_SIZE + 1) + "页");
            System.out.println(url);
            if (doc == null) {
                // getDocument returns null when the request failed.
                System.err.println("Skipping page, could not be fetched: " + url);
                continue;
            }

            // Each li.j_thread_list element is one post entry.
            Elements posts = UtilMethod.getElements(doc);
            for (Element post : posts) {
                System.out.println(UtilMethod.getTieZiInfo(post));
            }
        }
    }
}

完整项目地址:https://gitee.com/imliuxi/tiezi.git

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值