添加jsoup的依赖:
普通类:
package com.company.bean;
/**
 * Holds the name of a Tieba forum together with the URLs of the current,
 * next, previous, first and last listing pages.
 *
 * <p>The navigation setters accept the raw {@code href} value taken from the
 * page and store it as an absolute URL on {@code http://tieba.baidu.com}.
 *
 * @author sago
 * @date 2021/03/10
 */
public class TieBaInfo {
    /** Scheme and host prepended to site-relative hrefs. */
    private static final String BASE_URL = "http://tieba.baidu.com";

    /** Forum name. */
    private String tbName;
    /** URL of the current page. */
    private String tbUrl;
    /** URL of the next page. */
    private String tbNextUrl;
    /** URL of the previous page. */
    private String tbPreUrl;
    /** URL of the first page. */
    private String firstUrl;
    /** URL of the last page. */
    private String lastUrl;

    public String getTbName() {
        return tbName;
    }

    public void setTbName(String tbName) {
        this.tbName = tbName;
    }

    public String getTbUrl() {
        return tbUrl;
    }

    /**
     * Stores the current page URL exactly as given (no prefix is added).
     *
     * @param tbUrl URL of the current page
     */
    public void setTbUrl(String tbUrl) {
        this.tbUrl = tbUrl;
    }

    public String getTbNextUrl() {
        return tbNextUrl;
    }

    /**
     * @param tbNextUrl href of the "next page" link; see {@link #toAbsoluteUrl(String)}
     *                  for how it is normalised
     */
    public void setTbNextUrl(String tbNextUrl) {
        this.tbNextUrl = toAbsoluteUrl(tbNextUrl);
    }

    public String getTbPreUrl() {
        return tbPreUrl;
    }

    /**
     * @param tbPreUrl href of the "previous page" link; see {@link #toAbsoluteUrl(String)}
     *                 for how it is normalised
     */
    public void setTbPreUrl(String tbPreUrl) {
        this.tbPreUrl = toAbsoluteUrl(tbPreUrl);
    }

    public String getFirstUrl() {
        return firstUrl;
    }

    /**
     * @param firstUrl href of the "first page" link; see {@link #toAbsoluteUrl(String)}
     *                 for how it is normalised
     */
    public void setFirstUrl(String firstUrl) {
        this.firstUrl = toAbsoluteUrl(firstUrl);
    }

    public String getLastUrl() {
        return lastUrl;
    }

    /**
     * @param lastUrl href of the "last page" link; see {@link #toAbsoluteUrl(String)}
     *                for how it is normalised
     */
    public void setLastUrl(String lastUrl) {
        this.lastUrl = toAbsoluteUrl(lastUrl);
    }

    /**
     * Converts an href into an absolute URL.
     *
     * <ul>
     *   <li>{@code null} or empty input is returned unchanged, so a missing link
     *       stays recognisable instead of turning into the bare site root
     *       (or the literal text "null" glued to it);</li>
     *   <li>hrefs that already carry {@code http://} or {@code https://} are kept as is;</li>
     *   <li>protocol-relative hrefs ({@code //host/...}) get {@code http:} prepended;</li>
     *   <li>everything else is treated as a path on {@code BASE_URL}, inserting a
     *       {@code /} when the href does not start with one.</li>
     * </ul>
     *
     * @param href raw href value, may be {@code null}
     * @return the absolute URL, or the input itself when it is {@code null} or empty
     */
    private static String toAbsoluteUrl(String href) {
        if (href == null || href.isEmpty()) {
            return href;
        }
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        if (href.startsWith("//")) {
            return "http:" + href;
        }
        return href.startsWith("/") ? BASE_URL + href : BASE_URL + "/" + href;
    }

    @Override
    public String toString() {
        return "TieBa [tbName=" + tbName + ", tbUrl=" + tbUrl + ", tbNextUrl="
                + tbNextUrl + ", tbPreUrl=" + tbPreUrl + ", firstUrl="
                + firstUrl + ", lastUrl=" + lastUrl + "]";
    }
}
package com.company.bean;
/**
 * Holds the data extracted for a single thread (post): title, author,
 * reply count and thread URL.
 *
 * @author sago
 * @date 2021/03/10
 */
public class TieZiInfo {
    /** Scheme and host prepended to site-relative hrefs. */
    private static final String BASE_URL = "http://tieba.baidu.com";

    /** Thread title. */
    private String title;
    /** Thread author. */
    private String author;
    /** Number of replies, kept as the text found on the page. */
    private String num;
    /** Absolute URL of the thread. */
    private String url;

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getNum() {
        return num;
    }

    public void setNum(String num) {
        this.num = num;
    }

    public String getUrl() {
        return url;
    }

    /**
     * Stores the thread URL as an absolute URL.
     *
     * <p>{@code null} or empty input is stored unchanged so a missing link stays
     * recognisable. An href that already starts with {@code http://} or
     * {@code https://} is kept as is, a protocol-relative href ({@code //host/...})
     * gets {@code http:} prepended, and any other value is treated as a path on
     * {@code http://tieba.baidu.com} (a {@code /} is inserted when missing).
     *
     * @param url raw href of the thread link, may be {@code null}
     */
    public void setUrl(String url) {
        this.url = toAbsoluteUrl(url);
    }

    /** Normalises an href as described on {@link #setUrl(String)}. */
    private static String toAbsoluteUrl(String href) {
        if (href == null || href.isEmpty()) {
            return href;
        }
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        if (href.startsWith("//")) {
            return "http:" + href;
        }
        return href.startsWith("/") ? BASE_URL + href : BASE_URL + "/" + href;
    }

    @Override
    public String toString() {
        return "TieZi [title=" + title + ", author=" + author + ", num=" + num
                + ", url=" + url + "]";
    }
}
工具类:
package com.company;
import com.company.bean.TieBaInfo;
import com.company.bean.TieZiInfo;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Static helpers that fetch Tieba pages with jsoup and map the parsed HTML
 * onto {@link TieBaInfo} and {@link TieZiInfo} objects.
 *
 * @author sago
 * @date 2021/03/10
 */
public class UtilMethod {
    /**
     * Downloads and parses the page at {@code url} using a 5000 ms timeout.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} when the request fails with
     *         an {@link IOException} (the failure is reported on standard error)
     */
    public static Document getDocument(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url).timeout(5000).get();
        } catch (IOException e) {
            // Best effort: report which URL failed and let the caller see null.
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
        }
        return doc;
    }

    /**
     * Reads the forum name and the pagination links from a listing page.
     *
     * @param doc parsed listing page; must not be {@code null}
     * @return a populated {@link TieBaInfo}
     */
    public static TieBaInfo getTieBaInfo(Document doc) {
        TieBaInfo tieBaInfo = new TieBaInfo();
        tieBaInfo.setFirstUrl(doc.select("a.first").attr("href"));
        tieBaInfo.setLastUrl(doc.select("a.last").attr("href"));
        tieBaInfo.setTbName(doc.select("a.card_title_fname").text());
        tieBaInfo.setTbNextUrl(doc.select("a.next").attr("href"));
        tieBaInfo.setTbPreUrl(doc.select("a.pre").attr("href"));
        tieBaInfo.setTbUrl(doc.baseUri());
        return tieBaInfo;
    }

    /**
     * Selects every thread entry ({@code li.j_thread_list}) on a listing page.
     *
     * @param doc parsed listing page; must not be {@code null}
     * @return the matching elements, possibly empty
     */
    public static Elements getElements(Document doc) {
        return doc.select("li.j_thread_list");
    }

    /**
     * Extracts reply count, URL, title and author from one thread entry.
     *
     * <p>Parts of the entry that are missing from the markup no longer cause a
     * {@link NullPointerException}: a missing reply counter yields an empty
     * string, and a missing title link leaves URL and title unset.
     *
     * @param element one {@code li.j_thread_list} element
     * @return the extracted thread data
     */
    public static TieZiInfo getTieZiInfo(Element element) {
        TieZiInfo tieZi = new TieZiInfo();

        // Older page markup used a div for the reply count; the current one uses a span.
        Element numElement = element.select("span.threadlist_rep_num").first();
        tieZi.setNum(numElement != null ? numElement.text() : "");

        Element titleElement = element.select("a.j_th_tit").first();
        if (titleElement != null) {
            tieZi.setUrl(titleElement.attr("href"));
            tieZi.setTitle(titleElement.text());
        }

        String author = element.select("span.tb_icon_author").text();
        // "----" is treated as a placeholder for the author name; in that case
        // the name is looked up on the thread page itself, which requires a
        // thread URL to be available.
        if ("----".equals(author) && titleElement != null) {
            author = getAuthor(tieZi.getUrl());
        }
        tieZi.setAuthor(author);
        return tieZi;
    }

    /**
     * Fetches a thread page and reads the {@code author} attribute of its
     * {@code div.louzhubiaoshi} element.
     *
     * @param url absolute URL of the thread page
     * @return the attribute value, or an empty string when the page could not
     *         be loaded or contains no such element
     */
    public static String getAuthor(String url) {
        Document doc = getDocument(url);
        if (doc == null) {
            // getDocument already reported the failure.
            return "";
        }
        return doc.select("div.louzhubiaoshi").attr("author");
    }
}
main:
package com.company;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
/**
 * Command-line entry point: for a search keyword {@code word} and a maximum
 * page count {@code p}, prints the data of every thread found on those listing
 * pages (reply count, title, author).
 *
 * <p>The href of each thread title is joined with the site address to form
 * the thread URL, e.g. {@code http://tieba.baidu.com/p/2949701560}.
 *
 * @author sago
 * @date 2021/03/10
 */
public class GetTiezi {
    /** Step of the {@code pn} offset between consecutive listing pages. */
    private static final int PAGE_SIZE = 50;

    /**
     * Walks the listing pages of the configured forum and prints each thread.
     * A page that cannot be downloaded is skipped instead of aborting the run.
     *
     * @param args unused
     * @throws UnsupportedEncodingException if the UTF-8 charset name is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        // Maximum number of pages to visit.
        int p = 100;
        // Search keyword (forum name).
        String word = "图拉丁";
        String tbName = URLEncoder.encode(word, "utf-8");
        int maxOffset = p * PAGE_SIZE;

        // The increment lives in the loop header so that "continue" cannot skip it.
        for (int i = 0; i < maxOffset; i += PAGE_SIZE) {
            String url = "http://tieba.baidu.com/f?ie=utf-8&kw=" + tbName + "&pn=" + i;
            Document doc = UtilMethod.getDocument(url);
            System.out.println("第" + (i / PAGE_SIZE + 1) + "页");
            System.out.println(url);
            if (doc == null) {
                // getDocument returns null when the request failed.
                System.err.println("Skipping page, download failed: " + url);
                continue;
            }
            // li.j_thread_list: one element per thread on the page.
            Elements threads = UtilMethod.getElements(doc);
            for (Element thread : threads) {
                System.out.println(UtilMethod.getTieZiInfo(thread));
            }
        }
    }
}
完整项目地址:https://gitee.com/imliuxi/tiezi.git