Add the dependencies:
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.7.19</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.4</version>
</dependency>
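If the project builds with Gradle instead of Maven, the equivalent declarations would be the following (a sketch assuming the same Maven Central coordinates as the XML above; adjust to your build script's conventions):

// build.gradle — same artifacts and versions as the Maven block above
implementation 'cn.hutool:hutool-all:5.7.19'
implementation 'org.jsoup:jsoup:1.15.4'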
Code:
package com.longqi.boothtml;

import cn.hutool.http.HttpUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @author LongQi
 * @projectName boot-integration
 * @description: Fetch the Baidu homepage and crawl the pages it links to
 * @date 2023/3/30 15:23
 */
public class HtmlPage {

    public static void main(String[] args) {
        try {
            // Fetch the homepage with a 10-second timeout
            String resp = HttpUtil.get("https://www.baidu.com", 10000);
            System.out.println(resp);

            // Parse the HTML and select all <a> elements
            Document doc = Jsoup.parse(resp);
            Elements elements = doc.select("a");
            System.out.println("Fetched the page");

            int count = 0;
            for (Element element : elements) {
                // Only follow absolute http/https links; relative links are skipped
                String url = element.attr("href");
                if (url.startsWith("http")) {
                    // Use the same timeout here so one slow link cannot hang the whole crawl
                    resp = HttpUtil.get(url, 10000);
                    System.out.println(resp);
                    count++;
                }
            }
            System.out.println("Crawled " + count + " pages");
        } catch (Exception e) {
            System.out.println("Crawl failed");
            e.printStackTrace();
        }
    }
}
The code above uses Hutool's HttpUtil to fetch the page, then uses Jsoup to parse the response into a Document, applies a CSS selector to pick out the a tags, loops over them to read each href attribute, and finally visits each extracted link in turn.
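As a side note, Jsoup can also perform the HTTP fetch itself, which has the advantage that it tracks the page's base URI and can resolve relative links. The sketch below shows that variant under the same assumptions as the code above; the class name JsoupOnlyPage is made up for illustration:

package com.longqi.boothtml;

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Alternative sketch: let Jsoup fetch and parse in one step
 * instead of going through HttpUtil first.
 */
public class JsoupOnlyPage {

    public static void main(String[] args) throws IOException {
        // connect(...).get() downloads and parses the page in one call
        Document doc = Jsoup.connect("https://www.baidu.com")
                .timeout(10000)
                .get();

        // "a[href]" selects only anchors that actually carry an href attribute
        for (Element link : doc.select("a[href]")) {
            // absUrl resolves relative hrefs against the page's base URI;
            // it returns an empty string when no absolute URL can be formed
            String url = link.absUrl("href");
            if (url.startsWith("http")) {
                System.out.println(url);
            }
        }
    }
}

Here Jsoup.connect(...).get() replaces the HttpUtil.get call, and absUrl("href") turns relative hrefs into absolute URLs, so fewer links are silently dropped by the startsWith("http") check.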