/*
 * Prerequisite: add the jsoup dependency to the project's pom.xml first:
 *
 * <dependency>
 *     <groupId>org.jsoup</groupId>
 *     <artifactId>jsoup</artifactId>
 *     <version>1.12.1</version>
 * </dependency>
 *
 * The utility class below is the crawler implementation.
 */
package com.tzle1.countryside.util;

import com.tzle1.countryside.entity.VO.CrawlVO;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Crawler utility that collects articles from a news list page using jsoup.
 *
 * <p>The CSS selectors were written for the list and detail pages of
 * http://zw.china.com.cn/ (per the original author's note).
 */
@Component
public class Python {

    private static final Logger LOG = Logger.getLogger(Python.class.getName());

    /** Selects the anchor of every article on the list page. */
    private static final String ARTICLE_LINK_SELECTOR = "div.lp > ul > li > a";

    /** Selects the body paragraphs on an article detail page. */
    private static final String PARAGRAPH_SELECTOR = "div.big_img > div.center_photo > p";

    /** Appended after every paragraph when the content string is assembled. */
    private static final String PARAGRAPH_SEPARATOR = "<br/>";

    /**
     * Fetches the list page at {@code url}, follows every article link found on it and
     * collects the title and body text of each article.
     *
     * <p>Failures are logged rather than thrown: if the list page cannot be fetched an empty
     * map is returned, and if a single detail page cannot be fetched only that article is
     * skipped. Links whose {@code href} cannot be resolved to an absolute URL are skipped too.
     *
     * @param url address of the news list page
     * @return a map from a sequential id (starting at 0, assigned in document order to the
     *     articles that were fetched successfully) to the corresponding {@link CrawlVO};
     *     never {@code null}. The raw {@code Map} return type is kept for compatibility with
     *     existing callers; the entries are {@code Integer -> CrawlVO}.
     */
    @SuppressWarnings("rawtypes")
    public Map jsoupList(String url) {
        Map<Integer, CrawlVO> articles = new HashMap<>();

        Elements links;
        try {
            Document listPage = Jsoup.connect(url).get();
            links = listPage.select(ARTICLE_LINK_SELECTOR);
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Failed to fetch list page: " + url, e);
            return articles;
        }

        int id = 0;
        for (Element link : links) {
            // Resolve against the list page's base URI so relative hrefs work as well.
            String detailUrl = link.absUrl("href");
            if (detailUrl.isEmpty()) {
                // Missing href or something that is not a fetchable URL.
                continue;
            }
            try {
                String content = fetchContent(detailUrl);
                // ownText() excludes the text of nested child elements.
                String title = link.ownText();

                CrawlVO crawl = new CrawlVO();
                crawl.setCrawlId(id)
                        .setTitle(title)
                        .setContent(content);
                articles.put(id, crawl);
                // Only advance on success so the ids stay contiguous.
                id++;
            } catch (IOException e) {
                // One broken article must not discard the remaining ones.
                LOG.log(Level.WARNING, "Failed to fetch article page: " + detailUrl, e);
            }
        }
        return articles;
    }

    /** Downloads a detail page and joins its paragraphs, each followed by {@code <br/>}. */
    private String fetchContent(String detailUrl) throws IOException {
        Document detailPage = Jsoup.connect(detailUrl).get();
        List<String> paragraphs = detailPage.select(PARAGRAPH_SELECTOR).eachText();

        StringBuilder content = new StringBuilder();
        for (String paragraph : paragraphs) {
            content.append(paragraph).append(PARAGRAPH_SEPARATOR);
        }
        return content.toString();
    }
}
Java 爬取网上公开的文章
最新推荐文章于 2024-12-21 00:06:53 发布