jsoup获取文档类示例

最新推荐文章于 2025-09-07 20:32:33 发布

weixin_30797027

最新推荐文章于 2025-09-07 20:32:33 发布

阅读量49

点赞数

CC 4.0 BY-SA版权

文章标签： java

原文链接：http://www.cnblogs.com/bboymonk/p/6052437.html

 1 import java.io.IOException;
 2 
 3 import org.jsoup.Jsoup;
 4 import org.jsoup.nodes.Document;
 5 import org.jsoup.nodes.Element;
 6 import org.jsoup.select.Elements;
 7 
 8 
 9 public class WyCrawler {
10     public static void main(String[] args) {
11         try {
12             Document document = Jsoup.connect("http://某网页").timeout(3000).get();
13             String selector = "li>div[class=titleBar clearfix]>h3>a";    
14             Elements elements = document.select(selector);
15             for(Element element:elements){
16 //                System.out.println(element.text());
17                 String url = element.absUrl("href");
18                 Document document2 = Jsoup.connect(url).get();
19                 Elements elements2 = document2.select("#endText");
20                 for(Element element2:elements2){
21                     System.out.println(element2.text());
22                 }
23             }
24         } catch (IOException e) {
25             e.printStackTrace();
26         }
27     }
28 }

上面的示例演示了如何跟随超链接、爬取链接页面里的内容；下面的示例比较简单，只抓取单个页面中的标题和内容。

 1 import java.io.IOException;
 2 
 3 import org.jsoup.Jsoup;
 4 import org.jsoup.nodes.Document;
 5 import org.jsoup.nodes.Element;
 6 import org.jsoup.select.Elements;
 7 
 8 
 9 public class Test {
10     public static void main(String[] args) {
11         try {
12             Document document = Jsoup.connect("http://www.某网页.com/").get();
13             //获取内容
14 //            String selector = "div[class=panel panel20 post-item post-box]>div[class=item-detail]>div[class=item-content]";
15 //            Elements elements = document.select(selector);
16 //            for(Element element:elements){
17 //                System.out.println(element.text());
18 //            }
19             
20             //获取标题
21 //            String selector2 = "div[class=panel panel20 post-item post-box]>div[class=item-detail]>h2[class=item-title]";
22 //            Elements elements = document.select(selector2);
23 //            for(Element element:elements){
24 //                System.out.println(element.text());
25 //            }
26             
27             //综合写法，标题内容一起获取
28             String selector = "div[class=panel panel20 post-item post-box]>div[class=item-detail]";
29             Elements elements = document.select(selector);
30             for(Element element:elements){
31                 Elements titles = element.select("div[class=item-title]");
32                 Elements content = element.select("h2[class=item-content]");
33                 System.out.println(titles.text()+"\n"+content.text());
34             }
35             
36             
37             
38         } catch (IOException e) {
39             e.printStackTrace();
40         }
41     }
42 }