Java解析HTML

最新推荐文章于 2025-06-28 15:32:58 发布

z2007130205

最新推荐文章于 2025-06-28 15:32:58 发布

阅读量1.4k

点赞数

分类专栏： java

java 专栏收录该内容

62 篇文章

订阅专栏

用 Java 解析 HTML 很简单：借助 jsoup.jar 即可，它的选择器用法和 jQuery 差不多。

下面是两个例子

1.解析web页面

   
  
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * Demo: fetch a live web page with jsoup and print selected links.
 */
public class ParseWebPage {

    /**
     * Requests the hao123 home page, selects the anchors matched by
     * {@code li.feedback a}, and prints each anchor's inner HTML followed by
     * its {@code href} attribute.
     *
     * @param args unused
     * @throws Exception if fetching the page fails
     */
    public static void main(String[] args) throws Exception {
        Document page = Jsoup.connect("http://www.hao123.com").get();

        // Anchors that are descendants of <li> elements carrying class "feedback".
        Elements links = page.select("li.feedback a");

        for (int i = 0; i < links.size(); i++) {
            printLink(links.get(i));
        }
    }

    /** Prints one anchor: its inner HTML on one line, its href on the next. */
    private static void printLink(Element link) {
        System.out.println(link.html());
        System.out.println(link.attr("href"));
    }
}
   
  

2.解析本地页面

   
  
import java.io.File;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demo: parse an HTML file from the local disk with jsoup.
 */
public class ParseLocalPage {

    /**
     * Parses a fixed local HTML file as UTF-8, selects the elements matched by
     * {@code #getDPvalues}, and prints each element's inner HTML and form value
     * separated by a tab.
     *
     * @param args unused
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        Document page = Jsoup.parse(new File("E:/JavaScriptDojo/jqueryui/测试Button.html"), "utf-8");

        Elements matches = page.select("#getDPvalues");

        for (int i = 0; i < matches.size(); i++) {
            Element match = matches.get(i);
            System.out.printf("%s\t%s\n", match.html(), match.val());
        }
    }
}
   
  

很简单吧

使用jsoup解析HTML之获取html源码

上一讲我们简要的介绍了jsoup和HTML DOM结构。在准备解析html之前，首先要做的当然是获取需要解析的html源码。jsoup提供了多种获取和解析html的方式。

新建一个Java工程，命名为JsoupDemo。然后导入jsoup的jar包。

下面分别来演示几种获取Document对象的方法。

  1、从字符串中解析Dom

 
         1
       
         2
       
         3
       
         4
       
         5
       
         6
       
         7
       
         8
       
         9
       
         10
       
        /**
         * Parses a document from a String.
         *
         * Builds a small HTML page in memory, hands it to {@code Jsoup.parse},
         * and prints the title of the resulting document.
         */
        static void parseDocFromString() {
            String markup =
                    "<html><head><title>Parse a document from a String</title></head>"
                            + "<body><p>Parsed HTML into a doc.</p></body></html>";

            // Build the DOM from the in-memory string.
            Document parsed = Jsoup.parse(markup);

            String title = parsed.title();
            System.out.println(title);
        }

使用Jsoup的parse(String html)类方法，可以从字符串中获取Document对象，然后再进行详细的解析。

  2、从URL中获取Document对象

 
         1
       
         2
       
         3
       
         4
       
         5
       
         6
       
         7
       
         8
       
         9
       
         10
       
         11
       
         12
       
         13
       
         14
       
         15
       
         16
       
         17
       
         18
       
         19
       
         20
       
        //Load a Document from a URL
       
        static
         
        void
         
        loadDocByUrl
        (
        )
         
        throws
         
        IOException
        {
       
        //要请求的网址
       
        String
         
        url
         
        =
         
        "https://libecnu.lib.ecnu.edu.cn/search~S0*chx/"
        ;
       
        //请求参数
       
        Map
        <
        String
        ,
         
        String
        >
         
        params
         
        =
         
        new
         
        HashMap
        <
        String
        ,
         
        String
        >
        (
        )
        ;
       
        params
        .
        put
        (
        "searcharg"
        ,
         
        "java"
        )
        ;
       
        params
        .
        put
        (
        "searchtype"
        ,
         
        "t"
        )
        ;
       
        params
        .
        put
        (
        "SORT"
        ,
         
        "DZ"
        )
        ;
       
        params
        .
        put
        (
        "extended"
        ,
         
        "0"
        )
        ;
       
        Document 
        doc
         
        =
         
        Jsoup
        .
        connect
        (
        url
        )
       
        .
        userAgent
        (
        "Mozilla"
        )
          
        //声明了浏览器用于 HTTP 请求的用户代理头的值
       
        .
        timeout
        (
        10
        *
        1000
        )
           
        //超时时间
       
        .
        data
        (
        params
        )
               
        //请求参数
       
        .
        get
        (
        )
        ;
                     
        //使用get方法，对应还有post()
       
        System
        .
        out
        .
        println
        (
        doc
        .
        html
        (
        )
        )
        ;
          
        //打印获取的html源码
       
        }

connect(String url)方法将会得到一个Connection接口的实例（jsoup中的实现类是HttpConnection），然后调用get()方法，将会发送get请求，返回一个Document对象。类似的，我们也可以通过post()获取，主要是看我们的请求类型是get还是post。如果请求需要参数，我们可以使用Map<String,String>构造参数，然后通过data(Map<String,String> params)方法设置。得到Document对象后，我们就可以对其进行解析。

   3、从文件中获取Document对象

当我们本地有一个html文件时，我们可以使用parse(File in, String charsetName)方法从本地文件中获取Document对象。

 
         1
       
         2
       
         3
       
         4
       
         5
       
         6
       
        //Load a Document from a File
       
        static
         
        void
         
        loadDocFromFile
        (
        )
         
        throws
         
        IOException
        {
       
        File 
        inputFile
         
        =
         
        new
         
        File
        (
        "input.html"
        )
        ;
       
        Document 
        doc
         
        =
         
        Jsoup
        .
        parse
        (
        inputFile
        ,
         
        "UTF-8"
        )
        ;
       
        System
        .
        out
        .
        println
        (
        doc
        .
        html
        (
        )
        )
        ;
          
        //打印获取的html源码
       
        }

最后我们在main方法中测试三种获取Document对象的方法，发现都能正常获取到Document对象。

 
         1
       
         2
       
         3
       
         4
       
         5
       
         6
       
         7
       
         8
       
         9
       
        /**
         * Runs the three Document-loading demos in order: from a string, from a
         * URL, and from a local file.
         *
         * @param args unused
         * @throws IOException if the URL or file demo fails
         */
        public static void main(String[] args) throws IOException {
            parseDocFromString();
            loadDocByUrl();
            loadDocFromFile();
        }