在C#中使用HtmlParser.NET的示例。

最新推荐文章于 2018-08-25 16:00:46 发布

DotNet灵魂

最新推荐文章于 2018-08-25 16:00:46 发布

阅读量6.6k

点赞数 1

分类专栏： C# 文章标签： c# null textbox class input html

C# 专栏收录该内容

29 篇文章

订阅专栏

本文介绍如何使用C#解析HTML文件，并通过构建树形视图展示HTML页面结构。解析过程包括读取HTML内容、解析标签及其属性、递归构建节点。最终将解析结果以树形视图的形式展示，实现对HTML页面结构的可视化。

using System;
using System.IO;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;

/// <summary>
/// Parses the HTML text typed into <c>textBox1</c> and rebuilds <c>treeView1</c>
/// so that it shows the parsed nodes underneath a single "root" entry.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Alternative input sources: a stream can be used to load an HTML file from
    // the local disk, or a URI to load a web page from the internet, e.g.:
    //   byte[] htmlBytes = Encoding.UTF8.GetBytes(this.textBox1.Text);
    //   MemoryStream memStream = new MemoryStream(htmlBytes);
    //   InputStreamSource input = new InputStreamSource(memStream, "utf-8");
    //   Page page = new Page(input);
    //   Lexer lex = new Lexer(page);

    // Nothing to parse: leave the current tree untouched.
    if (this.textBox1.Text.Length <= 0)
    {
        return;
    }

    // Here the HTML is read directly from the text box.
    // Parsing happens before the tree is cleared, so if the parser throws,
    // the previously displayed tree stays intact.
    Lexer lexer = new Lexer(this.textBox1.Text);
    Parser parser = new Parser(lexer);
    // NOTE(review): a null filter presumably means "return every top-level node" - confirm against the HtmlParser docs.
    NodeList htmlNodes = parser.Parse(null);

    // Suspend repainting while the tree is rebuilt; without this the control
    // redraws after every single node insertion.
    this.treeView1.BeginUpdate();
    try
    {
        this.treeView1.Nodes.Clear();
        TreeNode treeRoot = this.treeView1.Nodes.Add("root");
        for (int i = 0; i < htmlNodes.Count; i++)
        {
            // Top-level nodes are enumerated by this loop, so the helper must
            // not walk their siblings again (siblingRequired = false).
            this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
        }
    }
    finally
    {
        // Always resume painting, even if building the tree fails part-way.
        this.treeView1.EndUpdate();
    }
}

/// <summary>
/// Mirrors one HTML node, and recursively its descendants, into the tree view.
/// An opening tag becomes a new tree node labelled with the tag name followed by
/// its id / class / style / href attributes when present. Any other node (or an
/// end tag) creates no entry, and its children are attached to
/// <paramref name="treeNode"/> directly.
/// </summary>
/// <param name="treeNode">Tree node that receives the entries created for <paramref name="htmlNode"/>.</param>
/// <param name="htmlNode">HTML node to convert; the call is a no-op when this or <paramref name="treeNode"/> is null.</param>
/// <param name="siblingRequired">
/// When true, every following sibling of <paramref name="htmlNode"/> is also
/// converted and attached to <paramref name="treeNode"/>.
/// </param>
private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
{
    if (treeNode == null || htmlNode == null)
    {
        return;
    }

    // Where this node's children will be attached: the parent tree node,
    // unless an entry is created for this HTML node below.
    TreeNode attachPoint = treeNode;

    ITag element = htmlNode as ITag;
    if (element != null && !element.IsEndTag())
    {
        string label = element.TagName;
        if (element.Attributes != null && element.Attributes.Count > 0)
        {
            // Attribute keys looked up on the tag, paired with the name shown in the label.
            string[] attributeKeys = { "ID", "CLASS", "STYLE", "HREF" };
            string[] displayNames = { "id", "class", "style", "href" };
            for (int k = 0; k < attributeKeys.Length; k++)
            {
                object attributeValue = element.Attributes[attributeKeys[k]];
                if (attributeValue != null)
                {
                    label += " { " + displayNames[k] + "=\"" + attributeValue.ToString() + "\" }";
                }
            }
        }

        attachPoint = new TreeNode(label);
        treeNode.Nodes.Add(attachPoint);
    }

    // Descend: start at the first child and let that call walk the remaining children.
    if (htmlNode.Children != null && htmlNode.Children.Count > 0)
    {
        this.RecursionHtmlNode(attachPoint, htmlNode.FirstChild, true);
    }

    if (!siblingRequired)
    {
        return;
    }

    // Walk the sibling chain; each sibling handles only itself and its own subtree.
    for (INode next = htmlNode.NextSibling; next != null; next = next.NextSibling)
    {
        this.RecursionHtmlNode(treeNode, next, false);
    }
}

    Screenshot of the example:

    The parser's fault tolerance is very good, as shown in the picture below (it could handle malformed HTML more intelligently, but I think this is good enough for practical use):