Html convert to DOM Tree

HTML解析与DOM树构建
本文介绍了一种使用Microsoft.mshtml库解析HTML内容并构建DOM树的方法。通过实例演示了如何将HTML源代码转换为IHTMLDocument2对象,并递归地创建DOM树,最后将DOM树展示为树状结构。

添加引用 Microsoft.mshtml

并且在项目属性的"生成"选项卡中勾选"允许不安全代码"(因为 Parse 方法使用了 unsafe 指针)。

不过我怀疑当 HTML 内容不规范时,这种方式可能会崩溃。

OMG,是否有更好的方式来解析这些 HTML 内容呢?希望看到本文的读者能推荐一下,谢谢。

 

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using mshtml;
using System.Runtime.InteropServices;

namespace Html2DOMTree
{
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            string html = "";
            if (richTextBox1.Text != "")
            {
                html = richTextBox1.Text;   //取得html源代码
            }

            IHTMLDocument2 doc2 = Parse(html);
            IHTMLDocument3 HTMLDocument = (IHTMLDocument3)doc2;  //doc2对象转换成HTMLDocument对象
            IHTMLDOMNode rootDomNode = null;

            rootDomNode = (IHTMLDOMNode)HTMLDocument.documentElement; //获取文档根部节点,也就是HTML节点

            TreeNode root = treeView1.Nodes.Add("HTML");     ////加入跟节点
            InsertDOMNodes(rootDomNode, root); //把domnode插入到跟节点中,调用InsertDOMNodes方法

        }

         //解析Dom树
        unsafe IHTMLDocument2 Parse(string s)       //unsafe关键字表示不安全上下文
        {
            IHTMLDocument2 pDocument = new HTMLDocumentClass();
            if (pDocument != null)
            {
                IPersistStreamInit pPersist = pDocument as IPersistStreamInit;  //as运算符类似于强制转换操作;如果转换不可行,as会返回null而不是引发异常。
                pPersist.InitNew();
                pPersist = null;
                IMarkupServices ms = pDocument as IMarkupServices;
                if (ms != null)
                {
                    IMarkupContainer pMC = null;
                    IMarkupPointer pStart, pEnd;
                    ms.CreateMarkupPointer(out pStart);
                    ms.CreateMarkupPointer(out pEnd);
                    System.Text.StringBuilder sb = new System.Text.StringBuilder(s);
                    IntPtr pSource = Marshal.StringToHGlobalUni(s);
                    ms.ParseString(ref *(ushort*)pSource.ToPointer(), 0, out pMC, pStart, pEnd);
                    if (pMC != null)
                    {
                        Marshal.Release(pSource);
                        return pMC as IHTMLDocument2;
                    }
                    Marshal.Release(pSource);
                }
            }
            return null;
        }
        //插入Dom树
        public void InsertDOMNodes(IHTMLDOMNode parentnode, TreeNode tree_node)
        {
            if (parentnode.hasChildNodes())//是否有子结点
            {
                IHTMLDOMChildrenCollection allchild = (IHTMLDOMChildrenCollection)parentnode.childNodes;
                int length = allchild.length;
                for (int i = 0; i < length; i++)//对每个子结点进行处理,首先取出每个子节点的属性,然后进行递归
                {
                    IHTMLDOMNode child_node = (IHTMLDOMNode)allchild.item(i);
                    string m_snodeName = child_node.nodeName;
                    object m_onodevalue = child_node.nodeValue;
                    string m_snodetype = child_node.nodeType.ToString();
                    string m_snodevalue = "";
                    if (m_onodevalue != null)
                        m_snodevalue = m_onodevalue.ToString().Trim();
                    TreeNode tempnode = null;

                    if (child_node.nodeName.Equals("#text"))
                    {
                        if ((m_snodevalue != null) && (!m_snodevalue.Equals("")))
                        {
                            tempnode = tree_node.Nodes.Add(m_snodevalue);
                        }
                    }
                    else
                    {
                        tempnode = tree_node.Nodes.Add(child_node.nodeName);
                        InsertDOMNodes(child_node, tempnode);
                    }
                }
            }
        } 
    }

    /// <summary>
    /// Managed declaration of the COM <c>IPersistStreamInit</c> interface (IUnknown-based,
    /// imported by GUID). In this file it is only used to call <see cref="InitNew"/> on a
    /// freshly created HTML document before parsing.
    /// </summary>
    /// <remarks>
    /// NOTE(review): for a ComImport interface the member order presumably has to match the
    /// native vtable layout, so members should not be reordered or removed - confirm before editing.
    /// </remarks>
    [ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
    public interface IPersistStreamInit
    {
        /// <summary>Retrieves the class identifier of the object into <paramref name="pClassID"/>.</summary>
        void GetClassID([In, Out] ref Guid pClassID);
        /// <summary>
        /// Reports whether the object has changed since it was last saved. Because of
        /// PreserveSig the raw integer result is returned instead of being turned into an exception.
        /// </summary>
        [return: MarshalAs(UnmanagedType.I4)]
        [PreserveSig]
        int IsDirty();
        /// <summary>Initializes the object from the given stream.</summary>
        // NOTE(review): UCOMIStream is presumably the obsolete predecessor of
        // System.Runtime.InteropServices.ComTypes.IStream - confirm before modernizing.
        void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);
        /// <summary>Saves the object to the given stream; <paramref name="fClearDirty"/> is passed through as a 32-bit integer flag.</summary>
        void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
         [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);
        /// <summary>Retrieves the size of the stream needed to save the object.</summary>
        // NOTE(review): LPArray marshaling on a scalar long passed by value looks wrong;
        // the native method presumably writes through a pointer (out parameter).
        // Nothing in this file calls it - verify the signature before using it.
        void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);
        /// <summary>Initializes the object to a default state.</summary>
        void InitNew();
    }
}

转载于:https://www.cnblogs.com/25-to-life/archive/2010/09/23/1833350.html

def convert_masscan_report(xml_path, xls_path):
    """Convert a masscan XML report into an Excel workbook.

    Every ``<host>`` element is read for its ``endtime``, first ``<address>``
    and ``<port>`` children. Hosts are grouped by the first three dot-separated
    parts of their address; each group becomes one worksheet with one row per
    address. When an address occurs more than once, the additional port ids are
    appended (comma separated) to the port column of its existing row.

    Args:
        xml_path: Path of the XML report to read.
        xls_path: Path of the workbook to create.
    """
    dom_tree = xml.dom.minidom.parse(xml_path)
    hosts = dom_tree.documentElement.getElementsByTagName('host')

    # {prefix: {addr: [addr, ports, state, protocol, addrtype, reason, reason_ttl, scan_endtime]}}
    ip_info = {}
    for node in hosts:
        scan_endtime = node.getAttribute('endtime')
        scan_endtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(scan_endtime)))
        address_node = node.getElementsByTagName('address')
        addrtype = address_node[0].getAttribute('addrtype')
        addr = address_node[0].getAttribute('addr')
        parts = addr.split(".")
        ip_prefix = parts[0] + "." + parts[1] + "." + parts[2]
        hosts_in_prefix = ip_info.setdefault(ip_prefix, {})

        for port in node.getElementsByTagName('port'):
            # Read the port id BEFORE the "already known" check so the merge branch
            # appends the current port rather than a stale (or undefined) value.
            portid = port.getAttribute('portid')
            if addr in hosts_in_prefix:
                hosts_in_prefix[addr][1] = hosts_in_prefix[addr][1] + "," + portid
                continue

            protocol = port.getAttribute('protocol')
            state_element = port.getElementsByTagName('state')
            state = state_element[0].getAttribute('state')
            reason = state_element[0].getAttribute('reason')
            reason_ttl = state_element[0].getAttribute('reason_ttl')
            print('[+] | %s | %s | %s | %s | %s | %s | %s | %s |' % (
                addr, portid, state, protocol, addrtype, reason, reason_ttl, scan_endtime))
            hosts_in_prefix[addr] = [addr, portid, state, protocol, addrtype,
                                     reason, reason_ttl, scan_endtime]

    summary_header = ["addr", "port", "state", "protocol", "addrtype",
                      "reason", "reason_ttl", "scan_endtime"]
    workbook = xlsxwriter.Workbook(xls_path)
    # One shared header format instead of creating a new format for every header cell.
    bold = workbook.add_format({"bold": True})
    for sheet_name, sheet_value in ip_info.items():
        worksheet = workbook.add_worksheet(sheet_name)
        worksheet.autofilter("A1:H1")  # enable the filter on the header row
        worksheet.freeze_panes(1, 0)   # keep the header row visible while scrolling
        for idx, item in enumerate(summary_header):
            worksheet.write(0, idx, item, bold)

        row = 1  # row 0 holds the header
        for addr_info in sheet_value.values():
            for col, value in enumerate(addr_info):
                worksheet.write(row, col, value)
            row += 1
    workbook.close()
06-02
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值