VB.NET 获取HTML中的文字

  ''' <summary>
    ''' Strips HTML markup from a string and returns the remaining text.
    ''' </summary>
    ''' <param name="strHtml">Source text that may contain HTML markup.</param>
    ''' <returns>
    ''' The text with script blocks and tags removed and a small set of
    ''' character entities decoded. Any other numeric entity (&amp;#NNN;) is
    ''' dropped. Remaining angle brackets and CR/LF pairs are removed.
    ''' Returns an empty string when <paramref name="strHtml"/> is Nothing or empty.
    ''' </returns>
    Public Shared Function GetStrfromHTML(ByVal strHtml As String) As String

        ' Regex.Replace throws on a Nothing input; treat it as "no text".
        If String.IsNullOrEmpty(strHtml) Then
            Return String.Empty
        End If

        ' aryReg(i) is replaced by aryRep(i); the two arrays must stay aligned.
        ' The script pattern carries (?s) so that "." also matches line breaks;
        ' without it a script body spanning several lines would be left behind.
        Dim aryReg As String() = {"(?s)<script[^>]*?>.*?</script>", "<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "([\r\n])[\s]+", "&(quot|#34);", "&(amp|#38);", "&(lt|#60);", _
         "&(gt|#62);", "&(nbsp|#160);", "&(iexcl|#161);", "&(cent|#162);", "&(pound|#163);", "&(copy|#169);", _
         "&#(\d+);", "-->", "<!--.*\n"}

        ' The four symbol replacements are the characters with codes
        ' 161, 162, 163 and 169, matching the entities listed above.
        Dim aryRep As String() = {"", "", "", """", "&", "<", _
         ">", " ", "¡", "¢", "£", "©", _
         "", vbCr & vbLf, ""}

        Dim strOutput As String = strHtml
        For i As Integer = 0 To aryReg.Length - 1
            strOutput = Regex.Replace(strOutput, aryReg(i), aryRep(i), RegexOptions.IgnoreCase)
        Next

        ' String.Replace returns a new string, so the result has to be assigned.
        ' Note: this also removes angle brackets produced by decoding &lt; / &gt;.
        strOutput = strOutput.Replace("<", "")
        strOutput = strOutput.Replace(">", "")
        strOutput = strOutput.Replace(vbCr & vbLf, "")

        Return strOutput
    End Function



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值