''' <summary>
''' Strips HTML markup from a string and decodes a small set of common
''' character entities, returning the remaining text.
''' </summary>
''' <param name="strHtml">Source text that contains HTML markup.</param>
''' <returns>
''' The input with script blocks, tags and comments removed, the listed
''' entities decoded, and any remaining angle brackets and CR+LF pairs
''' stripped. Returns an empty string when the input is Nothing or empty.
''' </returns>
Public Shared Function GetStrfromHTML(ByVal strHtml As String) As String
    ' Regex.Replace throws on a Nothing input; treat it like empty text.
    If String.IsNullOrEmpty(strHtml) Then
        Return String.Empty
    End If

    ' Patterns are applied in order; aryRep(i) is the replacement for aryReg(i).
    '  0      script blocks; (?s) lets "." span newlines so multi-line scripts are removed
    '  1      any opening/closing tag, with optional namespace prefix and attributes
    '  2      a line break followed by whitespace
    '  3-11   named / numeric entities that are decoded to a literal character
    '         (iexcl, cent, pound, copy are code points 161, 162, 163, 169)
    '  12     any other numeric entity is dropped
    '  13-14  comment terminator and comment opener up to the end of its line
    Dim aryReg As String() = {"(?s)<script[^>]*?>.*?</script>", _
        "<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", _
        "([\r\n])[\s]+", "&(quot|#34);", "&(amp|#38);", "&(lt|#60);", _
        "&(gt|#62);", "&(nbsp|#160);", "&(iexcl|#161);", "&(cent|#162);", "&(pound|#163);", "&(copy|#169);", _
        "&#(\d+);", "-->", "<!--.*\n"}

    Dim aryRep As String() = {"", _
        "", _
        "", """", "&", "<", _
        ">", " ", "¡", "¢", "£", "©", _
        "", vbCr & vbLf, ""}

    Dim strOutput As String = strHtml
    For i As Integer = 0 To aryReg.Length - 1
        Dim regex As New Regex(aryReg(i), RegexOptions.IgnoreCase)
        strOutput = regex.Replace(strOutput, aryRep(i))
    Next

    ' String.Replace returns a new string (strings are immutable), so the
    ' result must be assigned back; otherwise these calls have no effect.
    strOutput = strOutput.Replace("<", "")
    strOutput = strOutput.Replace(">", "")
    strOutput = strOutput.Replace(vbCr & vbLf, "")

    Return strOutput
End Function
VB.NET 获取HTML中的文字
最新推荐文章于 2022-03-01 13:56:28 发布