利用正则表达式实现去除所有HTML标签代码

本文介绍如何在ASP.NET(C#)和经典ASP(VBScript)中使用正则表达式去除HTML标记,包括删除标签、脚本、图片等元素,以获取纯文本内容;文末附Java的等价写法。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

// Sample HTML input used by this demo; Page_Load overwrites it with the filtered result.
protected string str = "<table><tr><td>sdasasdsdd</td></tr></table><br><p>sds</p><img id='img1' src='http://www.zhixing123.cn/uploads/allimg/110330/1104201G0-0.gif' width='100' height='50' alt=''>aaassss<br><img src='http://www.zhixing123.cn/uploads/allimg/110330/1104201G0-0.gif' width='100' height='50' alt=''> 说是道 ";
/// <summary>
/// Strips every HTML tag from <c>str</c> except img, br, p and the closing /p,
/// and stores the result back into <c>str</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // Alternative patterns for other filtering needs:
    //   @"<[^>]*>"                      remove every tag
    //   @"<script[^>]*?>.*?</script\s*>" remove scripts including their body
    //                                    (needs RegexOptions.Singleline for multi-line scripts)
    //   @"<img[^>]*>"                   remove images only
    //   @"<(?!br(?=[\s/>]))[^>]*>"      remove every tag except br
    //   @"<table[^>]*?>.*?</table>"     remove tables including their content

    // The tag name in the negative lookahead must be followed by whitespace, '/' or '>'.
    // Without that delimiter check, a bare "p" alternative would also protect
    // <pre>, <param>, <picture>, ... from removal.
    // [^>]* (instead of .*?) lets the match span tags that are broken across lines.
    string regexstr = @"<(?!(?:img|br|/?p)(?=[\s/>]))[^>]*>";

    str = Regex.Replace(str, regexstr, string.Empty, RegexOptions.IgnoreCase);
}
asp中正则表达式去除HTML标记(窃自eWebEditor)
2009年12月31日 星期四 下午 12:40
' Deletes every match of the regular expression pattern `re` from `content`
' and returns the resulting string. Matching is global and case-sensitive.
function ExecReg(re, content)
    Dim matcher
    Set matcher = New RegExp
    With matcher
        .Pattern = re
        .Global = True
        ExecReg = .Replace(content, "")
    End With
end function
' Filters an HTML fragment: removes client-side scripting hooks, a fixed list of
' layout/formatting tags, class and style attributes, XML declarations,
' namespaced tags, and font/marquee/object/param/embed tags.
' Returns the filtered string. The `html` argument is also reassigned, as in the
' original implementation.
'
' Matching is case-insensitive; the text content keeps its original case.
'
' NOTE(review): tag patterns match by name prefix, so e.g. "</?b[^>]*>" also strips
' <br> and <body>, and "</?p[^>]*>" also strips <pre>. This is left unchanged because
' narrowing it would let more markup through the filter.
function DecodeFilter(html)
    Dim re, pat, tagName
    Set re = New RegExp
    re.Global = True
    re.IgnoreCase = True

    ' Client-side scripting: <script> tags, script URL schemes, common event-handler
    ' prefixes and the "&#" that starts numeric character references.
    ' These substrings are removed wherever they occur, including in plain text.
    For Each pat In Array("</?script[^>]*>", "(javascript|jscript|vbscript|vbs):", "on(mouse|exit|error|click|key)", "&#")
        re.Pattern = pat
        html = re.Replace(html, "")
    Next

    ' Layout and formatting tags (opening and closing forms).
    For Each tagName In Split("table tr th td a p img div ul li tbody h1 h2 h3 h4 h5 h6 b strong", " ")
        re.Pattern = "</?" & tagName & "[^>]*>"
        html = re.Replace(html, "")
    Next

    ' class="..." / style="..." attributes: keep the surrounding tag ($1 and $2) and
    ' drop only the attribute. Replacing with "" would delete the entire tag.
    re.Pattern = "(<[^>]+) class=(?:""[^""]*""|'[^']*'|[^ >]*)([^>]*>)"
    html = re.Replace(html, "$1$2")
    re.Pattern = "(<[^>]+) style=(?:""[^""]*""|'[^']*')([^>]*>)"
    html = re.Replace(html, "$1$2")

    ' XML declarations (<?xml ...>) and namespaced tags such as <o:p></o:p>.
    For Each pat In Array("<\?xml[^>]*>", "</?[a-z]+:[^>]*>")
        re.Pattern = pat
        html = re.Replace(html, "")
    Next

    ' Font, marquee and embedded-object tags.
    For Each tagName In Split("font marquee object param embed", " ")
        re.Pattern = "</?" & tagName & "[^>]*>"
        html = re.Replace(html, "")
    Next

    Set re = Nothing
    DecodeFilter = html
end function
 
' Returns strText with every "<...>" sequence removed.
Function RemoveHTML(strText)
    Dim tagMatcher
    Set tagMatcher = New RegExp
    ' Global so that all tags are removed, not just the first one.
    tagMatcher.Global = True
    tagMatcher.Pattern = "<[^>]*>"
    RemoveHTML = tagMatcher.Replace(strText, "")
End Function

' Replaces every HTML tag in `str` with a space, then applies two literal
' Replace() clean-up passes and returns the result. The `str` argument is also
' reassigned, as in the original implementation.
function nohtml(str)
    dim re
    Set re = New RegExp
    re.IgnoreCase = True
    re.Global = True
    ' Match one tag at a time. The previous pattern "(\<.[^\<]*\>)" ran to the last ">"
    ' before the next "<", so text containing ">" after a tag was swallowed along with
    ' the tag. This pattern also covers closing tags, so no second pass is needed.
    re.Pattern = "<[^>]*>"
    str = re.Replace(str, " ")
    ' NOTE(review): these two literals are reproduced as found. They look like they were
    ' originally "&nbsp;" or another special space that got mangled in transit - confirm.
    str = Replace(str, " ", "")
    str = Replace(str, " ", "")
    nohtml = str
    Set re = Nothing
end function
注:Java中可以写成 "html内容".replaceAll("<[^>]*>", "")

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值