/// <summary> /// 去除HTML标记 /// </summary> /// <param name="NoHTML">包括HTML的源码 </param> /// <returns>已经去除后的文字</returns> public static string NoHTML(string Htmlstring) { //删除脚本 Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "",RegexOptions.IgnoreCase); //删除HTML Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"([/r/n])[/s]+", "",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"-->", "",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "/"",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "/xa1",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "/xa2",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "/xa3",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "/xa9",RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&#(/d+);", "",RegexOptions.IgnoreCase); Htmlstring.Replace("<", ""); Htmlstring.Replace(">", ""); Htmlstring.Replace("/r/n", ""); Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim(); return Htmlstring; } /**/ ///提取HTML代码中文字的C#函数 /// <summary> /// 去除HTML标记 /// </summary> /// <param name="strHtml">包括HTML的源码 </param> /// <returns>已经去除后的文字</returns> using System; using System.Text.RegularExpressions; public class StripHTMLTest { public static void Main() { string s = StripHTML( "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>"); Console.WriteLine(s); } public static string StripHTML(string strHtml) { string[]aryReg = { @"<script[^><!--]*?>.*?// --></script>", @"<(///s*)?!?((/w+:)?/w+)(/w+(/s*=?/s*(([""'])(//[" "'tbnr]|[^/7])*?/7|/w+)|.{0})|/s)*?(///s*)?>", @"([/r/n])[/s]+", @ "&(quot|#34);", @"&(amp|#38);", @"&(lt|#60);", @"&(gt|#62);", @ "&(nbsp|#160);", @"&(iexcl|#161);", @"&(cent|#162);", @"&(pound|#163);", @"&(copy|#169);", @"&#(/d+);", @"-->", @"