#region 几个比较实用比较N的正则表达式
/// <summary>
/// Reduces an HTML fragment to plain text using a sequence of regular-expression
/// passes (the expressions are attributed to 思归).
/// Each <c>&lt;img …&gt;</c> tag is first rewritten as a bracketed
/// <c>[img src=… border=… alt=…]</c> marker so the image reference survives,
/// and every remaining tag is then removed.
/// </summary>
/// <param name="strHtml">The HTML text to process. May be null or empty.</param>
/// <returns>
/// The processed text. Null or empty input is returned unchanged.
/// </returns>
public static string StripHtml(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return strHtml;
    }

    // Pass 1: normalise every <img ...> tag to "[img src=.. border=.. alt=..]".
    // Attributes are consumed one by one; alt/src/border are captured by name and
    // anything else falls into "others" and is discarded. Values may be
    // single-quoted, double-quoted or unquoted (\s / > terminate an unquoted value).
    // NOTE(review): assumes reasonably well-formed input; an <img tag that is never
    // closed by '>' could make this pattern backtrack heavily - consider a timeout.
    strHtml = Regex.Replace(strHtml, @"<img\s+(((?<alt>alt=('[^']*'|""[^""]*""|[^\s>]*))|(?<src>src=('[^']*'|""[^""]*""|[^\s>]*))|(?<border>border=('[^']*'|""[^""]*""|[^\s>]*))|(?<others>[^=<>]+=('[^']*'|""[^""]*""|[^\s>]*)))\s*)*[^>]*>", "[img ${src} ${border} ${alt}]", RegexOptions.IgnoreCase);

    // Pass 2: alternative treatment for when large images cannot be shown inline -
    // wrap the source in a lightbox link around a small icon. Every tag this
    // pattern can match has already been rewritten by pass 1, so it only has an
    // effect if pass 1 is disabled.
    string imgicon = "<img src='images/imgicon.jpg' width='16' height='12' border='0' alt='点击查看大图'>";
    strHtml = Regex.Replace(strHtml, @"<img\s+((src=(?<src>'[^']*'|""[^""]*""|[^\s>]*))\s*)*[^>]*>", @"<a href=${src} rel='lightbox'>" + imgicon + "</a>", RegexOptions.IgnoreCase);

    // Pass 3: generic image handling - keep only the src attribute. Like pass 2,
    // it finds nothing to match once pass 1 has run.
    strHtml = Regex.Replace(strHtml, @"<img\s+((src=(?<src>'[^']*'|""[^""]*""|[^\s>]*))\s*)*[^>]*>", @"<img src= ${src}>", RegexOptions.IgnoreCase);

    // Pass 4: collapse <div attr="..." ...> to a bare <div>.
    strHtml = Regex.Replace(strHtml, @"<div[^>]+>", "<div>", RegexOptions.IgnoreCase);

    // Pass 5: remove every remaining <...> tag. Only real tags are matched, so
    // plain text such as "a[1]>b" is left intact.
    strHtml = Regex.Replace(strHtml, @"<[^>]+>", "", RegexOptions.IgnoreCase);

    return strHtml;
}
#endregion
#region // 清理Word产生的垃圾代码(不是很理想,但...)
/// <summary>
/// Removes all FONT and SPAN tags, and all Class and Style attributes.
/// Designed to get rid of non-standard Microsoft Word HTML tags.
/// Also removes XML/DEL/INS and namespaced Office tags (o:, v:, w:, x:, p:),
/// the lang/size/face attributes, <c>&lt;?xml …&gt;</c> declarations, and expands
/// <c>#M_IMGn#</c> placeholders (n = 1..99) into <c>&lt;img src='images/n.jpg'&gt;</c>.
/// http://tim.mackey.ie/CleanWordHTMLUsingRegularExpressions.aspx
/// http://article.pchome.net/content-425187.html
/// </summary>
/// <param name="strHtml">The Word-generated HTML to clean. May be null or empty.</param>
/// <returns>The cleaned HTML. Null or empty input is returned unchanged.</returns>
private string CleanWord(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return strHtml;
    }

    // Start by completely removing all unwanted tags (opening and closing forms).
    strHtml = Regex.Replace(strHtml, @"<[/]?(font|span|xml|del|ins|[ovwxp]:\w+)[^>]*?>", "", RegexOptions.IgnoreCase);

    // Then strip unwanted attributes. One replacement consumes the whole tag and
    // drops a single attribute from it, so the pass is repeated until the text
    // stops changing. Each effective pass shortens the string, so this terminates.
    const string unwantedAttribute = @"<([^>]*)(?:class|lang|style|size|face|[ovwxp]:\w+)=(?:'[^']*'|""[^""]*""|[^\s>]+)([^>]*)>";
    string previous;
    do
    {
        previous = strHtml;
        strHtml = Regex.Replace(strHtml, unwantedAttribute, "<$1$2>", RegexOptions.IgnoreCase);
    }
    while (strHtml != previous);

    // [20080323] The source also replaced "%" with "%" here, which cannot change
    // the text, so that pass is dropped.
    // NOTE(review): presumably the pattern was once an entity such as "&#37;" that
    // got decoded when the code was copied - confirm before reinstating.

    // [20080323] Remove declarations such as
    // <?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" />
    strHtml = Regex.Replace(strHtml, @"<[?]xml[^>]+>", "", RegexOptions.IgnoreCase);

    // [20080323] Expand image placeholders #M_IMG1# .. #M_IMG99# (no leading
    // zeros, case-insensitive) in a single left-to-right pass.
    strHtml = Regex.Replace(strHtml, @"#M_IMG([1-9][0-9]?)#", "<img src='images/$1.jpg'>", RegexOptions.IgnoreCase);

    return strHtml;
}
#endregion