利用一个第三方编辑器,来编辑新闻,但是用户常常将word中的文字直接拷到编辑器,word中很多无用的代码,为了过滤掉这些代码,我找了很多资料,但是最终页没有找到,但是我找到了一个javascript版本的,无奈,只有转换成C#版得,以下是转换后的C#代码,最下边有javascript源代码,如果有不对的地方,还希望大家告诉我,多谢了
C#清除word的代码
public static string ClearWord(string sourceText, bool bIgnoreFont = true, bool bRemoveStyles = true, bool cleanWordKeepsStructure = true)
{
sourceText = Regex.Replace(sourceText, @"<o:p>\s*<\/o:p>", "");
sourceText = Regex.Replace(sourceText, @"<o:p>.*?<\/o:p>", " ");
// Remove mso-xxx styles.
sourceText = Regex.Replace(sourceText, @"\s*mso-[^:]+:[^;""]+;?", "", RegexOptions.IgnoreCase);
// Remove margin styles.
sourceText = Regex.Replace(sourceText, @"\s*MARGIN: 0cm 0cm 0pt\s*;", "", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*MARGIN: 0cm 0cm 0pt\s*""", "\"", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*TEXT-INDENT: 0cm\s*;", "", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*TEXT-INDENT: 0cm\s*""", "\"", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*TEXT-ALIGN: [^\s;]+;?""", "\"", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*PAGE-BREAK-BEFORE: [^\s;]+;?""", "\"", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*FONT-VARIANT: [^\s;]+;?""", "\"", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*tab-stops:[^;""]*;?", "", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*tab-stops:[^""]*", "", RegexOptions.IgnoreCase);
// Remove FONT face attributes.
if (bIgnoreFont)
{
sourceText = Regex.Replace(sourceText, @"\s*face=""[^""]*""", "", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*face=[^ >]*", "", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"\s*FONT-FAMILY:[^;""]*;?", "", RegexOptions.IgnoreCase);
}
// Remove Class attributes
sourceText = Regex.Replace(sourceText, @"<(\w[^>]*) class=([^ |>]*)([^>]*)", "<$1$3", RegexOptions.IgnoreCase);
// Remove styles.
if (bRemoveStyles)
sourceText = Regex.Replace(sourceText, @"<(\w[^>]*) style=""([^\""]*)""([^>]*)", "<$1$3", RegexOptions.IgnoreCase);
// Remove empty styles.
sourceText = Regex.Replace(sourceText, @"\s*style=""\s*""", "", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<SPAN\s*[^>]*>\s* \s*<\/SPAN>", " ", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<SPAN\s*[^>]*><\/SPAN>", "", RegexOptions.IgnoreCase);
// Remove Lang attributes
sourceText = Regex.Replace(sourceText, @"<(\w[^>]*) lang=([^ |>]*)([^>]*)", "<$1$3", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<SPAN\s*>(.*?)<\/SPAN>", "$1", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<FONT\s*>(.*?)<\/FONT>", "$1", RegexOptions.IgnoreCase);
// Remove XML elements and declarations
sourceText = Regex.Replace(sourceText, @"<\\?\?xml[^>]*>", "", RegexOptions.IgnoreCase);
// Remove Tags with XML namespace declarations: <o:p><\/o:p>
sourceText = Regex.Replace(sourceText, @"<\/?\w+:[^>]*>", "", RegexOptions.IgnoreCase);
// Remove comments [SF BUG-1481861].
sourceText = Regex.Replace(sourceText, @"<\!--.*?-->/", "");
sourceText = Regex.Replace(sourceText, @"<(U|I|STRIKE)> <\/\1>", " ");
sourceText = Regex.Replace(sourceText, @"<H\d>\s*<\/H\d>", "", RegexOptions.IgnoreCase);
// Remove "display:none" tags.
sourceText = Regex.Replace(sourceText, @"<(\w+)[^>]*\sstyle=""[^""]*DISPLAY\s?:\s?none(.*?)<\/\1>", "", RegexOptions.IgnoreCase);
// Remove language tags
sourceText = Regex.Replace(sourceText, @"<(\w[^>]*) language=([^ |>]*)([^>]*)", "<$1$3", RegexOptions.IgnoreCase);
// Remove onmouseover and onmouseout events (from MS Word comments effect)
sourceText = Regex.Replace(sourceText, @"<(\w[^>]*) onmouseover=""([^\""]*)""([^>]*)", "<$1$3", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<(\w[^>]*) onmouseout=""([^\""]*)""([^>]*)", "<$1$3", RegexOptions.IgnoreCase);
if (cleanWordKeepsStructure)
{
// The original <Hn> tag send from Word is something like this: <Hn style="margin-top:0px;margin-bottom:0px">
sourceText = Regex.Replace(sourceText, @"<H(\d)([^>]*)>", "<h$1>", RegexOptions.IgnoreCase);
// Word likes to insert extra <font> tags, when using MSIE. (Wierd).
sourceText = Regex.Replace(sourceText, @"<(H\d)><FONT[^>]*>(.*?)<\/FONT><\/\1>", @"<$1>$2<\/$1>", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<(H\d)><EM>(.*?)<\/EM><\/\1>", @"<$1>$2<\/$1>", RegexOptions.IgnoreCase);
}
else
{
sourceText = Regex.Replace(sourceText, @"<H1([^>]*)>", @"<div$1><b><font size=""6"">", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<H2([^>]*)>", @"<div$1><b><font size=""5"">", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<H3([^>]*)>", @"<div$1><b><font size=""4"">", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<H4([^>]*)>", @"<div$1><b><font size=""3"">", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<H5([^>]*)>", @"<div$1><b><font size=""2"">", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<H6([^>]*)>", @"<div$1><b><font size=""1"">", RegexOptions.IgnoreCase);
sourceText = Regex.Replace(sourceText, @"<\/H\d>", @"<\/font><\/b><\/div>", RegexOptions.IgnoreCase);
// Transform <P> to <DIV>
//var re = new Regex(@"(<P)([^>]*>.*?)(<\/P>)", RegexOptions.IgnoreCase); // Different because of a IE 5.0 error
sourceText = Regex.Replace(sourceText, @"(<P)([^>]*>.*?)(<\/P>)", @"<div$2<\/div>", RegexOptions.IgnoreCase);
// Remove empty tags (three times, just to be sure).
// This also removes any empty anchor
sourceText = Regex.Replace(sourceText, @"<([^\s>]+)(\s[^>]*)?>\s*<\/\1>", "");
sourceText = Regex.Replace(sourceText, @"<([^\s>]+)(\s[^>]*)?>\s*<\/\1>", "");
sourceText = Regex.Replace(sourceText, @"<([^\s>]+)(\s[^>]*)?>\s*<\/\1>", "");
}
return sourceText;
}
Javascript 清除word格式的代码:
function CleanWord( oNode, bIgnoreFont, bRemoveStyles )
{
var html = oNode.innerHTML ;
html = html.replace(/<o:p>\s*<\/o:p>/g, '') ;
html = html.replace(/<o:p>.*?<\/o:p>/g, ' ') ;
// Remove mso-xxx styles.
html = html.replace( /\s*mso-[^:]+:[^;"]+;?/gi, '' ) ;
// Remove margin styles.
html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, '' ) ;
html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ;
html = html.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, '' ) ;
html = html.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ;
html = html.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ;
html = html.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ;
html = html.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ;
html = html.replace( /\s*tab-stops:[^;"]*;?/gi, '' ) ;
html = html.replace( /\s*tab-stops:[^"]*/gi, '' ) ;
// Remove FONT face attributes.
if ( bIgnoreFont )
{
html = html.replace( /\s*face="[^"]*"/gi, '' ) ;
html = html.replace( /\s*face=[^ >]*/gi, '' ) ;
html = html.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, '' ) ;
}
// Remove Class attributes
html = html.replace(/<(\w[^>]*) class=([^ |>]*)([^>]*)/gi, "<$1$3") ;
// Remove styles.
if ( bRemoveStyles )
html = html.replace( /<(\w[^>]*) style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;
// Remove empty styles.
html = html.replace( /\s*style="\s*"/gi, '' ) ;
html = html.replace( /<SPAN\s*[^>]*>\s* \s*<\/SPAN>/gi, ' ' ) ;
html = html.replace( /<SPAN\s*[^>]*><\/SPAN>/gi, '' ) ;
// Remove Lang attributes
html = html.replace(/<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3") ;
html = html.replace( /<SPAN\s*>(.*?)<\/SPAN>/gi, '$1' ) ;
html = html.replace( /<FONT\s*>(.*?)<\/FONT>/gi, '$1' ) ;
// Remove XML elements and declarations
html = html.replace(/<\\?\?xml[^>]*>/gi, '' ) ;
// Remove Tags with XML namespace declarations: <o:p><\/o:p>
html = html.replace(/<\/?\w+:[^>]*>/gi, '' ) ;
// Remove comments [SF BUG-1481861].
html = html.replace(/<\!--.*?-->/g, '' ) ;
html = html.replace( /<(U|I|STRIKE)> <\/\1>/g, ' ' ) ;
html = html.replace( /<H\d>\s*<\/H\d>/gi, '' ) ;
// Remove "display:none" tags.
html = html.replace( /<(\w+)[^>]*\sstyle="[^"]*DISPLAY\s?:\s?none(.*?)<\/\1>/ig, '' ) ;
// Remove language tags
html = html.replace( /<(\w[^>]*) language=([^ |>]*)([^>]*)/gi, "<$1$3") ;
// Remove onmouseover and onmouseout events (from MS Word comments effect)
html = html.replace( /<(\w[^>]*) onmouseover="([^\"]*)"([^>]*)/gi, "<$1$3") ;
html = html.replace( /<(\w[^>]*) onmouseout="([^\"]*)"([^>]*)/gi, "<$1$3") ;
if ( FCKConfig.CleanWordKeepsStructure )
{
// The original <Hn> tag send from Word is something like this: <Hn style="margin-top:0px;margin-bottom:0px">
html = html.replace( /<H(\d)([^>]*)>/gi, '<h$1>' ) ;
// Word likes to insert extra <font> tags, when using MSIE. (Wierd).
html = html.replace( /<(H\d)><FONT[^>]*>(.*?)<\/FONT><\/\1>/gi, '<$1>$2<\/$1>' );
html = html.replace( /<(H\d)><EM>(.*?)<\/EM><\/\1>/gi, '<$1>$2<\/$1>' );
}
else
{
html = html.replace( /<H1([^>]*)>/gi, '<div$1><b><font size="6">' ) ;
html = html.replace( /<H2([^>]*)>/gi, '<div$1><b><font size="5">' ) ;
html = html.replace( /<H3([^>]*)>/gi, '<div$1><b><font size="4">' ) ;
html = html.replace( /<H4([^>]*)>/gi, '<div$1><b><font size="3">' ) ;
html = html.replace( /<H5([^>]*)>/gi, '<div$1><b><font size="2">' ) ;
html = html.replace( /<H6([^>]*)>/gi, '<div$1><b><font size="1">' ) ;
html = html.replace( /<\/H\d>/gi, '<\/font><\/b><\/div>' ) ;
// Transform <P> to <DIV>
var re = new RegExp( '(<P)([^>]*>.*?)(<\/P>)', 'gi' ) ; // Different because of a IE 5.0 error
html = html.replace( re, '<div$2<\/div>' ) ;
// Remove empty tags (three times, just to be sure).
// This also removes any empty anchor
html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ;
html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ;
html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ;
}
return html ;
}