#region 从HTML中提取文本
/// <summary>
/// Extracts the plain text from an HTML document.
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>Plain text, exactly as produced by <see cref="ConvertHTML2TextRegex"/>.</returns>
/// <remarks>
/// Notes from the original author's timing comparison of the four converters
/// (ConvertHTML2TextMil, ConvertHTML2TextMs, ConvertHTML2TextRegex, ConvertHTML2TextRegex2),
/// each measured with a HighResolutionTimer:
/// ConvertHTML2TextRegex was the most efficient overall; ConvertHTML2TextMil was sometimes
/// faster but failed more often; ConvertHTML2TextRegex2 took too long. Because the main
/// purpose of the conversion is to feed Lucene.Net analysis, ConvertHTML2TextRegex was
/// judged the best trade-off.
/// </remarks>
public static string ConvertHTML2Text(string html)
{
    // Single switch point: callers use this method, and the converter actually used is chosen here.
    string plainText = ConvertHTML2TextRegex(html);
    return plainText;
}
/// <summary>
/// Extracts plain text from HTML using the open-source MIL HTML Parser
/// (obtained from http://www.planetsourcecode.com/).
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>
/// The concatenation of the document's text nodes that contain no line-break character.
/// If parsing or traversal throws, a warning is logged and the result of
/// <see cref="ConvertHTML2TextMs"/> is returned instead.
/// </returns>
/// <remarks>
/// MIL.HTML author information:
/// HTML PARSER FOR .NET v2.0
/// Date: 18th July 2004
/// Author: Andy Powney (andy@powney.demon.co.uk)
/// If you use the Mil.html library to parse HTML, please keep the author information above. Thank you.
/// </remarks>
public static string ConvertHTML2TextMil(string html)
{
    HtmlParser parser = new HtmlDomainTreeParser();
    try
    {
        MIL.Html.HtmlDocument document = parser.Parse(html);
        StringBuilder text = new StringBuilder();
        foreach (HtmlNode node in document.Nodes.FindAllText(true))
        {
            HtmlText textNode = (HtmlText)node;
            string nodeText = textNode.Text;

            // Any text node containing "\r" or "\n" is skipped in its entirety.
            // NOTE(review): presumably meant to drop whitespace-only formatting nodes, but it
            // also drops real multi-line text - confirm this is intended.
            if (nodeText.Contains("\r") || nodeText.Contains("\n"))
            {
                continue;
            }

            text.Append(nodeText);
        }
        return text.ToString();
    }
    catch (Exception ex)
    {
        // Log the exception itself (as ConvertHTML2TextRegex does) so parser failures can be
        // diagnosed, then fall back to the mshtml-based converter.
        Log.Warn("ConvertHtml2Text Error:" + ex.ToString());
        return ConvertHTML2TextMs(html);
    }
}
/// <summary>
/// Converts HTML to text using Microsoft's mshtml.dll: the markup is written into an
/// in-memory HTML document and the body's innerText is read back.
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>
/// The body's innerText, or an empty string when <paramref name="html"/> is null or empty,
/// when the document has no body, or when innerText is null.
/// </returns>
public static string ConvertHTML2TextMs(string html)
{
    // Nothing to render: skip creating the COM document altogether.
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    IHTMLDocument2 doc = new HTMLDocumentClass();
    doc.write(html);
    doc.close();

    // This method is used as the fallback inside ConvertHTML2TextMil's catch block, so a
    // NullReferenceException here would escape to the caller; return "" instead if the
    // document exposes no body element.
    if (doc.body == null)
    {
        return "";
    }

    // Read the property once instead of once for the null test and again for the return value.
    string innerText = doc.body.innerText;
    return innerText != null ? innerText : "";
}
/// <summary>
/// Converts HTML to plain text with a fixed, ordered sequence of regular-expression rewrites
/// (adapted from an article on www.codeproject.com).
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>
/// The extracted text, in which "\r" marks line breaks and "\t" marks table cells.
/// If any step throws (including when <paramref name="html"/> is null), a warning is logged
/// and <paramref name="html"/> is returned unchanged.
/// </returns>
/// <remarks>
/// The original author notes that this converter sometimes produces problematic results.
/// </remarks>
public static string ConvertHTML2TextRegex2(string html)
{
    try
    {
        // Browsers render source line breaks as a space and ignore indentation tabs, so flatten
        // them first. After this line, "\r" and "\t" exist only where a rule below inserts them.
        string result = html.Replace("\r", " ").Replace("\n", " ").Replace("\t", string.Empty);

        // { pattern, replacement } pairs, applied strictly in this order (later rules rely on
        // the normalisation done by earlier ones).
        string[,] rules =
        {
            // Collapse repeated spaces; browsers ignore them.
            { @"( )+", " " },

            // Remove the header: strip attributes from the tags, then drop the element.
            // The body match is lazy (.*?) so that it stops at the FIRST closing tag.
            { @"<( )*head([^>])*>", "<head>" },
            { @"(<( )*(/)( )*head( )*>)", "</head>" },
            { @"(<head>).*?(</head>)", "" },

            // Remove every script element. A greedy match here would also delete all page
            // content lying between the first <script> and the last </script>.
            { @"<( )*script([^>])*>", "<script>" },
            { @"(<( )*(/)( )*script( )*>)", "</script>" },
            { @"(<script>).*?(</script>)", "" },

            // Remove every style element (same reasoning as for scripts).
            { @"<( )*style([^>])*>", "<style>" },
            { @"(<( )*(/)( )*style( )*>)", "</style>" },
            { @"(<style>).*?(</style>)", "" },

            // A table cell becomes a tab.
            { @"<( )*td([^>])*>", "\t" },

            // <br> (including the self-closing forms <br/> and <br />) and <li> become a line break.
            { @"<( )*br( )*(/)?( )*>", "\r" },
            { @"<( )*li( )*>", "\r" },

            // <div>, <tr> and <p> become a paragraph break (two line breaks).
            { @"<( )*div([^>])*>", "\r\r" },
            { @"<( )*tr([^>])*>", "\r\r" },
            { @"<( )*p([^>])*>", "\r\r" },

            // Remove everything else enclosed in < > (links, images, comments, ...).
            { @"<[^>]*>", "" },

            // Decode the common character entities. Each rule also accepts the literal
            // character, which is what these rules matched before the entity names were restored.
            { @"&nbsp;|\u00A0", " " },
            { @"&bull;|\u2022", " * " },
            { @"&lsaquo;|\u2039", "<" },
            { @"&rsaquo;|\u203A", ">" },
            { @"&trade;|\u2122", "(tm)" },
            { @"&frasl;|\u2044", "/" },
            { @"&lt;", "<" },
            { @"&gt;", ">" },
            { @"&copy;|\u00A9", "(c)" },
            { @"&reg;|\u00AE", "(r)" },

            // Drop every other entity. More can be added above, see
            // http://hotwired.lycos.com/webmonkey/reference/special_characters/
            { @"&(.{2,6});", "" },

            // Remove spaces caught between line breaks and/or tabs.
            { @"(\r)( )+(\r)", "\r\r" },
            { @"(\t)( )+(\t)", "\t\t" },
            { @"(\t)( )+(\r)", "\t\r" },
            { @"(\r)( )+(\t)", "\r\t" },

            // Remove tabs that sit alone between two line breaks.
            { @"(\r)(\t)+(\r)", "\r\r" },

            // Several tabs following a line break become a single tab.
            { @"(\r)(\t)+", "\r\t" },

            // Cap runs at 2 line breaks and 4 tabs. One pass per rule replaces a loop that ran
            // once per character with ever-longer search strings and could still leave 3 breaks.
            { @"\r{3,}", "\r\r" },
            { @"\t{5,}", "\t\t\t\t" },
        };

        for (int i = 0; i < rules.GetLength(0); i++)
        {
            result = Regex.Replace(result, rules[i, 0], rules[i, 1], RegexOptions.IgnoreCase);
        }

        return result;
    }
    catch (Exception ex)
    {
        // Best effort: report the failure with its cause and hand the input back untouched.
        Log.Warn("ConvertHtml2TextRegex2 Error:" + ex.ToString());
        return html;
    }
}
/// <summary>
/// Converts HTML to text by applying a fixed, ordered list of regular expressions.
/// </summary>
/// <param name="html">HTML source.</param>
/// <returns>
/// The stripped text. If a replacement throws (including when <paramref name="html"/> is
/// null), a warning is logged and <paramref name="html"/> is returned unchanged.
/// </returns>
/// <remarks>
/// From jadepark's blog - thanks, jadepark!
/// http://www.cnblogs.com/jadepark/archive/2007/08/04/838907.html
/// Written: [CHINA] Zhang Liu
/// Date: 1,Jun,2006
/// Version: 1.0
/// Support: MYBASK http://www.mybask.net
/// Looking for latest version or similar implementation of this function, please visit: http://www.mybask.net
/// </remarks>
public static string ConvertHTML2TextRegex(string html)
{
    // { pattern, replacement } pairs, applied in this order with case-insensitive matching.
    string[,] rules =
    {
        { @"<%=[\w\W]*?%>", "" },            // <%= ... %> blocks
        { @"<script[\w\W]*?</script>", "" }, // script elements, including their content
        { @"<style[\w\W]*?</style>", "" },   // style elements, including their content
        { @"<[/]?[\w\W]*?>", "" },           // every remaining tag
        { @"([\r\n])[\s]+", "" },            // a line break together with the whitespace after it
        { @"&(nbsp|#160);", " " },           // non-breaking space becomes a plain space
        { @"&(iexcl|#161);", "" },
        { @"&(cent|#162);", "" },
        { @"&(pound|#163);", "" },
        { @"&(copy|#169);", "" },
        // Any other numeric character reference such as "&#8217;". The leading "&#" is
        // required so that ordinary text like "2007;" is not deleted.
        { @"&#(\d+);", "" },
        { @"-->", "" },                      // leftover comment terminators
        { @"<!--.*\n", "" },                 // leftover comment openers, up to the end of the line
    };

    string stripped = html;
    try
    {
        for (int i = 0; i < rules.GetLength(0); i++)
        {
            stripped = Regex.Replace(stripped, rules[i, 0], rules[i, 1], RegexOptions.IgnoreCase);
        }
    }
    catch (Exception ee)
    {
        Log.Warn("ConvertHtml2TextRegex Error:" + ee.ToString());
        return html;
    }

    // Strings are immutable: String.Replace returns a new string, so the result has to be
    // assigned back for the removal of "\r\n" and tab characters to take effect.
    stripped = stripped.Replace("\r\n", "");
    stripped = stripped.Replace("\t", "");
    return stripped;
}
#endregion