从HTML中提取文本的经典函数群 - CSDN博客

本文介绍了几种从HTML中提取纯文本的方法，包括使用正则表达式、MIL HTML解析器及微软mshtml.dll等技术，并对比了它们的性能。

        #region 从HTML中提取文本
        /// <summary>
        /// Extracts plain text from HTML.
        /// </summary>
        /// <param name="html">HTML source.</param>
        /// <returns>Plain text.</returns>
        public static string ConvertHTML2Text(string html)
        {
            // Delegates to the regex-based converter; see the note in the region below
            // for why that implementation was chosen.
            return ConvertHTML2TextRegex(html);

            #region 以下是一组测试代码，从中可以看出，使用CovertHTML2TextRegex的效率最高
            /* (Region title: "Below is a set of test code; it shows that
             * ConvertHTML2TextRegex is the most efficient.")
             * ConvertHTML2TextMil is in fact sometimes faster than ConvertHTML2TextRegex,
             * but its error rate is comparatively high, and ConvertHTML2TextRegex2 takes
             * too long. The main purpose of converting HTML to text is to make analysis
             * by Lucene.Net easier, so on balance ConvertHTML2TextRegex is the better
             * trade-off :-)
             */
            //string text = "";
            //HighResolutionTimer timer1 = new HighResolutionTimer();
            //timer1.Start();
            //text = ConvertHTML2TextMil(html);
            //timer1.Stop();
            //Console.WriteLine("Mil timer1:" + timer1.ElapsedTime);

            //HighResolutionTimer timer2 = new HighResolutionTimer();
            //timer2.Start();
            //text = ConvertHTML2TextMs(html);

            //timer2.Stop();
            //Console.WriteLine("Ms timer2:" + timer2.ElapsedTime);

            //HighResolutionTimer timer3 = new HighResolutionTimer();
            //timer3.Start();
            //text = ConvertHTML2TextRegex(html);
            //timer3.Stop();
            //Console.WriteLine("Regex timer3:" + timer3.ElapsedTime);

            //HighResolutionTimer timer4 = new HighResolutionTimer();
            //timer4.Start();
            //text = ConvertHTML2TextRegex2(html);
            //timer4.Stop();
            //Console.WriteLine("Regex2 timer4:" + timer4.ElapsedTime);

            //return text;
            #endregion
        }

        /// <summary>
        /// Extracts plain text from HTML using the open-source MIL HTML Parser
        /// (from http://www.planetsourcecode.com/).
        /// </summary>
        /// <param name="html">HTML source.</param>
        /// <returns>
        /// The concatenation of the document's text nodes. If parsing or traversal
        /// throws, a warning is logged and the result of
        /// <see cref="ConvertHTML2TextMs"/> for the same input is returned instead.
        /// </returns>
        /// <remarks>
        /// MIL.HTML author information:
        /// HTML PARSER FOR .NET v2.0
        /// Date: 18th July 2004
        /// Author: Andy Powney (andy@powney.demon.co.uk)
        /// If you use the MIL.HTML library to parse HTML, please keep the author
        /// information above. Thank you.
        /// </remarks>
        public static string ConvertHTML2TextMil(string html)
        {
            HtmlParser parser = new HtmlDomainTreeParser();
            try
            {
                MIL.Html.HtmlDocument document = parser.Parse(html);
                StringBuilder text = new StringBuilder();

                // NOTE(review): the meaning of the 'true' argument is defined by the
                // MIL library (presumably "search recursively") - confirm against its docs.
                foreach (HtmlNode node in document.Nodes.FindAllText(true))
                {
                    HtmlText textNode = (HtmlText)node;

                    // Text nodes that contain a CR or LF are skipped entirely.
                    if (textNode.Text.Contains("\r") || textNode.Text.Contains("\n"))
                    {
                        continue;
                    }

                    text.Append(textNode.Text);
                }

                return text.ToString();
            }
            catch (Exception ex)
            {
                // Include the exception details in the log (as ConvertHTML2TextRegex does)
                // so parser failures can be diagnosed, then fall back to the mshtml converter.
                Log.Warn("ConvertHtml2Text Error:" + ex.ToString());
                return ConvertHTML2TextMs(html);
            }
        }

        /// <summary>
        /// Converts HTML to text using Microsoft's mshtml.dll.
        /// </summary>
        /// <param name="html">HTML source.</param>
        /// <returns>
        /// The <c>innerText</c> of the document body, or an empty string when
        /// <c>innerText</c> is null.
        /// </returns>
        public static string ConvertHTML2TextMs(string html)
        {
            // Load the markup into an in-memory mshtml document.
            HTMLDocumentClass htmlDocument = new HTMLDocumentClass();
            IHTMLDocument2 document = htmlDocument;
            document.write(html);
            document.close();

            // Read the body text once and normalize null to an empty string.
            string bodyText = document.body.innerText;
            return bodyText == null ? "" : bodyText;
        }

        /// <summary>
        /// Converts HTML to text with a fixed sequence of regular-expression
        /// replacements (adapted from www.codeproject.com). The result is a best-effort
        /// approximation and may be imperfect for unusual markup.
        /// </summary>
        /// <param name="html">HTML source.</param>
        /// <returns>
        /// Plain text in which line breaks are "\r" and table cells are separated by
        /// tabs. The input is returned unchanged when it is null or empty, or when a
        /// replacement step throws (a warning is logged in the latter case).
        /// </returns>
        public static string ConvertHTML2TextRegex2(string html)
        {
            // Nothing to strip; this also avoids relying on a caught
            // NullReferenceException for null input.
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            try
            {
                RegexOptions options = RegexOptions.IgnoreCase;

                // Remove source formatting: browsers render line breaks as a space and
                // ignore indentation tabs and repeated spaces.
                string result = html.Replace("\r", " ");
                result = result.Replace("\n", " ");
                result = result.Replace("\t", string.Empty);
                result = Regex.Replace(result, @"( )+", " ");

                // Remove the header. The opening/closing tags are first normalized
                // (attributes and inner spaces dropped) so the pair can be matched.
                // The match is lazy so that only the span up to the FIRST closing tag
                // is removed; '.' matches everything here because all line breaks
                // were replaced with spaces above.
                result = Regex.Replace(result, @"<( )*head([^>])*>", "<head>", options);
                result = Regex.Replace(result, @"(<( )*(/)( )*head( )*>)", "</head>", options);
                result = Regex.Replace(result, "(<head>).*?(</head>)", string.Empty, options);

                // Remove all scripts (same normalize-then-remove approach). A greedy
                // match here would delete all page content between the first and the
                // last script block.
                result = Regex.Replace(result, @"<( )*script([^>])*>", "<script>", options);
                result = Regex.Replace(result, @"(<( )*(/)( )*script( )*>)", "</script>", options);
                result = Regex.Replace(result, @"(<script>).*?(</script>)", string.Empty, options);

                // Remove all styles.
                result = Regex.Replace(result, @"<( )*style([^>])*>", "<style>", options);
                result = Regex.Replace(result, @"(<( )*(/)( )*style( )*>)", "</style>", options);
                result = Regex.Replace(result, "(<style>).*?(</style>)", string.Empty, options);

                // Insert a tab in place of each <td> tag.
                result = Regex.Replace(result, @"<( )*td([^>])*>", "\t", options);

                // Insert a line break in place of <br> and <li> tags. The word boundary
                // plus optional attribute part also covers <br/>, <br /> and tags with
                // attributes, without matching longer tag names such as <link>.
                result = Regex.Replace(result, @"<( )*br\b([^>])*>", "\r", options);
                result = Regex.Replace(result, @"<( )*li\b([^>])*>", "\r", options);

                // Insert a paragraph break (two line breaks) in place of
                // <div>, <tr> and <p> tags.
                result = Regex.Replace(result, @"<( )*div([^>])*>", "\r\r", options);
                result = Regex.Replace(result, @"<( )*tr([^>])*>", "\r\r", options);
                result = Regex.Replace(result, @"<( )*p([^>])*>", "\r\r", options);

                // Remove every remaining tag (links, images, comments, ...):
                // anything enclosed in < >.
                result = Regex.Replace(result, @"<[^>]*>", string.Empty, options);

                // Replace well-known character entities with plain-text equivalents.
                result = Regex.Replace(result, @"&nbsp;", " ", options);
                result = Regex.Replace(result, @"&bull;", " * ", options);
                result = Regex.Replace(result, @"&lsaquo;", "<", options);
                result = Regex.Replace(result, @"&rsaquo;", ">", options);
                result = Regex.Replace(result, @"&trade;", "(tm)", options);
                result = Regex.Replace(result, @"&frasl;", "/", options);
                result = Regex.Replace(result, @"&lt;", "<", options);
                result = Regex.Replace(result, @"&gt;", ">", options);
                result = Regex.Replace(result, @"&copy;", "(c)", options);
                result = Regex.Replace(result, @"&reg;", "(r)", options);

                // Drop any other entity-like sequence ("&", 2-6 characters, ";").
                result = Regex.Replace(result, @"&(.{2,6});", string.Empty, options);

                // Make line breaking consistent.
                result = result.Replace("\n", "\r");

                // Remove spaces sitting between line breaks and/or tabs so that the
                // runs below become contiguous.
                result = Regex.Replace(result, "(\r)( )+(\r)", "\r\r", options);
                result = Regex.Replace(result, "(\t)( )+(\t)", "\t\t", options);
                result = Regex.Replace(result, "(\t)( )+(\r)", "\t\r", options);
                result = Regex.Replace(result, "(\r)( )+(\t)", "\r\t", options);

                // Remove tabs that sit between two line breaks.
                result = Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r", options);

                // Replace multiple tabs following a line break with a single tab.
                result = Regex.Replace(result, "(\r)(\t)+", "\r\t", options);

                // Collapse runs of more than 2 line breaks to 2 and runs of more than
                // 4 tabs to 4, in a single linear pass each.
                result = Regex.Replace(result, "\r{3,}", "\r\r");
                result = Regex.Replace(result, "\t{5,}", "\t\t\t\t");

                return result;
            }
            catch (Exception ex)
            {
                // Best effort: keep the failure details in the log and hand the
                // original markup back to the caller.
                Log.Warn("ConvertHtml2TextRegex2 Error:" + ex.ToString());
                return html;
            }
        }

/// <summary>
        /// 将Html转换成文本，使用正则表达式
        /// </summary>
        /// <param name="html"></param>
        /// <returns></returns>
        /// 来自jadepark的Blog，感谢jadepark!
        /// http://www.cnblogs.com/jadepark/archive/2007/08/04/838907.html
        /// Written:     [CHINA] Zhang Liu
        /// Date:        1,Jun,2006
        /// Version:     1.0
        /// Support:     MYBASK http://www.mybask.net
        /// Looking for latest version or similar implementation of this function, please visit: http://www.mybask.net
        public static string ConvertHTML2TextRegex(string html)
        {
            //HighResolutionTimer timer2 = new HighResolutionTimer();
            //timer2.Start();
            //All the regular expression for matching html, javascript, style elements and others.
            string[] aryRegex ={@"<%=[\w\W]*?%>",    @"<script[\w\W]*?</script>",     @"<style[\w\W]*?</style>",   @"<[/]?[\w\W]*?>",   @"([\r\n])[\s]+",
                              @"&(nbsp|#160);",    @"&(iexcl|#161);",               @"&(cent|#162);",            @"&(pound|#163);",   @"&(copy|#169);",
                              @"(\d+);",         @"-->",                          @"<!--.*\n"};
            //Corresponding replacment to the regular expressions.
            //string[] aryReplacment = { "", "", "", "", "", " ", "\xa1", "\xa2", "\xa3", "\xa9", "", "\r\n", "" };
            string[] aryReplacment = { "", "", "", "", "", " ", "", "", "", "", "", "", "" };
            string strStripped = html;
            //Loop to replacing.
            try
            {
                for (int i = 0; i < aryRegex.Length; i++)
                {
                    Regex regex = new Regex(aryRegex[i], RegexOptions.IgnoreCase);
                    strStripped = regex.Replace(strStripped, aryReplacment[i]);
                }
            }
            catch(Exception ee)
            {
                Log.Warn("ConvertHtml2TextRegex Error:" + ee.ToString());
                return html;
            }
            //Replace "\r\n" to an empty character.
            strStripped.Replace("\r\n", "");
            strStripped.Replace("\t", "");
            //Return stripped string.
            //timer2.Stop();
            //Console.WriteLine("timer3:" + timer2.ElapsedTime);
            return strStripped;