Java去除HTML tags_Java去除HTML标签-CSDN博客

本文介绍了一种使用Java从字符串中移除HTML标签的方法，并提供了优化数据处理的额外技巧。该方法首先通过正则表达式移除所有的HTML标签，然后处理换行符和特殊字符。此外，还提供了一个更全面的优化函数，可以更细致地去除脚本和样式标签。

     public static String removeHTML(String htmlString)
     {
           // Remove HTML tag from java String
         String noHTMLString = htmlString.replaceAll("//<.*?//>", "");

         // Remove Carriage return from java String
         noHTMLString = noHTMLString.replaceAll("/r", "<br/>");

         // Remove New line from java string and replace html break
         noHTMLString = noHTMLString.replaceAll("/n", " ");
         noHTMLString = noHTMLString.replaceAll("/'", "'");
         noHTMLString = noHTMLString.replaceAll("/"", """);
         return noHTMLString;
     }

     public static void main(String[] args) {

     String strHTML= "<html>"+
                     "<head>"+
                     "<title>Convert HTML to Text String</title>"+
                     "</head>"+

                     "<body>"+
                     "This is HTML String of java's source code /"my program/""+
                     "</body>"+
                     "</html>";

         String stringWithoutHTML=removeHTML(strHTML);

         System.out.println(stringWithoutHTML);
     }

/**
 * Matches a whole {@code <script ...> ... </script>} element including its content.
 * {@code [\s\S]} is used instead of {@code .} so the content may contain line terminators.
 */
public static String regEx_script = "<script[^>]*?>[\\s\\S]*?</script>";
/**
 * Matches a whole {@code <style ...> ... </style>} element including its content.
 * {@code [\s\S]} is used instead of {@code .} so the content may contain line terminators.
 */
public static String regEx_style = "<style[^>]*?>[\\s\\S]*?</style>";
/** Matches any single tag: {@code <}, one or more non-{@code >} characters, then {@code >}. */
public static String regEx_html = "<[^>]+>";
// Patterns are compiled once and shared; matching ignores letter case so that
// <SCRIPT> and <Style> are recognised as well.
public static Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
public static Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
public static Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
/**
 * Removes markup from the given text by applying the shared patterns in a fixed
 * order: {@code p_script}, then {@code p_style}, then {@code p_html}. Every match is
 * replaced with the empty string; text outside the matches (including any line
 * feeds and carriage returns) is left where it was.
 *
 * @param inputString the text to clean; may be {@code null}
 * @return the text with all pattern matches removed, or {@code null} if
 *         {@code inputString} is {@code null}
 */
public static String getOptimizedData(String inputString) {
    if (inputString == null) {
        return null;
    }
    // Script pattern first, then style, and only then the generic tag pattern.
    String withoutScripts = p_script.matcher(inputString).replaceAll("");
    String withoutStyles = p_style.matcher(withoutScripts).replaceAll("");
    return p_html.matcher(withoutStyles).replaceAll("");
}