public static String removeHTML(String htmlString)
{
// Remove HTML tag from java String
String noHTMLString = htmlString.replaceAll("//<.*?//>", "");
// Remove Carriage return from java String
noHTMLString = noHTMLString.replaceAll("/r", "<br/>");
// Remove New line from java string and replace html break
noHTMLString = noHTMLString.replaceAll("/n", " ");
noHTMLString = noHTMLString.replaceAll("/'", "'");
noHTMLString = noHTMLString.replaceAll("/"", """);
return noHTMLString;
}
public static void main(String[] args) {
String strHTML= "<html>"+
"<head>"+
"<title>Convert HTML to Text String</title>"+
"</head>"+
"<body>"+
"This is HTML String of java's source code /"my program/""+
"</body>"+
"</html>";
String stringWithoutHTML=removeHTML(strHTML);
System.out.println(stringWithoutHTML);
}
- public static String regEx_script = "<script[^>]*?>[//s//S]*?<///script>";
- public static String regEx_style = "<style[^>]*?>[//s//S]*?<///style>";
- public static String regEx_html = "<[^>]+>";
- public static Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
- public static Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
- public static Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
- public static String getOptimizedData(String inputString) {
- if (inputString == null) {
- return inputString;
- }
- //stripping script tags whether the tag contains "/n" or "/r" or not.
- Matcher m_script = p_script.matcher(inputString);
- String htmlStr = m_script.replaceAll("");
- //stripping style tags whether the tag contains "/n" or "/r" or not.
- Matcher m_style = p_style.matcher(htmlStr);
- htmlStr = m_style.replaceAll("");
- //stripping html tags but continue to have the "/n" and "/r" in right place.
- Matcher m_html = p_html.matcher(htmlStr);
- htmlStr = m_html.replaceAll("");
- return htmlStr;
- }
本文介绍了一种使用Java从字符串中移除HTML标签的方法,并提供了优化数据处理的额外技巧。该方法首先通过正则表达式移除所有的HTML标签,然后处理换行符和特殊字符。此外,还提供了一个更全面的优化函数,可以更细致地去除脚本和样式标签。

被折叠的 条评论
为什么被折叠?



