// 场景:现有一批大量的网页数据,已经抓取到网页的 body 内容,但其中含有很多 <span>、<p>、<img>、<br>、<strong> 等标签,需要将这些标签全部过滤掉,只保留正文信息。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Strips markup tags from a text file and prints the remaining text to standard output.
 *
 * <p>Every substring matching the regex {@code <.+?>} is removed. The file is read line by
 * line and the lines are concatenated without any separator before the tags are stripped,
 * so the output is a single line of text.
 */
public class GetContent {
    /**
     * Lazily matches a {@code <}, one or more characters, and the next {@code >}.
     * Compiled once because a {@link Pattern} is immutable and reusable.
     */
    private static final Pattern TAG = Pattern.compile("<.+?>");

    /** Text substituted for every matched tag. */
    private static final String REPLACE = "";

    /** File that is processed when no path is given on the command line. */
    private static final String DEFAULT_PATH = "G:\\test.txt";

    /**
     * Reads the input file, removes every tag, and prints the result.
     *
     * @param args optional; if a first element is present it is used as the path of the file
     *     to process, otherwise {@code G:\test.txt} is read
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
        System.out.println(stripTags(readJoined(new File(path))));
    }

    /**
     * Returns all lines of {@code file} concatenated with their line terminators dropped.
     *
     * @param file the file to read, decoded with the platform default charset
     * @return the joined text; empty if the file has no lines
     * @throws IOException if the file cannot be opened or read
     */
    private static String readJoined(File file) throws IOException {
        // StringBuilder keeps the accumulation linear in the file size.
        StringBuilder text = new StringBuilder();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the terminator, so a tag split across two lines
                // becomes contiguous here and can still be matched by TAG.
                text.append(line);
            }
        }
        return text.toString();
    }

    /**
     * Removes every match of the tag pattern from {@code text}.
     *
     * @param text the text to clean
     * @return {@code text} with all matches replaced by the empty string
     */
    private static String stripTags(String text) {
        return TAG.matcher(text).replaceAll(REPLACE);
    }
}