// Ad-hoc check: run the article extractor over one saved HTML page and print
// the extracted text to stdout.
File f = new File("htmtmp/4186.htm");
Converter c = new Converter();
ArticleTextExtractor extractors = new ArticleTextExtractor();
JResult res;
FileInputStream in = new FileInputStream(f);
try {
    // The stream is converted to a String first; the extractor works on that String.
    res = extractors.extractContent(c.streamToString(in));
} finally {
    // Release the file handle even if conversion or extraction throws.
    // NOTE(review): streamToString may already close the stream itself; a
    // second close() on an already-closed stream has no effect, so this is safe.
    in.close();
}
System.out.println(res.getText());
// Download every URL listed in htmtmp/1.htm and save each page to
// "<domain><suffix>.html" in the current working directory.
// Each input line is expected to carry its URL between the first pair of
// double quotes on that line.
BufferedReader reader = new BufferedReader(new FileReader("htmtmp/1.htm"));
String line = null;
// Base names (domain + suffix) already used for an output file in this run.
Set<String> existing = new LinkedHashSet<String>();
try {
    while ((line = reader.readLine()) != null) {
        int index1 = line.indexOf("\"");
        int index2 = line.indexOf("\"", index1 + 1);
        if (index1 < 0 || index2 < 0) {
            // No quoted URL on this line (e.g. a blank line); substring() would
            // throw StringIndexOutOfBoundsException, so skip it instead.
            continue;
        }
        String url = line.substring(index1 + 1, index2);
        // NOTE(review): meaning of the boolean flag is defined by SHelper — confirm.
        String domainStr = SHelper.extractDomain(url, true);

        // The first page of a domain gets no suffix, later ones get 2, 3, ...
        // so that several URLs from one domain never overwrite each other.
        // Set.add returns false when the name is already taken.
        String counterStr = "";
        int counter = 1;
        while (!existing.add(domainStr + counterStr)) {
            counter++;
            counterStr = String.valueOf(counter);
        }

        // NOTE(review): 20000 is presumably a timeout in milliseconds — confirm
        // against HtmlFetcher.fetchAsString.
        String html = new HtmlFetcher().fetchAsString(url, 20000);
        String outFile = domainStr + counterStr + ".html";
        BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
        try {
            writer.write(html);
        } finally {
            // Flush and release the output file even if the write fails.
            writer.close();
        }
    }
} finally {
    // Release the URL list even if a fetch or write aborts the loop.
    reader.close();
}