// 使用输入输出流批量处理文档(截取新闻的标题和正文)
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;

/**
 * Batch-extracts the title and the body text from numbered HTML pages.
 *
 * <p>For every index from {@code 1} to {@code 50} the program reads
 * {@code <index>.html} from the input directory, cuts out the title and the
 * body, strips a fixed list of markup fragments from the body, prints both to
 * standard output and appends them to {@code txt\<index>copy.txt}.
 *
 * <p>Input is decoded as UTF-8 and output is encoded as UTF-8.
 */
public class Xi {

    /** Directory that holds the numbered input pages. */
    private static final String INPUT_DIR = "D:\\desktop\\0703作业\\";

    /** Directory that receives the extracted text files. */
    private static final String OUTPUT_DIR = "D:\\desktop\\0703作业\\txt\\";

    private static final int FIRST_INDEX = 1;
    private static final int LAST_INDEX = 50;

    /** Blank lines written after the title and again after the body. */
    private static final String SECTION_BREAK = "\n\n\n\n";

    private static final String TITLE_OPEN = "<title>";
    private static final String TITLE_CLOSE = "</title>";

    /** The body is taken from the first {@code <p>} up to this marker comment. */
    private static final String BODY_START = "<p>";
    private static final String BODY_END = "<!-- 吸顶导航结束定位标记 -->";

    /**
     * Literal fragments deleted from the body, applied top to bottom.
     *
     * <p>Order matters: a longer fragment must be listed before any shorter
     * fragment that would destroy it. {@code "<p> 原标题"} therefore precedes
     * {@code "<p>"}, every fragment containing a space precedes {@code " "},
     * and every fragment containing {@code '>'} precedes the bare {@code ">"}.
     */
    private static final String[] REMOVED_FRAGMENTS = {
        "<p> 原标题",
        "<p>",
        "<p class=\"article-editor\">",
        "</p>",
        "<strong>",
        "</strong>",
        "</div>",
        "\n\n\n",
        "<p align=\"justify\"> ",
        "<p class=\"text\">",
        " ",
        "<br>",
        ">",
    };

    public static void main(String[] args) throws IOException {
        zhixing();
    }

    /**
     * Processes every numbered page in turn.
     *
     * <p>The input files are expected to have been renamed to
     * {@code 1.html .. 50.html} beforehand so that a plain counting loop can
     * visit all of them.
     *
     * @throws IOException if a page cannot be read, a result file cannot be
     *     written, or a page lacks the markers needed to locate title or body
     */
    private static void zhixing() throws IOException {
        for (int index = FIRST_INDEX; index <= LAST_INDEX; index++) {
            File source = new File(INPUT_DIR + index + ".html");

            // Files.readAllBytes reads the whole file; a single
            // InputStream.read(byte[]) call is allowed to return fewer bytes.
            byte[] raw = Files.readAllBytes(source.toPath());
            String html = new String(raw, StandardCharsets.UTF_8);

            String title;
            String body;
            try {
                title = extractTitle(html);
                body = cleanBody(extractBody(html));
            } catch (IllegalArgumentException e) {
                // Add the file name so a failing page can be identified.
                throw new IOException(
                        "Cannot parse " + source.getPath() + ": " + e.getMessage(), e);
            }

            System.out.println(title);
            System.out.println(body);

            byte[] separator = SECTION_BREAK.getBytes(StandardCharsets.UTF_8);
            // Append mode: running the program again adds to an existing file.
            // try-with-resources closes the stream even when a write fails.
            try (OutputStream out =
                    new FileOutputStream(OUTPUT_DIR + index + "copy.txt", true)) {
                out.write(title.getBytes(StandardCharsets.UTF_8));
                out.write(separator);
                out.write(body.getBytes(StandardCharsets.UTF_8));
                out.write(separator);
            }

            System.out.println("文件:" + index + "处理完毕\n\n");
        }
        System.out.println("全部处理完毕");
    }

    /**
     * Returns the text between {@code <title>} and the first {@code '|'} that
     * follows it. If {@code </title>} comes before any {@code '|'} (or there
     * is no {@code '|'} at all), the text ends at {@code </title>} instead.
     *
     * @param html the decoded page
     * @return the title text, without the opening tag
     * @throws IllegalArgumentException if {@code <title>} is absent, or if
     *     neither {@code '|'} nor {@code </title>} follows it
     */
    static String extractTitle(String html) {
        int open = html.indexOf(TITLE_OPEN);
        if (open < 0) {
            throw new IllegalArgumentException("no " + TITLE_OPEN + " tag found");
        }
        int contentStart = open + TITLE_OPEN.length();

        // Search only after the opening tag: a '|' earlier in the document
        // must not be taken for the end of the title.
        int pipe = html.indexOf('|', contentStart);
        int close = html.indexOf(TITLE_CLOSE, contentStart);

        int end;
        if (pipe >= 0 && (close < 0 || pipe < close)) {
            end = pipe;
        } else if (close >= 0) {
            end = close;
        } else {
            throw new IllegalArgumentException(
                    "no '|' or " + TITLE_CLOSE + " after " + TITLE_OPEN);
        }
        return html.substring(contentStart, end);
    }

    /**
     * Returns the raw body: everything from the first {@code <p>} (inclusive)
     * up to the end-of-body marker comment (exclusive).
     *
     * @param html the decoded page
     * @return the unprocessed body markup
     * @throws IllegalArgumentException if {@code <p>} is absent or the marker
     *     comment does not occur after it
     */
    static String extractBody(String html) {
        int start = html.indexOf(BODY_START);
        if (start < 0) {
            throw new IllegalArgumentException("no " + BODY_START + " tag found");
        }
        int end = html.indexOf(BODY_END, start);
        if (end < 0) {
            throw new IllegalArgumentException("end-of-body marker not found after " + BODY_START);
        }
        return html.substring(start, end);
    }

    /**
     * Strips markup from a raw body fragment.
     *
     * <p>Steps, in order: the whole-string regex from the original code, the
     * removal of the first {@code <div ...>} span, then each entry of
     * {@link #REMOVED_FRAGMENTS}.
     *
     * @param rawBody body markup as returned by {@link #extractBody(String)}
     * @return the body with the listed fragments removed
     */
    static String cleanBody(String rawBody) {
        // Left unchanged on purpose. Without the MULTILINE flag the anchors
        // bind to the whole input, so this only fires when the fragment is a
        // single line that starts with '<'. Enabling MULTILINE would delete
        // every line that starts with a tag, i.e. every paragraph.
        String text = rawBody.replaceAll("^<.*\n$", "");

        // Must run before the fragment list: the list removes every '>' and
        // can merge lines, after which the span can no longer be located.
        text = removeFirstDivSpan(text);

        for (String fragment : REMOVED_FRAGMENTS) {
            text = text.replace(fragment, "");
        }
        return text;
    }

    /**
     * Deletes the first span that starts at {@code "<div"} and ends with the
     * next {@code ">"} that is directly followed by a newline (the newline is
     * deleted too). Only the first such span is removed. The text is returned
     * unchanged when either end of the span cannot be found.
     */
    private static String removeFirstDivSpan(String text) {
        int divStart = text.indexOf("<div");
        if (divStart < 0) {
            return text;
        }
        int tagEnd = text.indexOf(">\n", divStart);
        if (tagEnd < 0) {
            return text;
        }
        return text.substring(0, divStart) + text.substring(tagEnd + 2);
    }
}