import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; import java.util.Iterator; import java.util.Map;
import javax.swing.text.BadLocationException; import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit;
import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.util.PDFTextStripper; import org.textmining.text.extraction.WordExtractor;
public class ReadOffice { /** * @param args */ public static void main(String[] args) { // readDoc("e:/1.doc"); // readExcel("e:/1.xls"); // readPDF("e:/1.pdf"); // readHtml("e:/1.html"); readHtmlAll("e:/1.html"); } /** * 创建TXT文件,写入文件内容 * * @param text */ static void createTXTAndWriteDoc(String text, String path) { FileOutputStream fos = null; FileOutputStream out = null; try { // 新建一输出文件流,如果文件存在先删除文件 File f = new File(path); if (f.exists()) { f.delete(); } fos = new FileOutputStream(f); out = new FileOutputStream(f); byte[] b = text.getBytes("GB2312"); out.write(b); out.flush(); System.out.println("文件生成..."); } catch (Exception e) { System.out.println("出现异常: " + e); } finally { try { if (null != fos) { fos.close(); } } catch (IOException e) { e.printStackTrace(); } try { if (null != out) { out.close(); } } catch (IOException e) { e.printStackTrace(); } fos = null; out = null; } } /** * 读取DOC文件 * * @param dir * @throws Exception */ static void readDoc(String dir) { // 创建输入流读取doc文件 FileInputStream in = null; WordExtractor extractor = null; String text = null; try { in = new FileInputStream(new File(dir)); // 创建WordExtractor extractor = new WordExtractor(); // 对doc文件进行提取 text = extractor.extractText(in); System.out.println("text1:" + text); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } finally { try { if (null != in) { in.close(); } } catch (IOException e) { e.printStackTrace(); } in = null; } // 写入文件内容 createTXTAndWriteDoc(text, "e:/doc.txt"); } /** * 读取Excel文件 * * @param dir */ @SuppressWarnings("deprecation") static void readExcel(String dir) { /** * @param filePath * 文件路径 * @return 读出的Excel的内容 */ StringBuffer buff = new StringBuffer(); try { // 创建对Excel工作簿文件的引用 HSSFWorkbook wb = new HSSFWorkbook(new FileInputStream(dir)); // 创建对工作表的引用。 for (int numSheets = 0; numSheets < wb.getNumberOfSheets(); numSheets++) { if (null != wb.getSheetAt(numSheets)) { HSSFSheet aSheet = wb.getSheetAt(numSheets);// 获得一个sheet for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) { if (null != aSheet.getRow(rowNumOfSheet)) { HSSFRow aRow = aSheet.getRow(rowNumOfSheet); // 获得一个行 for (int cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) { if (null != aRow.getCell((short) cellNumOfRow)) { HSSFCell aCell = aRow.getCell((short) cellNumOfRow);// 获得列值 switch (aCell.getCellType()) { case HSSFCell.CELL_TYPE_FORMULA: break; case HSSFCell.CELL_TYPE_NUMERIC: buff.append(aCell.getNumericCellValue()).append(' '); break; case HSSFCell.CELL_TYPE_STRING: buff.append(aCell.getStringCellValue()).append(' '); break; } } } buff.append(' '); } } } } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } // 写入文件内容 createTXTAndWriteDoc(buff.toString(), "e:/excel.txt"); } /** * 读取Powerpoint文件 * * @param dir */ static void readPPT(String dir) { } /** * 读取PDF文件 * * @param dir */ static void readPDF(String dir) { String result = null; FileInputStream is = null; PDDocument document = null; try { is = new FileInputStream(dir); PDFParser parser = new PDFParser(is); parser.parse(); document = parser.getPDDocument(); PDFTextStripper stripper = new PDFTextStripper(); result = stripper.getText(document); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (null != is) { try { is.close(); } catch (IOException e) { e.printStackTrace(); } } if (null != document) { try { document.close(); } catch (IOException e) { e.printStackTrace(); } } } // 写入文件内容 createTXTAndWriteDoc(result, "e:/pdf.txt"); } /** * // 读取pdf文件 * * @param file * @throws Exception */ public void readPdf(String file) throws Exception { // 是否排序 boolean sort = false; // pdf文件名 String pdfFile = file; // 输入文本文件名称 String textFile = null; // 编码方式 String encoding = "GB2312"; // 开始提取页数 int startPage = 1; // 结束提取页数 int endPage = Integer.MAX_VALUE; // 文件输入流,生成文本文件 Writer output = null; // 内存中存储的PDF Document PDDocument document = null; try { try { // 首先当作一个URL来装载文件,如果得到异常再从本地文件系统//去装载文件 URL url = new URL(pdfFile); // 注意参数已不是以前版本中的URL.而是File。 document = PDDocument.load(pdfFile); // 获取PDF的文件名 String fileName = url.getFile(); // 以原来PDF的名称来命名新产生的txt文件 if (fileName.length() > 4) { File outputFile = new File(fileName.substring(0, fileName.length() - 4) + ".txt"); textFile = outputFile.getName(); } } catch (MalformedURLException e) { // 如果作为URL装载得到异常则从文件系统装载 //注意参数已不是以前版本中的URL.而是File。 document = PDDocument.load(pdfFile); if (pdfFile.length() > 4) { textFile = pdfFile.substring(0, pdfFile.length() - 4) + ".txt"; } } // 文件输入流,写入文件倒textFile output = new OutputStreamWriter(new FileOutputStream(textFile), encoding); // PDFTextStripper来提取文本 PDFTextStripper stripper = null; stripper = new PDFTextStripper(); // 设置是否排序 stripper.setSortByPosition(sort); // 设置起始页 stripper.setStartPage(startPage); // 设置结束页 System.out.print(stripper.getText(document)); stripper.setEndPage(endPage); // 调用PDFTextStripper的writeText提取并输出文本 stripper.writeText(document, output); } finally { if (output != null) { // 关闭输出流 output.close(); } if (document != null) { // 关闭PDF Document document.close(); } } } /** * 读取Txt文件 * * @param filePath * @return * @throws Exception */ public String getTextFromTxt(String filePath) throws Exception { FileReader fr = new FileReader(filePath); BufferedReader br = new BufferedReader(fr); StringBuffer buff = new StringBuffer(); String temp = null; while ((temp = br.readLine()) != null) { buff.append(temp + " "); } br.close(); return buff.toString(); } /** * 读取RTF文件内容 * * @param filePath * @return */ public String getTextFromRtf(String filePath) { String result = null; File file = new File(filePath); try { DefaultStyledDocument styledDoc = new DefaultStyledDocument(); InputStream is = new FileInputStream(file); new RTFEditorKit().read(is, styledDoc, 0); result = new String(styledDoc.getText(0, styledDoc.getLength()).getBytes("ISO8859_1")); // 提取文本,读取中文需要使用ISO8859_1编码,否则会出现乱码 } catch (IOException e) { e.printStackTrace(); } catch (BadLocationException e) { e.printStackTrace(); } return result; } /** * @param filePath * 文件路径 * @return 获得html的全部内容 */ public static String readHtml(String filePath) { BufferedReader br = null; StringBuffer sb = new StringBuffer(); try { br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), "GB2312")); String temp = null; while ((temp = br.readLine()) != null) { sb.append(temp); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } // 写入文件内容 createTXTAndWriteDoc(sb.toString(), "e:/html.txt"); return sb.toString(); } /** * @param filePath * 文件路径 * @return 获得的html文本内容 */ public static void readHtmlAll(String filePath) { // 得到body标签中的内容 String str = readHtml(filePath); StringBuffer buff = new StringBuffer(); int maxindex = str.length() - 1; int begin = 0; int end; // 截取>和<之间的内容 while ((begin = str.indexOf('>', begin)) < maxindex) { end = str.indexOf('<', begin); if (end - begin > 1) { buff.append(str.substring(++begin, end)); } begin = end + 1; } // 写入文件内容 createTXTAndWriteDoc(buff.toString(), "e:/htmlAll.txt"); //return buff.toString(); } /** * 以行为单位读取文件(文本文件) * * @param filePath */ public static void readFileByLine(String filePath) { File file = new File(filePath); BufferedReader bd = null; Map<String, String> str = new HashMap<String, String>(); String s1 = ""; String s2 = ""; try { bd = new BufferedReader(new InputStreamReader(new FileInputStream(file), "gb2312"));// 编码转换(关键的地方) String temp = ""; int line = 1; while ((temp = bd.readLine()) != null) { if (temp.length() > 0) { s1 = temp.substring(0, 3); s1 = s1.trim(); s2 = temp.substring(4); s2 = s2.trim(); str.put(s1, s2); } ++line; } createExcel(str); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { if (bd != null) bd.close(); } catch (IOException e) { e.printStackTrace(); } } } /** * 输出Excel文件,输出格式为多行两列 * * @param map */ @SuppressWarnings( { "deprecation", "unchecked" }) static void createExcel(Map<String, String> map) { try { // 新建一输出文件流 FileOutputStream fOut = new FileOutputStream("e:/2.xls"); File file = new File("e:/2.xls"); if (file.exists()) { file.delete(); } // 创建新的Excel 工作簿 HSSFWorkbook workbook = new HSSFWorkbook(); // 在Excel工作簿中建一工作表,其名为缺省值 // 如要新建一名为"联系人用户名和电话"的工作表,其语句为: HSSFSheet sheet = workbook.createSheet("联系人用户名和电话"); HSSFRow row = null; // 在索引0的位置创建单元格(左上端) HSSFCell cell1 = null; HSSFCell cell2 = null; Iterator iter = map.entrySet().iterator(); int i = 0; while (iter.hasNext()) { Map.Entry entry = (Map.Entry) iter.next(); Object key = entry.getKey(); Object val = entry.getValue(); row = sheet.createRow((short) i++); cell1 = row.createCell((short) 0); cell2 = row.createCell((short) 1); // 定义单元格为字符串类型 cell1.setCellType(HSSFCell.CELL_TYPE_STRING); cell2.setCellType(HSSFCell.CELL_TYPE_STRING); // 在单元格中输入一些内容 cell1.setCellValue(key.toString()); cell2.setCellValue(val.toString()); if (i > 255) { break; } } // 把相应的Excel 工作簿存盘 workbook.write(fOut); fOut.flush(); // 操作结束,关闭文件 fOut.close(); System.out.println("文件生成..."); } catch (Exception e) { System.out.println("出现异常: " + e); } } }