很多人使用POI读取word的时候都会这么写:
1 2 3 | XWPFDocument document = new XWPFDocument(inputStream); System.out.println( new XWPFWordExtractor(document).getText()); |
但是这种写法其实有不少问题:文本框里面的内容读取不到,换行也有问题。于是我改进了一下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | /** * 处理2007+的WORD * @param filePath 文件地址 * @return word内容 */ private static String read2007(String filePath) { InputStream inputStream = null ; StringBuffer content = new StringBuffer(); try { inputStream = new FileInputStream( new File(filePath)); XWPFDocument document = new XWPFDocument(inputStream); // 读取非表格文本框 for (XWPFParagraph xwpfParagraph : document.getParagraphs()) { for (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); } } // 读取表格内文本框 for (XWPFTable xwpfTable : document.getTables()) { for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { for (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); } } } } } // 读取表格内容 for (XWPFTable xwpfTable : document.getTables()) { for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { content.append(xwpfParagraph.getText()).append(NEW_LINE); } } } } return content.toString(); } catch (IOException e) { logger.error( "解析word错误,文件地址:" + filePath, e); } finally { IOUtils.closeQuietly(inputStream); } return null ; } /** * 获取XML内容,可以使用递归cursor.getDomNode() * @param xml xml * @return xml内容 */ private static String getXMLContent(String xml) { StringBuffer content = new StringBuffer(); Document document; try { document = DocumentHelper.parseText(xml); List<?> namespaces = document.getRootElement().declaredNamespaces(); // 判断是否有表格包含文本框 boolean hasboxintab = false ; for 
(Object object : namespaces) { Namespace namespace = (Namespace) object; if (NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(namespace.getPrefix())) { hasboxintab = true ; break ; } } if (!hasboxintab) return content.toString(); for (Object node : document.selectNodes( "//mc:Fallback//w:p" )) { for (Object nodeb : ((Node) node).selectNodes( ".//w:t" )) { if (StringUtils.isNotEmpty(((Node) nodeb).getText())) content.append(((Node) nodeb).getText()); } content.append(NEW_LINE); } } catch (DocumentException e) { logger.error( "XML转化错误,内容:" + xml, e); } return content.toString(); } |
2003版本简单一些:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | /** * 处理2003的WORD * @param filePath 文件地址 * @return word内容 */ private static String read2003(String filePath) { InputStream inputStream = null ; StringBuffer content = new StringBuffer(); try { inputStream = new FileInputStream( new File(filePath)); HWPFDocument document = new HWPFDocument(inputStream); String text = null ; for ( int i = 0 ; i < document.getMainTextboxRange().numParagraphs(); i++) { // 文本框 text = document.getMainTextboxRange().getParagraph(i).text(); if (StringUtils.isNotEmpty(text)) content.append(text).append(NEW_LINE); } for ( int i = 0 ; i < document.getRange().numParagraphs(); i++) { // 非文本框 text = document.getRange().getParagraph(i).text(); if (StringUtils.isNotEmpty(text) && StringUtils.isNotEmpty(text.trim())) // 注意这里的trim()方法否者会出现乱码 content.append(text.trim()).append(NEW_LINE); } return content.toString(); } catch (FileNotFoundException e) { logger.error( "解析word错误,文件地址:" + filePath, e); } catch (IOException e) { logger.error( "解析word错误,文件地址:" + filePath, e); } finally { IOUtils.closeQuietly(inputStream); } return null ; } |
注意:读取出的内容包括表格里面的内容、文本框内容和直接写在编辑区里面的文本;其他诸如批注、引用等信息可能读取不到,需要的请自行解决。
比较完整的代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFRun; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.DocumentHelper; import org.dom4j.Node; /** * WordReaderUtils - WORD 读取 * * @author 500d Team * @version 1.0 */ public class WordReaderUtils { private static final String WORD_2003 = "doc" ; private static final String WORD_2007 = "docx" ; private static final Logger logger = Logger.getLogger(WordReaderUtils. class ); public static final String NEW_LINE = "\r\n" ; public static String read(String filePath) { File wordFile = StringUtils.isNotEmpty(filePath) ? 
new File(filePath) : null ; if (wordFile == null || !wordFile.exists() || !wordFile.isFile()) return null ; String extension = FilenameUtils.getExtension(filePath); if (StringUtils.isEmpty(extension)) return null ; String content = null ; if (WORD_2003.equals(extension.toLowerCase())) content = read2003(filePath); else if (WORD_2007.equals(extension.toLowerCase())) content = read2007(filePath); return Crossover.handle(content); } /** * 处理2003的WORD * @param filePath 文件地址 * @return word内容 */ private static String read2003(String filePath) { InputStream inputStream = null ; StringBuffer content = new StringBuffer(); try { inputStream = new FileInputStream( new File(filePath)); HWPFDocument document = new HWPFDocument(inputStream); String text = null ; for ( int i = 0 ; i < document.getMainTextboxRange().numParagraphs(); i++) { text = document.getMainTextboxRange().getParagraph(i).text(); if (StringUtils.isNotEmpty(text)) content.append(text).append(NEW_LINE); } for ( int i = 0 ; i < document.getRange().numParagraphs(); i++) { text = document.getRange().getParagraph(i).text(); if (StringUtils.isNotEmpty(text) && StringUtils.isNotEmpty(text.trim())) // 注意这里的trim()方法否者会出现乱码 content.append(text.trim()).append(NEW_LINE); } return content.toString(); } catch (FileNotFoundException e) { logger.error( "解析word错误,文件地址:" + filePath, e); } catch (IOException e) { logger.error( "解析word错误,文件地址:" + filePath, e); } finally { IOUtils.closeQuietly(inputStream); } return null ; } /** * 处理2007+的WORD * @param filePath 文件地址 * @return word内容 */ private static String read2007(String filePath) { InputStream inputStream = null ; StringBuffer content = new StringBuffer(); try { inputStream = new FileInputStream( new File(filePath)); XWPFDocument document = new XWPFDocument(inputStream); // 读取非表格文本框 for (XWPFParagraph xwpfParagraph : document.getParagraphs()) { for (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { 
content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); } } // 读取表格内文本框 for (XWPFTable xwpfTable : document.getTables()) { for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { for (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); } } } } } // 读取表格内容 for (XWPFTable xwpfTable : document.getTables()) { for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { content.append(xwpfParagraph.getText()).append(NEW_LINE); } } } } return content.toString(); } catch (IOException e) { logger.error( "解析word错误,文件地址:" + filePath, e); } finally { IOUtils.closeQuietly(inputStream); } return null ; } /** * 获取XML内容,可以使用递归cursor.getDomNode() * @param xml xml * @return xml内容 */ private static String getXMLContent(String xml) { StringBuffer content = new StringBuffer(); Document document; try { document = DocumentHelper.parseText(xml); List<?> namespaces = document.getRootElement().declaredNamespaces(); // 判断是否有表格包含文本框 boolean hasboxintab = false ; for (Object object : namespaces) { Namespace namespace = (Namespace) object; if (NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(namespace.getPrefix())) { hasboxintab = true ; break ; } } if (!hasboxintab) return content.toString(); for (Object node : document.selectNodes( "//mc:Fallback//w:p" )) { for (Object nodeb : ((Node) node).selectNodes( ".//w:t" )) { if (StringUtils.isNotEmpty(((Node) nodeb).getText())) content.append(((Node) nodeb).getText()); } content.append(NEW_LINE); } } catch (DocumentException e) { logger.error( "XML转化错误,内容:" + xml, e); } return content.toString(); } public static void main(String[] args) throws Exception { // 
System.out.println(read("e://company/test.doc")); // System.out.println(read("e://company/test.docx")); } } |
参考文档:http://www.acgist.com/article/206.html