//遇到解析word文档的需求,文档格式不定,在网上看了好多文章,大多是简单应用api解析内容,要不就是需要windows环境,还有个在线编辑的第三方的,不太符合本人需求,目前实现了doc的,docx的后续再说吧。
//本来是想把word解析成xml形式的字符串,就参考了apache poi中WordToHtmlConverter的部分源码。
//因为它做了很多样式的解析,本人用不到这些,解析过多反而累赘,
//所以参考源码+dom4j实现此功能,
//不过后来就顺便实现了下导出html(只有一两个样式)。
//注意修改代码中word文档的地址和导出图片的存放地址(getImgUrl)
private static Document document = null;
static{
document = DocumentHelper.createDocument();
}
/**
 * Entry point: loads a binary Word (.doc) file from a hard-coded path and
 * converts it to HTML via {@link #wordToHtml(HWPFDocument)}.
 * Does nothing when the file does not exist. I/O failures are reported with
 * a stack trace and otherwise swallowed, as before.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Note: point this path at the Word document to convert.
    File file = new File("C:/Users/css/Desktop/1.doc");
    if (!file.exists()) {
        return;
    }
    FileInputStream in = null;
    try {
        in = new FileInputStream(file);
        HWPFDocument doc = new HWPFDocument(in);
        wordToHtml(doc);
        // printRange(doc);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // The original never closed the input stream; release it on every path.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Parses a Word document and exports it as an HTML file.
 *
 * Builds an {@code <html>} tree on the shared {@code document}: a
 * {@code <head>} holding a UTF-8 Content-Type {@code <meta>}, and a
 * {@code <body>} holding the element returned by {@code formatRange(doc)}.
 * The serialized markup is written to a hard-coded path through
 * {@code writeFile} and also printed to standard output.
 *
 * @param doc the Word document to convert
 */
private static void wordToHtml(HWPFDocument doc){
    Element html = DocumentHelper.createElement("html");
    document.setRootElement(html);

    // <head> with a single <meta> declaring the charset
    Element meta = DocumentHelper.createElement("meta");
    meta.addAttribute("http-equiv", "Content-Type");
    meta.addAttribute("content", "text/html; charset=UTF-8");
    Element head = DocumentHelper.createElement("head");
    head.add(meta);
    html.add(head);

    // <body> wraps the parsed Word content
    Element body = DocumentHelper.createElement("body");
    body.add(formatRange(doc));
    html.add(body);

    String markup = document.asXML();
    writeFile(markup, "C:/Users/css/Desktop/12345.html");
    System.out.println(markup);
}
/**
* 导出文件
* @param content
* @param path
*/
private static void writeFile(String content, String path) {
FileOutputStream fos = null;
BufferedWriter bw = null;
try {
File file = new File(path);
fos = new FileOutputStream(file);
bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));
bw.write(content);
} catch (FileNotFoundException fnfe) {
fnfe.printStackTrace();
} catch (IOException ioe) {
ioe.printStackTrace();
} finally {
try {
if (bw != null)
bw.close();