花了一天时间整理了网上查到的把 Word 转为 HTML 的方法,同时支持旧版 .doc 和 Word 2007 及以上版本(.docx)的转换。
代码如下(整合了前辈们的代码):
参考博客地址:http://blog.csdn.net/ptzrbin/article/details/43449701
http://blog.csdn.net/u011687117/article/details/29561027
package data.util;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
import org.xml.sax.ContentHandler;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.xhtml.DefaultContentHandlerFactory;
import org.apache.poi.xwpf.converter.xhtml.IContentHandlerFactory;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
/**
* Word转换为Html并读取Html内容工具类
*/
public class WechatWord2Html {
//输出html文件
public static void writeFile(String content, String path) {
FileOutputStream fos = null;
BufferedWriter bw = null;
org.jsoup.nodes.Document doc = Jsoup.parse(content);
content=doc.html();
try {
File file = new File(path);
fos = new FileOutputStream(file);
bw = new BufferedWriter(new OutputStreamWriter(fos,"GB2312"));
bw.write(content);
} catch (FileNotFoundException fnfe) {
fnfe.printStackTrace();
} catch (IOException ioe) {
ioe.printStackTrace();
} finally {
try {
if (bw != null)
bw.close();
if (fos != null)
fos.close();
} catch (IOException ie) {
}
}
}
/**
* Word 转 Html
* 依赖jar包: ooxml-schemas-1.1.jar ;
* org.apache.poi.xwpf.converter.core-1.0.4.jar ;
* org.apache.poi.xwpf.converter.xhtml-1.0.4.jar ;
* @param fileName
* @param outPutFile
* @param fileNameExtension
* @throws TransformerException
* @throws IOException
* @throws ParserConfigurationException
*/
public static void convert2Html(String filePath, String outPutFile ,String fileNameExtension)
throws TransformerException, IOException, ParserConfigurationException { //filePath :Word文件路径 //outPutFile : 输出文件存放路径 //fileNameExtension : Word后缀 if(fileNameExtension.equals("doc")){ //老版本
HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(filePath)); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
DocumentBuilderFactory.newInstance().newDocumentBuilder()
.newDocument());
wordToHtmlConverter.setPicturesManager( new PicturesManager() {
public String savePicture( byte[] content,
PictureType pictureType, String suggestedName,
float widthInches, float heightInches ) {
return "test/"+suggestedName;
}
} );
wordToHtmlConverter.processDocument(wordDocument);
//save pictures
List pics=wordDocument.getPicturesTable().getAllPictures();
if(pics!=null){
for(int i=0;i<pics.size();i++){
Picture pic = (Picture)pics.get(i);
System.out.println();
try {
pic.writeImageContent(new FileOutputStream("D:/test/"
+ pic.suggestFullFileName()));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
}
Document htmlDocument = wordToHtmlConverter.getDocument();
ByteArrayOutputStream out = new ByteArrayOutputStream();
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(out);
TransformerFactory tf = TransformerFactory.newInstance();
Transformer serializer = tf.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "GB2312");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
serializer.transform(domSource, streamResult);
out.close();
writeFile(new String(out.toByteArray()), outPutFile);
}else if(fileNameExtension.equals("docx")){ //新版本
XWPFDocument document = new XWPFDocument(new FileInputStream(new File(filePath)));
XHTMLOptions options = XHTMLOptions.create();// .indent( 4 );
IContentHandlerFactory f = new DefaultContentHandlerFactory();
// Extract image
options.setExtractor(new FileImageExtractor(new File("D:/")));
// URI resolver
options.URIResolver(new IURIResolver() {
@Override
public String resolve(String uri) {
return "D:/" + uri;
}
});
ByteArrayOutputStream out = new ByteArrayOutputStream();
ContentHandler contentHandler = f.create(out, null, options);
XHTMLConverter.getInstance().convert(document, out, options);
out.close();
writeFile(new String(out.toByteArray()), outPutFile);
}
}
/**
* Html内容提取为String
* @param filePath
* @return
*/
public static String readfile(String filePath){
File file = new File(filePath);
InputStream input = null;
try {
input = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
StringBuffer buffer = new StringBuffer();
byte[] bytes = new byte[1024];
try {
for (int n; (n = input.read(bytes)) != -1;) {
buffer.append(new String(bytes, 0, n, "GBK"));
}
} catch (IOException e) {
e.printStackTrace();
}
// System.out.println(buffer);
return buffer.toString();
}
/**
* 读取html的body内容为String
* @param val
* @return
*/
public static String getBody(String val) {
String start = "<body>";
String end = "</body>";
int s = val.indexOf(start) + start.length();
int e = val.indexOf(end);
return val.substring(s, e);
}
}
本文介绍了一种将Word文档转换为HTML的方法,适用于doc和docx两种格式。通过使用Apache POI库,文章提供了详细的Java代码示例,并解释了如何处理图片资源及设置输出编码。
6640

被折叠的 条评论
为什么被折叠?



