java 使用poi 将word转换为html (包含图片的转换以及远程word转换)

原创已于 2024-12-05 12:46:12 修改 · 2.2k 阅读

21 ·

CC 4.0 BY-SA版权

文章标签：

#java word 转 html

于 2019-07-26 15:37:16 首次发布

java poi 专栏收录该内容

1 篇文章

订阅专栏

该博客详细介绍了如何使用Java的POI库将Word文档转换为HTML，包括处理远程Word文件及图片转换。通过设置PicturesManager处理图片，将图片保存在本地并替换为HTML中的URL链接。

话不多说，直接上代码：

Maven 增加配置

<dependency>
   <groupId>org.apache.poi</groupId>
   <artifactId>poi</artifactId>
   <version>3.17</version>
</dependency>
<dependency>
   <groupId>org.apache.poi</groupId>
   <artifactId>poi-ooxml</artifactId>
   <version>3.17</version>
</dependency>
<dependency>
   <groupId>org.apache.poi</groupId>
   <artifactId>poi-ooxml-schemas</artifactId>
   <version>3.17</version>
</dependency>
<dependency>
   <groupId>org.apache.poi</groupId>
   <artifactId>poi-scratchpad</artifactId>
   <version>3.17</version>
</dependency>
<dependency>
   <groupId>fr.opensagres.xdocreport</groupId>
   <artifactId>xdocreport</artifactId>
   <version>1.0.6</version>
</dependency>
<dependency>
   <groupId>org.apache.poi</groupId>
   <artifactId>ooxml-schemas</artifactId>
   <version>1.3</version>
</dependency>

下面是代码实现：

/**
 * Converts a Word document ({@code .doc} or {@code .docx}) to HTML and returns the HTML text.
 *
 * <p>When {@code path} is an http(s) URL the document is downloaded to the local
 * {@code /htmlfile} directory first. If an HTML file produced by an earlier conversion of the
 * same remote document already exists there, its content is returned without converting again.
 * The generated HTML is always written next to the (local) source document so that it can be
 * reused by later calls.
 *
 * <p>Embedded pictures are written to {@code /htmlfile/image} and referenced from the HTML as
 * {@code url + "/staticFile/htmlfile/image/<name>"}.
 *
 * @param path local path or http(s) URL of the Word document
 * @param url  base address of the service under which the extracted images are served
 * @return the generated HTML, or an empty string when the file type is not supported or any
 *         step of the conversion fails (the failure is printed, not rethrown)
 */
public static String doc2Html(String path,String url) {
    try {
        String sourceFileName = path;
        String sourceHtmlPath = path.substring(0, path.lastIndexOf(".")) + ".html";
        // Only a real http(s) URL is remote; a local path that merely contains "http"
        // (e.g. /data/http_docs/a.doc) must not be sent to the downloader.
        if (path.startsWith("http://") || path.startsWith("https://")) {
            sourceFileName = "/htmlfile" + path.substring(path.lastIndexOf("/"));
            File sourceFile = new File(sourceFileName);
            // A zero-length file is the leftover of a failed download: fetch it again.
            if (!sourceFile.exists() || sourceFile.length() == 0) {
                File parentDir = sourceFile.getParentFile();
                if (parentDir != null && !parentDir.exists()) {
                    parentDir.mkdirs();
                }
                downloadFile(path, sourceFileName);
                if (sourceFile.length() == 0) {
                    // Do not keep an empty placeholder, it would block every later retry.
                    sourceFile.delete();
                    throw new IOException("download failed or produced an empty file: " + path);
                }
            }
            sourceHtmlPath = sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + ".html";
            // Reuse the HTML produced by an earlier conversion of the same remote document.
            if (new File(sourceHtmlPath).exists()) {
                return readfile(sourceHtmlPath);
            }
        }

        String imageUrl = url + "/staticFile/htmlfile/image";
        String imagePath = "/htmlfile/image";

        if (sourceFileName.endsWith(".doc")) {
            return convertDocToHtml(sourceFileName, sourceHtmlPath, imagePath, imageUrl);
        } else if (sourceFileName.endsWith(".docx")) {
            return convertDocxToHtml(sourceFileName, sourceHtmlPath, imagePath, imageUrl);
        } else {
            System.out.println("传入的word文件不正确:"+path);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return "";
}

/** Converts a binary {@code .doc} file to HTML, stores it at {@code sourceHtmlPath} and returns its text. */
private static String convertDocToHtml(String sourceFileName, String sourceHtmlPath,
        String imagePath, String imageUrl) throws Exception {
    File imageDir = new File(imagePath + "/");
    if (!imageDir.exists()) {
        imageDir.mkdirs();
    }

    // Empty DOM document that the converter fills with the HTML tree.
    Document dom = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(dom);
    // Store every embedded picture on disk and reference it in the HTML by its public URL.
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
        @Override
        public String savePicture(byte[] content,
                PictureType pictureType, String suggestedName,
                float widthInches, float heightInches) {
            try (FileOutputStream out = new FileOutputStream(imagePath + "/" + suggestedName)) {
                out.write(content);
            } catch (Exception e) {
                e.printStackTrace();
            }
            return imageUrl + "/" + suggestedName;
        }
    });

    // The source stream is closed as soon as the document has been processed.
    try (InputStream is = new FileInputStream(new File(sourceFileName))) {
        HWPFDocument doc = new HWPFDocument(is);
        wordToHtmlConverter.processDocument(doc);
    }

    DOMSource domSource = new DOMSource(wordToHtmlConverter.getDocument());
    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");
    // Serialize through a stream we own so it is flushed and closed before the file is read back.
    try (FileOutputStream htmlOut = new FileOutputStream(sourceHtmlPath)) {
        serializer.transform(domSource, new StreamResult(htmlOut));
    }
    return readfile(sourceHtmlPath);
}

/** Converts an OOXML {@code .docx} file to HTML, stores it at {@code sourceHtmlPath} and returns its text. */
private static String convertDocxToHtml(String sourceFileName, String sourceHtmlPath,
        String imagePath, String imageUrl) throws Exception {
    try (InputStream in = new FileInputStream(new File(sourceFileName))) {
        XWPFDocument document = new XWPFDocument(in);
        XHTMLOptions options = XHTMLOptions.create();
        // Folder the extracted pictures are written to.
        options.setExtractor(new FileImageExtractor(new File(imagePath)));
        // Prefix used for the picture references inside the generated HTML.
        options.URIResolver(new BasicURIResolver(imageUrl));
        XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
        // The writer is opened only after the document has been parsed, and closed (hence
        // flushed) before the file is read back.
        try (OutputStreamWriter outputStreamWriter =
                new OutputStreamWriter(new FileOutputStream(sourceHtmlPath), "utf-8")) {
            xhtmlConverter.convert(document, outputStreamWriter, options);
        }
    }
    // To get the HTML without keeping a local HTML file, the conversion above can be replaced
    // by converting into an in-memory stream:
    // ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // XHTMLConverter.getInstance().convert(document, baos, null);
    // String content = baos.toString();
    // baos.close();
    return readfile(sourceHtmlPath);
}

/**
 * Reads a whole file and decodes its content as UTF-8.
 *
 * @param filePath path of the file to read
 * @return the file content, or an empty string when the file is missing or cannot be read
 *         (the failure is printed, not rethrown)
 */
public static String readfile(String filePath) {
    File file = new File(filePath);
    try {
        // Decode the complete byte array in one go: decoding fixed-size chunks separately
        // corrupts every multi-byte character that straddles a chunk boundary.
        byte[] raw = java.nio.file.Files.readAllBytes(file.toPath());
        return new String(raw, java.nio.charset.StandardCharsets.UTF_8);
    } catch (IOException e) {
        e.printStackTrace();
        return "";
    }
}

/**
 * Writes {@code content} to the file at {@code path} as UTF-8, replacing any existing content.
 *
 * <p>I/O failures, including those raised while flushing and closing the file, are printed
 * and not rethrown.
 *
 * @param content text to write
 * @param path    path of the target file
 */
public static void writeFile(String content, String path) {
    // try-with-resources closes the writer before the stream; a failure during close (which
    // is when the buffered text is actually flushed) reaches the catch blocks below instead
    // of being silently dropped.
    try (FileOutputStream fos = new FileOutputStream(new File(path));
         BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "utf8"))) {
        bw.write(content);
    } catch (FileNotFoundException fnfe) {
        fnfe.printStackTrace();
    } catch (IOException ioe) {
        ioe.printStackTrace();
    }
}

/**
 * Downloads a remote file over HTTP(S) and stores it at a local path.
 *
 * <p>Failures (malformed URL, connection or I/O errors) are printed and not rethrown. The
 * local file is only opened once the remote input stream has been obtained, so a failed
 * connection does not create it.
 *
 * @param remoteFilePath http(s) URL of the file to download
 * @param localFilePath  path the downloaded bytes are written to
 */
public static void downloadFile(String remoteFilePath, String localFilePath){
    HttpURLConnection httpUrl = null;
    try {
        URL urlfile = new URL(remoteFilePath);
        httpUrl = (HttpURLConnection) urlfile.openConnection();
        httpUrl.connect();
        // Both streams are closed automatically, also when the copy fails half-way; nothing
        // is dereferenced in a finally block, so an early failure cannot raise a
        // NullPointerException.
        try (BufferedInputStream bis = new BufferedInputStream(httpUrl.getInputStream());
             BufferedOutputStream bos =
                     new BufferedOutputStream(new FileOutputStream(new File(localFilePath)))) {
            byte[] b = new byte[2048];
            int len;
            while ((len = bis.read(b)) != -1) {
                bos.write(b, 0, len);
            }
            bos.flush();
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the connection on success and on failure alike.
        if (httpUrl != null) {
            httpUrl.disconnect();
        }
    }
}