LibreOffice 各类文件转换的 FilterName

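本文介绍了一个使用 Python 实现的 LibreOffice 文件转换器,能够将 docx、pptx、odp、html 等格式的文档转换为 PDF、JPG、HTML 等格式。该转换器通过 UNO 接口连接以无界面(headless)方式运行的 LibreOffice,先按输入文件的扩展名选择导入过滤器,再根据加载后文档所属的类型(文本、网页、表格、演示文稿、绘图)选择对应的导出过滤器(FilterName)来完成转换任务。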

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

# Document families a loaded LibreOffice document can belong to.  The order
# matters: other code refers to the families by index into this list.
LIBREOFFICE_DOC_FAMILIES = [
    "TextDocument",
    "WebDocument",
    "Spreadsheet",
    "Presentation",
    "Graphics",
]

# Load properties keyed by input file extension (without the dot).
# NOTE(review): some of these values look like UI display names rather than
# internal filter names -- verify against the installed LibreOffice filters.
LIBREOFFICE_IMPORT_TYPES = {
    "docx": {"FilterName": "MS Word 2007 XML"},
    "pdf": {"FilterName": "PDF - Portable Document Format"},
    "jpg": {"FilterName": "JPEG - Joint Photographic Experts Group"},
    "html": {"FilterName": "HTML Document"},
    "odp": {"FilterName": "OpenDocument Presentation (Flat XML)"},
    "pptx": {"FilterName": "Microsoft PowerPoint 2007 XML"},
}

# Readable aliases for the family names, taken from the list above so the
# export table keeps a single source of truth.
_TEXT, _WEB, _SPREADSHEET, _PRESENTATION, _GRAPHICS = LIBREOFFICE_DOC_FAMILIES

# Store properties keyed first by output format, then by document family.
# A missing family under a format means that combination is not supported.
LIBREOFFICE_EXPORT_TYPES = {
    "pdf": {
        _TEXT: {"FilterName": "writer_pdf_Export"},
        _WEB: {"FilterName": "writer_web_pdf_Export"},
        _SPREADSHEET: {"FilterName": "calc_pdf_Export"},
        _PRESENTATION: {"FilterName": "impress_pdf_Export"},
        _GRAPHICS: {"FilterName": "draw_pdf_Export"},
    },
    "jpg": {
        _PRESENTATION: {"FilterName": "impress_jpg_Export"},
        _GRAPHICS: {"FilterName": "draw_jpg_Export"},
    },
    "html": {
        _TEXT: {"FilterName": "HTML (StarWriter)"},
        _WEB: {"FilterName": "HTML"},
        _SPREADSHEET: {"FilterName": "HTML (StarCalc)"},
        _PRESENTATION: {"FilterName": "impress_html_Export"},
        _GRAPHICS: {"FilterName": "draw_html_Export"},
    },
    "docx": {
        _TEXT: {"FilterName": "MS Word 2007 XML"},
    },
    "odp": {
        _PRESENTATION: {"FilterName": "impress8"},
    },
    "pptx": {
        _PRESENTATION: {"FilterName": "Impress MS PowerPoint 2007 XML"},
    },
}

转:

convert_test

#!/usr/bin/env python3
"""
    VIEW COMPLETE CODE AT
    =====================
    * https://github.com/six519/libreoffice_convert
    THANKS
    ======
    * Thanks to: Mirko Nasato for his PyODConverter http://www.artofsolving.com/opensource/pyodconverter
    TESTED USING
    ============
    * Fedora release 20 (Heisenbug)
    * Python 3.3.2
    INSTALL DEPENDENCIES
    ====================
    * yum install libreoffice-sdk
"""

import uno
import subprocess
import time
import os

from com.sun.star.beans import PropertyValue

# Default endpoint on which the headless office is asked to accept UNO
# connections.
LIBREOFFICE_DEFAULT_PORT = 6519
LIBREOFFICE_DEFAULT_HOST = "localhost"

# Document families a loaded LibreOffice document can belong to.  The order
# matters: getDocumentFamily() refers to the families by index.
LIBREOFFICE_DOC_FAMILIES = [
    "TextDocument",
    "WebDocument",
    "Spreadsheet",
    "Presentation",
    "Graphics",
]

# Load properties keyed by input file extension (without the dot); merged
# into the properties passed when a document is loaded.
# NOTE(review): some of these values look like UI display names rather than
# internal filter names -- verify against the installed LibreOffice filters.
LIBREOFFICE_IMPORT_TYPES = {
    "docx": {"FilterName": "MS Word 2007 XML"},
    "pdf": {"FilterName": "PDF - Portable Document Format"},
    "jpg": {"FilterName": "JPEG - Joint Photographic Experts Group"},
    "html": {"FilterName": "HTML Document"},
    "odp": {"FilterName": "OpenDocument Presentation (Flat XML)"},
    "pptx": {"FilterName": "Microsoft PowerPoint 2007 XML"},
}

# Readable aliases for the family names, taken from the list above so the
# export table keeps a single source of truth.
_TEXT, _WEB, _SPREADSHEET, _PRESENTATION, _GRAPHICS = LIBREOFFICE_DOC_FAMILIES

# Store properties keyed first by output format, then by document family.
# A missing family under a format means that combination is not supported.
LIBREOFFICE_EXPORT_TYPES = {
    "pdf": {
        _TEXT: {"FilterName": "writer_pdf_Export"},
        _WEB: {"FilterName": "writer_web_pdf_Export"},
        _SPREADSHEET: {"FilterName": "calc_pdf_Export"},
        _PRESENTATION: {"FilterName": "impress_pdf_Export"},
        _GRAPHICS: {"FilterName": "draw_pdf_Export"},
    },
    "jpg": {
        _PRESENTATION: {"FilterName": "impress_jpg_Export"},
        _GRAPHICS: {"FilterName": "draw_jpg_Export"},
    },
    "html": {
        _TEXT: {"FilterName": "HTML (StarWriter)"},
        _WEB: {"FilterName": "HTML"},
        _SPREADSHEET: {"FilterName": "HTML (StarCalc)"},
        _PRESENTATION: {"FilterName": "impress_html_Export"},
        _GRAPHICS: {"FilterName": "draw_html_Export"},
    },
    "docx": {
        _TEXT: {"FilterName": "MS Word 2007 XML"},
    },
    "odp": {
        _PRESENTATION: {"FilterName": "impress8"},
    },
    "pptx": {
        _PRESENTATION: {"FilterName": "Impress MS PowerPoint 2007 XML"},
    },
}

class PythonLibreOffice(object):
    """Convert documents by driving a headless LibreOffice over a UNO socket.

    Creating an instance launches ``soffice`` accepting connections on
    ``host:port`` and then tries to connect to it.  Failures do not raise:
    ``desktop`` stays ``None`` and the reason is available from
    :attr:`lastError`.
    """

    def __init__(self, host=LIBREOFFICE_DEFAULT_HOST, port=LIBREOFFICE_DEFAULT_PORT):
        """Launch the office process and connect to it.

        Args:
            host: Host name used in the UNO accept/connect string.
            port: TCP port used in the UNO accept/connect string.
        """
        self.host = host
        self.port = port
        # Set up the error slot first so every later step can record into it.
        self.__lastErrorMessage = ""
        self.context = None
        self.desktop = None
        self._process = None
        self.local_context = uno.getComponentContext()
        self.resolver = self.local_context.ServiceManager.createInstanceWithContext(
            "com.sun.star.bridge.UnoUrlResolver", self.local_context)
        # Use the endpoint the caller asked for, not the module defaults.
        self.connectionString = "socket,host=%s,port=%s;urp;StarOffice.ComponentContext" % (self.host, self.port)
        self.runUnoProcess()

        try:
            self.context = self.resolver.resolve("uno:%s" % self.connectionString)
            self.desktop = self.context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", self.context)
        except Exception as e:
            # Keep an earlier launch failure visible next to the connect error.
            self.__lastErrorMessage = "; ".join(
                message for message in (self.__lastErrorMessage, str(e)) if message)

    @property
    def lastError(self):
        """Message of the most recent recorded failure, or "" if none."""
        return self.__lastErrorMessage

    def terminateProcess(self):
        """Ask the connected office to terminate.

        Returns:
            False if the terminate call raised (the message is recorded in
            ``lastError``), True otherwise -- including when there is no
            desktop to terminate.
        """
        try:
            if self.desktop:
                self.desktop.terminate()
        except Exception as e:
            self.__lastErrorMessage = str(e)
            return False

        return True

    def convertFile(self, outputFormat, inputFilename):
        """Convert ``inputFilename`` to ``outputFormat`` next to the input file.

        The output path is the input path with its extension replaced by
        ``outputFormat``.  The input extension is matched case-insensitively
        against ``LIBREOFFICE_IMPORT_TYPES``.

        Args:
            outputFormat: Key of ``LIBREOFFICE_EXPORT_TYPES`` (e.g. "pdf").
            inputFilename: Path of the document to convert.

        Returns:
            True when the document was stored.  On any failure the reason is
            recorded in ``lastError``, the office process is asked to
            terminate, and False is returned.
        """
        if self._convert(outputFormat, inputFilename):
            return True

        # Shutting down may itself fail and record an error; keep the
        # conversion failure as the reported reason when there is one.
        conversionError = self.__lastErrorMessage
        self.terminateProcess()
        if conversionError:
            self.__lastErrorMessage = conversionError

        return False

    def _convert(self, outputFormat, inputFilename):
        """Load, export and close one document; return True on success."""
        if not self.desktop:
            # Not connected; lastError already holds the connection failure.
            return False

        baseName, extension = os.path.splitext(inputFilename)
        inputFormat = extension.replace(".", "").lower()
        if inputFormat not in LIBREOFFICE_IMPORT_TYPES:
            self.__lastErrorMessage = "Unsupported input format: %r" % inputFormat
            return False

        outputFilename = "%s.%s" % (baseName, outputFormat)
        inputUrl = uno.systemPathToFileUrl(os.path.abspath(inputFilename))
        outputUrl = uno.systemPathToFileUrl(os.path.abspath(outputFilename))

        inputProperties = {"Hidden": True}
        inputProperties.update(LIBREOFFICE_IMPORT_TYPES[inputFormat])

        try:
            doc = self.desktop.loadComponentFromURL(inputUrl, "_blank", 0, self.propertyTuple(inputProperties))
        except Exception as e:
            self.__lastErrorMessage = str(e)
            return False
        if doc is None:
            self.__lastErrorMessage = "LibreOffice could not load %r" % inputFilename
            return False

        try:
            try:
                # Best effort: not every document type offers refresh().
                doc.refresh()
            except Exception:
                pass

            docFamily = self.getDocumentFamily(doc)
            outputProperties = LIBREOFFICE_EXPORT_TYPES.get(outputFormat, {}).get(docFamily)
            if outputProperties is None:
                self.__lastErrorMessage = "Cannot export a %s document as %r" % (docFamily, outputFormat)
                return False

            doc.storeToURL(outputUrl, self.propertyTuple(outputProperties))
            return True
        except Exception as e:
            self.__lastErrorMessage = str(e)
            return False
        finally:
            # Always release the loaded document, on success and on failure.
            self._closeDocument(doc)

    def _closeDocument(self, doc):
        """Close ``doc``, ignoring errors so they cannot mask the real result."""
        try:
            doc.close(True)
        except Exception:
            pass

    def propertyTuple(self, propDict):
        """Turn a ``{name: value}`` dict into a tuple of UNO PropertyValues."""
        properties = []
        for name, value in propDict.items():
            prop = PropertyValue()
            prop.Name = name
            prop.Value = value
            properties.append(prop)

        return tuple(properties)

    def getDocumentFamily(self, doc):
        """Return the ``LIBREOFFICE_DOC_FAMILIES`` entry matching ``doc``.

        Returns None when ``doc`` is None, supports none of the known
        services, or the service query fails.
        """
        if doc is None:
            return None

        # Order matters: Writer/Web documents also report the generic text
        # service, so the web check has to come before the text check.
        serviceFamilies = (
            ("com.sun.star.text.WebDocument", LIBREOFFICE_DOC_FAMILIES[1]),
            ("com.sun.star.text.GenericTextDocument", LIBREOFFICE_DOC_FAMILIES[0]),
            ("com.sun.star.sheet.SpreadsheetDocument", LIBREOFFICE_DOC_FAMILIES[2]),
            ("com.sun.star.presentation.PresentationDocument", LIBREOFFICE_DOC_FAMILIES[3]),
            ("com.sun.star.drawing.DrawingDocument", LIBREOFFICE_DOC_FAMILIES[4]),
        )

        try:
            for serviceName, family in serviceFamilies:
                if doc.supportsService(serviceName):
                    return family
        except Exception:
            # A component that cannot be queried is treated as unknown.
            pass

        return None

    def runUnoProcess(self):
        """Start a headless ``soffice`` accepting on ``connectionString``.

        A launch failure is recorded in ``lastError`` instead of raising,
        because an office may already be listening on the endpoint.
        """
        # Argument list instead of a shell string: host/port come from the
        # caller and must not be interpreted by a shell.
        command = [
            "soffice",
            "--headless",
            "--norestore",
            "--accept=%s" % self.connectionString,
        ]
        try:
            # Keep the handle so the child is not dropped while still running.
            self._process = subprocess.Popen(command, stdin=None, stdout=None, stderr=None)
        except OSError as e:
            self.__lastErrorMessage = str(e)
            return

        # Give the office time to start listening before connecting.
        time.sleep(3)

if __name__ == "__main__":

    test_libreoffice = PythonLibreOffice()

    try:
        # Convert an MS Word document (docx) to PDF.
        converted = test_libreoffice.convertFile("pdf", "document.docx")
    except Exception:
        # Do not leave the headless office running if the conversion blew up.
        test_libreoffice.terminateProcess()
        raise

    if not converted:
        # convertFile has already asked the office to terminate on failure.
        raise SystemExit("Conversion failed: %s" % test_libreoffice.lastError)

    # On success the office is still running; shut it down before exiting.
    test_libreoffice.terminateProcess()

 

转载于:https://www.cnblogs.com/zl1991/p/10762871.html

### LibreOffice 文件转换方法

对于文件转换的需求,采用 `JODConverter` 结合 `LibreOffice` 是一种高效的方式。通过这种方式能够将多种 Microsoft Office 文档格式转换为 PDF 或其他目标格式,从而利用浏览器的内置 PDF 查看器来展示这些文档的内容[^1]。

#### 支持的文件格式

- **Microsoft Word**: `.doc`, `.docx`
- **Microsoft Excel**: `.xls`, `.xlsx`
- **Microsoft PowerPoint**: `.ppt`, `.pptx`
- **OpenDocument Format (ODF)**: `.odt`, `.ods`, `.odp`

除了上述提到的标准办公软件所使用的文件格式外,还支持一些较少见但是仍然广泛被使用的格式,如 WPS 办公套件产生的文件等[^3]。

#### 使用 JODConverter 进行转换

为了完成从源格式到目标格式(通常是 PDF)之间的转换操作,需要先确保已经正确安装并配置好了 LibreOffice 应用程序[^2]。之后可以通过如下 Java 代码片段调用 JODConverter 来执行具体的转换过程:

```java
import org.jodconverter.local.JodConverter;
import java.io.File;

public class DocumentConversion {
    public static void main(String[] args) {
        File inputFile = new File("/path/to/input.doc");
        File outputFile = new File("/path/to/output.pdf");

        // Convert the document.
        JodConverter.convert(inputFile).to(outputFile);
    }
}
```

这段简单的例子展示了如何读取一个 DOC 格式的输入文件,并将其保存为目标路径下的 PDF 文件。注意:在 JODConverter 4.x 中,需要先启动一个 OfficeManager(例如 `LocalOfficeManager`),并在链式调用的末尾调用 `.execute()`,转换才会真正执行。此外,在实际部署环境中可能还需要考虑更多细节问题,比如错误处理机制以及并发请求的支持等问题。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值