获取附件内容

最新推荐文章于 2022-05-12 07:00:04 发布
原创最新推荐文章于 2022-05-12 07:00:04 发布 · 334 阅读
1 ·
CC 4.0 BY-SA版权
文章标签：
#数据库 #java
ADF 专栏收录该内容
53 篇文章
订阅专栏
本文介绍了一个用于解析多种格式附件内容的Java工具类，包括Word、Excel、TXT和PDF等常见文档格式。该工具通过调用Apache POI和PDFBox等库实现对不同格式文档的读取，并提取其文本内容。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
package com.caac.utils;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
 * Utility class that extracts the text content of attachment files.
 *
 * <p>The format is chosen from the file extension (case-insensitive): Word
 * ({@code doc}/{@code docx}), Excel ({@code xls}/{@code xlsx}), plain text
 * ({@code txt}) and PDF ({@code pdf}). All methods are static; the public
 * constructor is kept only so existing {@code new FjUtils()} callers still compile.
 */
public class FjUtils {
    /** Charset name used for UTF-8 text and for the rewritten text file. */
    private static final String CHARSET_UTF8 = "UTF-8";
    /** Charset name used when a UTF-16 byte-order mark is found; the decoder reads the BOM itself. */
    private static final String CHARSET_UTF16 = "UTF-16";
    /** Fallback charset for text files that carry no BOM and are not valid UTF-8. */
    private static final String CHARSET_GBK = "GBK";
    /** Separator appended after every Excel cell value. */
    private static final String CELL_SEPARATOR = ",";
    /** Byte-order mark as it appears after decoding; stripped from the first text line. */
    private static final char BOM = '\uFEFF';

    public FjUtils() {
        super();
    }

    /**
     * Reads an attachment and returns its text content.
     *
     * <p>I/O failures are printed to standard error and result in whatever content
     * was obtained so far (normally the empty string). Reading a {@code txt} file
     * that is not UTF-8 also rewrites that file in UTF-8 (see {@link #writeFile}).
     *
     * @param fjPath path of the attachment; {@code null} or empty yields an empty string
     * @return the extracted text, or an empty string for unsupported extensions or failures
     */
    public static String getFjToStr(String fjPath) {
        if (fjPath == null || fjPath.isEmpty()) {
            return "";
        }
        // Everything after the last dot; with no dot this is the whole path and matches nothing.
        String extension = fjPath.substring(fjPath.lastIndexOf('.') + 1);
        String content = "";
        try {
            if ("doc".equalsIgnoreCase(extension) || "docx".equalsIgnoreCase(extension)) {
                content = getDocContent(fjPath, extension);
            } else if ("xls".equalsIgnoreCase(extension) || "xlsx".equalsIgnoreCase(extension)) {
                content = getExcelContent(fjPath);
            } else if ("txt".equalsIgnoreCase(extension)) {
                content = getTxtContent(fjPath);
            } else if ("pdf".equalsIgnoreCase(extension)) {
                content = getPdfContent(fjPath);
            }
        } catch (IOException ioe) {
            // FileNotFoundException is an IOException, so a single handler covers both.
            ioe.printStackTrace();
        }
        return content;
    }

    /**
     * Extracts the text of a Word document.
     *
     * @param fjPath path of the document
     * @param extension file extension, {@code doc} (binary format) or {@code docx} (OOXML)
     * @return the document text, or an empty string for any other extension
     * @throws IOException if the file cannot be opened or read
     */
    private static String getDocContent(String fjPath, String extension) throws IOException {
        // The stream is closed once the text has been extracted, also on failure.
        try (InputStream in = new FileInputStream(new File(fjPath))) {
            if ("doc".equalsIgnoreCase(extension)) {
                WordExtractor extractor = new WordExtractor(in);
                return extractor.getText();
            }
            if ("docx".equalsIgnoreCase(extension)) {
                XWPFDocument document = new XWPFDocument(in);
                POIXMLTextExtractor extractor = new XWPFWordExtractor(document);
                return extractor.getText();
            }
            return "";
        }
    }

    /**
     * Extracts the cell values of the first sheet of an Excel workbook.
     *
     * <p>Rows are emitted top to bottom; every cell value is followed by a comma.
     * Missing and blank rows are skipped. Failures are printed to standard error
     * and the text collected up to that point is returned.
     *
     * @param fjPath path of the workbook
     * @return the concatenated cell values, possibly empty
     */
    private static String getExcelContent(String fjPath) {
        StringBuilder result = new StringBuilder();
        try (InputStream in = new FileInputStream(new File(fjPath))) {
            Workbook workbook = WorkbookFactory.create(in);
            Sheet sheet = workbook.getSheetAt(0); // only the first sheet is read
            int rowCount = sheet.getLastRowNum() + 1;
            int columnCount = getColumnCount(sheet, rowCount);
            for (int i = 0; i < rowCount; i++) {
                Row row = sheet.getRow(i);
                // Checked once per row; a null (undefined) row counts as blank.
                if (isBlankRow(row, columnCount)) {
                    continue;
                }
                for (int j = 0; j < columnCount; j++) {
                    result.append(getCellToStr(row.getCell(j))).append(CELL_SEPARATOR);
                }
            }
        } catch (Exception e) {
            // Broad catch kept from the original: report the failure and return what was collected.
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Determines how many columns are read from each row.
     *
     * <p>The width of the first row is used when that row exists; otherwise the
     * width of the widest defined row is used so that a sheet with an undefined
     * first row is still read.
     *
     * @param sheet the sheet being read
     * @param rowCount number of row indexes to consider
     * @return the number of columns to read per row, never negative for a non-empty row
     */
    private static int getColumnCount(Sheet sheet, int rowCount) {
        Row first = sheet.getRow(0);
        if (first != null) {
            return first.getLastCellNum();
        }
        int widest = 0;
        for (int i = 1; i < rowCount; i++) {
            Row row = sheet.getRow(i);
            if (row != null) {
                widest = Math.max(widest, row.getLastCellNum());
            }
        }
        return widest;
    }

    /**
     * Converts an Excel cell to its string form.
     *
     * @param cell the cell, may be {@code null}
     * @return the numeric, string or boolean value as text; an empty string for a
     *         {@code null} cell and for formula or any other cell type
     */
    private static String getCellToStr(Cell cell) {
        if (cell == null) {
            return "";
        }
        switch (cell.getCellType()) {
        case Cell.CELL_TYPE_NUMERIC:
            return String.valueOf(cell.getNumericCellValue());
        case Cell.CELL_TYPE_STRING:
            return cell.getStringCellValue();
        case Cell.CELL_TYPE_BOOLEAN:
            return String.valueOf(cell.getBooleanCellValue());
        default:
            // Formula cells and all remaining types contribute no text.
            return "";
        }
    }

    /**
     * Tells whether a row contributes no text.
     *
     * @param columnRow the row to inspect, may be {@code null}
     * @param excelLastcell number of leading cells to inspect
     * @return {@code true} if the row is {@code null} or all inspected cells convert to
     *         an empty string
     */
    private static boolean isBlankRow(Row columnRow, int excelLastcell) {
        if (columnRow == null) {
            return true;
        }
        for (int i = 0; i < excelLastcell; i++) {
            if (!getCellToStr(columnRow.getCell(i)).isEmpty()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Extracts the text of a PDF document.
     *
     * <p>Failures are printed to standard error instead of being silently dropped.
     *
     * @param fjPath path of the PDF file
     * @return the extracted text, or an empty string on failure
     */
    private static String getPdfContent(String fjPath) {
        String text = "";
        try (FileInputStream in = new FileInputStream(new File(fjPath))) {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            PDDocument document = parser.getPDDocument();
            try {
                text = new PDFTextStripper().getText(document);
            } finally {
                // Release the document even when text extraction fails.
                document.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return text;
    }

    /**
     * Reads a text file and returns its lines joined by the platform line separator.
     *
     * <p>The file is decoded with the charset reported by {@link #getCharset(String)}.
     * A leading byte-order mark is not part of the returned text. When the file is
     * not UTF-8, the decoded text is written back to the same path in UTF-8.
     *
     * @param fjPath path of the text file
     * @return the file content without a trailing line separator
     * @throws IOException if the file cannot be opened, read or decoded
     */
    private static String getTxtContent(String fjPath) throws IOException {
        String charsetName = getCharset(fjPath);
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(fjPath), charsetName))) {
            String line;
            boolean firstLine = true;
            while ((line = reader.readLine()) != null) {
                if (firstLine) {
                    // The UTF-8 decoder keeps the BOM as a character; drop it from the text.
                    if (!line.isEmpty() && line.charAt(0) == BOM) {
                        line = line.substring(1);
                    }
                    firstLine = false;
                } else {
                    // Separator goes between lines only, so no blank line is prepended.
                    text.append(System.lineSeparator());
                }
                text.append(line);
            }
        }
        String content = text.toString();
        if (!CHARSET_UTF8.equals(charsetName)) {
            writeFile(fjPath, content);
        }
        return content;
    }

    /**
     * Guesses the charset of a text file.
     *
     * <p>A UTF-8 or UTF-16 byte-order mark decides the charset directly. Without a
     * BOM the file is treated as UTF-8 when its bytes decode as UTF-8 without
     * errors, and as GBK otherwise. This keeps a file that was already rewritten
     * as UTF-8 from being re-read as GBK on a later call.
     *
     * @param fileName path of the text file
     * @return {@code "UTF-8"}, {@code "UTF-16"} or {@code "GBK"}
     * @throws IOException if the file cannot be opened or read
     */
    private static String getCharset(String fileName) throws IOException {
        int b0;
        int b1;
        int b2;
        try (InputStream in = new FileInputStream(fileName)) {
            // read() returns -1 at end of file, which matches none of the BOM bytes below.
            b0 = in.read();
            b1 = in.read();
            b2 = in.read();
        }
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return CHARSET_UTF8;
        }
        if ((b0 == 0xFF && b1 == 0xFE) || (b0 == 0xFE && b1 == 0xFF)) {
            return CHARSET_UTF16;
        }
        return isValidUtf8(fileName) ? CHARSET_UTF8 : CHARSET_GBK;
    }

    /**
     * Checks whether the whole file decodes as UTF-8 without malformed input.
     *
     * @param fileName path of the file to check
     * @return {@code true} if every byte sequence is valid UTF-8 (an empty file is valid)
     * @throws IOException if the file cannot be opened or read
     */
    private static boolean isValidUtf8(String fileName) throws IOException {
        CharsetDecoder strict = StandardCharsets.UTF_8.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        try (Reader reader = new InputStreamReader(new FileInputStream(fileName), strict)) {
            char[] buffer = new char[8192];
            while (reader.read(buffer) != -1) {
                // Decode and discard; only the absence of decoding errors matters.
            }
            return true;
        } catch (CharacterCodingException e) {
            return false;
        }
    }

    /**
     * Writes the given text to a file in UTF-8, replacing any existing content.
     *
     * <p>The file is created if it does not exist. Failures are reported on
     * standard output and standard error; no exception reaches the caller.
     *
     * @param filePathAndName path of the file to write, including the file name
     * @param fileContent text to write
     */
    public static void writeFile(String filePathAndName, String fileContent) {
        // FileOutputStream creates the file when it is missing, so no explicit createNewFile().
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(filePathAndName)), CHARSET_UTF8))) {
            writer.write(fileContent);
        } catch (Exception e) {
            System.out.println("写文件内容操作出错");
            e.printStackTrace();
        }
    }
}
注意：pdfbox 和 fontbox 的版本必须保持一致。