001 POI解析excel的两种方式

一 问题引发思考

数据量10万行(excel文件6.8M),使用5个线程各执行2万行,部分线程报错Exception in thread "pool-2-thread-4" java.lang.OutOfMemoryError: Java heap space;

引发思考,增大堆内存吗? 增加多少呢?如何分析? 处理增加堆大小,有其他解决办法吗?

2万行数据, 6.8/5 = 1.36M; POI解析会占用多大内存? 怎么查看呢?

jps -v

9030 RunFaceSet 
-agentlib:jdwp=transport=dt_socket,address=127.0.0.1:33107,suspend=y,server=n -Dvisualvm.id=275019664781530 
-javaagent:/home/chenershuai/soft/idea2018/lib/rt/debugger-agent.jar=file:/tmp/capture31.props -Dfile.encoding=UTF-8

通过jdk自带的 jconsole,运行两种模式,观察下,差别还是很大的.

二 POI解析excel的不同方式和区别

POI提供了2中读取Excel的模式,分别是:

  • 用户模式:也就是poi下的usermodel有关包,它对用户友好,有统一的接口在ss包下,但是它是把整个文件读取到内存中的,对于大量数据很容易内存溢出,所以只能用来处理相对较小量的数据
  • 事件模式:在poi下的eventusermodel包下,相对来说实现比较复杂,但是它处理速度快占用内存少,可以用来处理海量的Excel数据

具体原理以后分析:

三 示例分析

代码如下:

4.1 用户模式代码

TestPoi.java

package com.ule.tools;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * 利用POI的用户驱动模型解析excel
 */
public class TestPoi {

    public static void main(String[] args) {
        List list = getExcelList("/home/chenershuai/桌面/blin_test.xlsx", 0);

        while (true) {
            int i = 1;
        }
    }

    public static List getExcelList(String filePath, int sheetNum) {
        Workbook wb = null;
        Sheet sheet = null;
        Row row = null;
        List<Map<String, String>> list = null;
        String cellData = null;
        String columns[] = {"id", "group_id", "user_id", "image_url"};
        wb = readExcel(filePath);
        if (wb != null) {
            //用来存放表中数据
            list = new ArrayList<Map<String, String>>();
            //获取第一个sheet
            sheet = wb.getSheetAt(sheetNum);
            //获取最大行数
            int rownum = sheet.getPhysicalNumberOfRows();
            //获取第一行
            row = sheet.getRow(0);
            //获取最大列数
            int colnum = row.getPhysicalNumberOfCells();
            for (int i = 1; i < rownum; i++) {
                Map<String, String> map = new LinkedHashMap<String, String>();
                row = sheet.getRow(i);
                if (row != null) {
                    for (int j = 0; j < colnum; j++) {
                        cellData = (String) getCellFormatValue(row.getCell(j));
                        map.put(columns[j], cellData);
                    }
                } else {
                    break;
                }
                list.add(map);
            }
        }
        return list;
    }

    //读取excel
    public static Workbook readExcel(String filePath) {
        Workbook wb = null;
        if (filePath == null) {
            return null;
        }
        String extString = filePath.substring(filePath.lastIndexOf("."));
        InputStream is = null;
        try {
            is = new FileInputStream(filePath);
            if (".xls".equals(extString)) {
                return wb = new HSSFWorkbook(is);
            } else if (".xlsx".equals(extString)) {
                return wb = new XSSFWorkbook(is);
            } else {
                return wb = null;
            }

        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (is != null)
                    is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return wb;
    }

    public static Object getCellFormatValue(Cell cell) {
        Object cellValue = null;
        if (cell != null) {
            //判断cell类型
            switch (cell.getCellType()) {
                case Cell.CELL_TYPE_NUMERIC: {
                    DecimalFormat df = new DecimalFormat("0");
                    cellValue = df.format(cell.getNumericCellValue());
                    break;
                }
                case Cell.CELL_TYPE_FORMULA: {
                    //判断cell是否为日期格式
                    if (DateUtil.isCellDateFormatted(cell)) {
                        //转换为日期格式YYYY-mm-dd
                        cellValue = cell.getDateCellValue();
                    } else {
                        //数字
                        cellValue = String.valueOf(cell.getNumericCellValue() + "");
                    }
                    break;
                }
                case Cell.CELL_TYPE_STRING: {
                    cellValue = cell.getRichStringCellValue().getString();
                    break;
                }
                default:
                    cellValue = "";
            }
        } else {
            cellValue = "";
        }
        return cellValue;
    }

}

4.2 事件模式代码

ProcessExcelUtil.java

package com.ule.tools;

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.*;

/**
 * 利用POI的事件驱动模型解析excel内容,强大。
 */
public class ProcessExcelUtil {
    public static void main(String[] args) throws Exception {
       List list = getExcelList("/home/chenershuai/桌面/blin_test.xlsx", 1);
       
       while (true) {
           int i = 1;
       } 
    }

    public static List getExcelList(String filePath, int sheetCount) throws Exception {
        Map<Integer, List<Map<String, String>>> dataMap = new HashMap<>();
        OPCPackage pkg;
        InputStream in = new FileInputStream(filePath);
        pkg = OPCPackage.open(in);
        XSSFReader xssfReader = new XSSFReader(pkg);
        StylesTable styles = xssfReader.getStylesTable();
        ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(pkg);
        Iterator<InputStream> iterators = xssfReader.getSheetsData();

        List<Map<String, String>> dataListMap = new ArrayList<>();
        SaxSheetContentsHandler sheetHandler = new SaxSheetContentsHandler(dataListMap, 1, 0, -1);
        processSheet(styles, strings, iterators.next(), sheetHandler);
        dataMap.put(sheetCount, dataListMap);

        return dataListMap;
    }
    
    /**
         * 解析每个sheet页数据
         *
         * @param styles
         * @param strings
         * @param sheetInputStream
         * @param saxSheetContentsHandler
         * @throws Exception
         */
        private static void processSheet(StylesTable styles, ReadOnlySharedStringsTable strings, InputStream sheetInputStream, SaxSheetContentsHandler saxSheetContentsHandler)
                throws Exception {
            try {
                XMLReader sheetParser = SAXHelper.newXMLReader();
                sheetParser.setContentHandler(new XSSFSheetXMLHandler(styles, strings, saxSheetContentsHandler, false));
                sheetParser.parse(new InputSource(sheetInputStream));
            } catch (Exception e) {
                throw e;
            } finally {
                if (Objects.nonNull(sheetInputStream)) {
                    sheetInputStream.close();
                }
            }
        }
}

SaxSheetContentsHandler.java

package com.ule.tools;

import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.usermodel.XSSFComment;
import shade.org.apache.http.client.utils.CloneUtils;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SaxSheetContentsHandler implements XSSFSheetXMLHandler.SheetContentsHandler {
    //数据单元
    private Map<String, String> cellMap = new HashMap<>();
    //数据记录,按行记录
    private List<Map<String, String>> cellDataListMap;
    //解析开始行
    private int startRow = 0;
    //解析开始列
    private int startCol = 0;
    //当前列
    private int currentCol = 0;
    //读取数据行(默认为-1不做限制)
    private int limit = -1;

    /**
     * 构造器
     *
     * @param cellDataListMap
     * @param startRow
     * @param startCol
     */
    public SaxSheetContentsHandler(List<Map<String, String>> cellDataListMap, int startRow, int startCol, int limit) {
        this.cellDataListMap = cellDataListMap;
        this.startCol = startCol;
        this.startRow = startRow;
        this.limit = limit;
    }

    @Override
    public void startRow(int rowNum) {
        if (limit == -1) {
            if (rowNum < startRow) {
                return;
            }
        } else {
            if (rowNum < startRow || rowNum > limit) {
                return;
            }
        }
        cellMap.clear();
    }

    @Override
    public void endRow(int rowNum) {
        if (limit == -1) {
            if (rowNum < startRow) {

                return;
            }
        } else if (rowNum < startRow || rowNum > limit) {  //用来判断从第几行开始读取,如果当前行小于开始行则需终止程序。
            currentCol = 0;
            return;
        }
        if (!cellMap.isEmpty()) {
            Map<String, String> rowMap = null;
            try {
                rowMap = (Map<String, String>) CloneUtils.clone(cellMap);
                cellDataListMap.add(rowMap);
            } catch (CloneNotSupportedException e) {
                e.printStackTrace();
            }
        }
        currentCol = 0;
    }

    /**
     * @param cellReference  列名   例如A,B,C等
     * @param formattedValue 单元格值 对应每个单元格的值, [^A-Za-z]:匹配所有非字母的都用""
     *                       代替,因为多行Excel的列明是:A1,B1,C1,....D10等,匹配替换后每行的列名就是A,B,C等。
     *                       此方法用解析每一个cell的值
     * @param comment
     */
    @Override
    public void cell(String cellReference, String formattedValue, XSSFComment comment) {
        currentCol = currentCol + 1;
        if (currentCol >= startCol) {
            cellMap.put(cellReference.replaceAll("[^A-Za-z]", ""), formattedValue);
        }
        return;
    }

    @Override
    public void headerFooter(String text, boolean isHeader, String tagName) {
        System.out.println("text: {" + text + "},isHeader: {" + isHeader + "},tagName : {" + tagName + "}");
    }
}

四 彩蛋

(easyexcel解决POI解析Excel出现OOM)[https://blog.youkuaiyun.com/weixin_36174683/article/details/81161712]

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值