一 问题引发思考
数据量10万行(excel文件6.8M),使用5个线程各执行2万行,部分线程报错Exception in thread "pool-2-thread-4" java.lang.OutOfMemoryError: Java heap space;
引发思考,增大堆内存吗? 增加多少呢?如何分析? 处理增加堆大小,有其他解决办法吗?
2万行数据, 6.8/5 = 1.36M; POI解析会占用多大内存? 怎么查看呢?
jps -v
9030 RunFaceSet
-agentlib:jdwp=transport=dt_socket,address=127.0.0.1:33107,suspend=y,server=n -Dvisualvm.id=275019664781530
-javaagent:/home/chenershuai/soft/idea2018/lib/rt/debugger-agent.jar=file:/tmp/capture31.props -Dfile.encoding=UTF-8
通过jdk自带的 jconsole,运行两种模式,观察下,差别还是很大的.
二 POI解析excel的不同方式和区别
POI提供了2中读取Excel的模式,分别是:
- 用户模式:也就是poi下的usermodel有关包,它对用户友好,有统一的接口在ss包下,但是它是把整个文件读取到
内存中的,对于大量数据很容易内存溢出,所以只能用来处理相对较小量的数据; - 事件模式:在poi下的eventusermodel包下,相对来说
实现比较复杂,但是它处理速度快,占用内存少,可以用来处理海量的Excel数据。
具体原理以后分析:
三 示例分析
代码如下:
4.1 用户模式代码
TestPoi.java
package com.ule.tools;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
* 利用POI的用户驱动模型解析excel
*/
public class TestPoi {
public static void main(String[] args) {
List list = getExcelList("/home/chenershuai/桌面/blin_test.xlsx", 0);
while (true) {
int i = 1;
}
}
public static List getExcelList(String filePath, int sheetNum) {
Workbook wb = null;
Sheet sheet = null;
Row row = null;
List<Map<String, String>> list = null;
String cellData = null;
String columns[] = {"id", "group_id", "user_id", "image_url"};
wb = readExcel(filePath);
if (wb != null) {
//用来存放表中数据
list = new ArrayList<Map<String, String>>();
//获取第一个sheet
sheet = wb.getSheetAt(sheetNum);
//获取最大行数
int rownum = sheet.getPhysicalNumberOfRows();
//获取第一行
row = sheet.getRow(0);
//获取最大列数
int colnum = row.getPhysicalNumberOfCells();
for (int i = 1; i < rownum; i++) {
Map<String, String> map = new LinkedHashMap<String, String>();
row = sheet.getRow(i);
if (row != null) {
for (int j = 0; j < colnum; j++) {
cellData = (String) getCellFormatValue(row.getCell(j));
map.put(columns[j], cellData);
}
} else {
break;
}
list.add(map);
}
}
return list;
}
//读取excel
public static Workbook readExcel(String filePath) {
Workbook wb = null;
if (filePath == null) {
return null;
}
String extString = filePath.substring(filePath.lastIndexOf("."));
InputStream is = null;
try {
is = new FileInputStream(filePath);
if (".xls".equals(extString)) {
return wb = new HSSFWorkbook(is);
} else if (".xlsx".equals(extString)) {
return wb = new XSSFWorkbook(is);
} else {
return wb = null;
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (is != null)
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return wb;
}
public static Object getCellFormatValue(Cell cell) {
Object cellValue = null;
if (cell != null) {
//判断cell类型
switch (cell.getCellType()) {
case Cell.CELL_TYPE_NUMERIC: {
DecimalFormat df = new DecimalFormat("0");
cellValue = df.format(cell.getNumericCellValue());
break;
}
case Cell.CELL_TYPE_FORMULA: {
//判断cell是否为日期格式
if (DateUtil.isCellDateFormatted(cell)) {
//转换为日期格式YYYY-mm-dd
cellValue = cell.getDateCellValue();
} else {
//数字
cellValue = String.valueOf(cell.getNumericCellValue() + "");
}
break;
}
case Cell.CELL_TYPE_STRING: {
cellValue = cell.getRichStringCellValue().getString();
break;
}
default:
cellValue = "";
}
} else {
cellValue = "";
}
return cellValue;
}
}
4.2 事件模式代码
ProcessExcelUtil.java
package com.ule.tools;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.*;
/**
* 利用POI的事件驱动模型解析excel内容,强大。
*/
public class ProcessExcelUtil {
public static void main(String[] args) throws Exception {
List list = getExcelList("/home/chenershuai/桌面/blin_test.xlsx", 1);
while (true) {
int i = 1;
}
}
public static List getExcelList(String filePath, int sheetCount) throws Exception {
Map<Integer, List<Map<String, String>>> dataMap = new HashMap<>();
OPCPackage pkg;
InputStream in = new FileInputStream(filePath);
pkg = OPCPackage.open(in);
XSSFReader xssfReader = new XSSFReader(pkg);
StylesTable styles = xssfReader.getStylesTable();
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(pkg);
Iterator<InputStream> iterators = xssfReader.getSheetsData();
List<Map<String, String>> dataListMap = new ArrayList<>();
SaxSheetContentsHandler sheetHandler = new SaxSheetContentsHandler(dataListMap, 1, 0, -1);
processSheet(styles, strings, iterators.next(), sheetHandler);
dataMap.put(sheetCount, dataListMap);
return dataListMap;
}
/**
* 解析每个sheet页数据
*
* @param styles
* @param strings
* @param sheetInputStream
* @param saxSheetContentsHandler
* @throws Exception
*/
private static void processSheet(StylesTable styles, ReadOnlySharedStringsTable strings, InputStream sheetInputStream, SaxSheetContentsHandler saxSheetContentsHandler)
throws Exception {
try {
XMLReader sheetParser = SAXHelper.newXMLReader();
sheetParser.setContentHandler(new XSSFSheetXMLHandler(styles, strings, saxSheetContentsHandler, false));
sheetParser.parse(new InputSource(sheetInputStream));
} catch (Exception e) {
throw e;
} finally {
if (Objects.nonNull(sheetInputStream)) {
sheetInputStream.close();
}
}
}
}
SaxSheetContentsHandler.java
package com.ule.tools;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.usermodel.XSSFComment;
import shade.org.apache.http.client.utils.CloneUtils;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class SaxSheetContentsHandler implements XSSFSheetXMLHandler.SheetContentsHandler {
//数据单元
private Map<String, String> cellMap = new HashMap<>();
//数据记录,按行记录
private List<Map<String, String>> cellDataListMap;
//解析开始行
private int startRow = 0;
//解析开始列
private int startCol = 0;
//当前列
private int currentCol = 0;
//读取数据行(默认为-1不做限制)
private int limit = -1;
/**
* 构造器
*
* @param cellDataListMap
* @param startRow
* @param startCol
*/
public SaxSheetContentsHandler(List<Map<String, String>> cellDataListMap, int startRow, int startCol, int limit) {
this.cellDataListMap = cellDataListMap;
this.startCol = startCol;
this.startRow = startRow;
this.limit = limit;
}
@Override
public void startRow(int rowNum) {
if (limit == -1) {
if (rowNum < startRow) {
return;
}
} else {
if (rowNum < startRow || rowNum > limit) {
return;
}
}
cellMap.clear();
}
@Override
public void endRow(int rowNum) {
if (limit == -1) {
if (rowNum < startRow) {
return;
}
} else if (rowNum < startRow || rowNum > limit) { //用来判断从第几行开始读取,如果当前行小于开始行则需终止程序。
currentCol = 0;
return;
}
if (!cellMap.isEmpty()) {
Map<String, String> rowMap = null;
try {
rowMap = (Map<String, String>) CloneUtils.clone(cellMap);
cellDataListMap.add(rowMap);
} catch (CloneNotSupportedException e) {
e.printStackTrace();
}
}
currentCol = 0;
}
/**
* @param cellReference 列名 例如A,B,C等
* @param formattedValue 单元格值 对应每个单元格的值, [^A-Za-z]:匹配所有非字母的都用""
* 代替,因为多行Excel的列明是:A1,B1,C1,....D10等,匹配替换后每行的列名就是A,B,C等。
* 此方法用解析每一个cell的值
* @param comment
*/
@Override
public void cell(String cellReference, String formattedValue, XSSFComment comment) {
currentCol = currentCol + 1;
if (currentCol >= startCol) {
cellMap.put(cellReference.replaceAll("[^A-Za-z]", ""), formattedValue);
}
return;
}
@Override
public void headerFooter(String text, boolean isHeader, String tagName) {
System.out.println("text: {" + text + "},isHeader: {" + isHeader + "},tagName : {" + tagName + "}");
}
}
四 彩蛋
(easyexcel解决POI解析Excel出现OOM)[https://blog.youkuaiyun.com/weixin_36174683/article/details/81161712]
POI解析Excel优化
5090





