package com.towere.common.utils.poi;
import com.towere.common.utils.StringUtils;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
public class DiskBasedExcelReader {
private static final Logger log = LoggerFactory.getLogger(DiskBasedExcelReader.class);
private OPCPackage pkg;
private XSSFReader reader;
private SharedStringsTable sst;
private List<String> headers = new ArrayList<>(); // Initialize headers
private int totalProcessedRows = 0;
private final int chunkSize;
public DiskBasedExcelReader(String filePath, int chunkSize) throws IOException, OpenXML4JException {
if (filePath == null || filePath.isEmpty()) {
throw new IllegalArgumentException("文件路径不能为空");
}
if (chunkSize <= 0) {
throw new IllegalArgumentException("分块大小必须大于0");
}
this.chunkSize = chunkSize;
this.pkg = OPCPackage.open(filePath, PackageAccess.READ);
this.reader = new XSSFReader(pkg);
this.sst = reader.getSharedStringsTable();
}
public List<Map<String, String>> readNextChunk() throws IOException, SAXException, ParserConfigurationException {
List<Map<String, String>> chunkData = new ArrayList<>();
SheetDataHandler sheetDataHandler = new SheetDataHandler();
try (InputStream sheetInputStream = getSheetInputStream()) {
XMLReader parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
parser.setContentHandler(sheetDataHandler);
InputSource sheetSource = new InputSource(sheetInputStream);
parser.parse(sheetSource);
} catch (Exception e) {
log.error("解析Excel文件时出错", e);
throw e;
}
while (chunkData.size() < chunkSize && sheetDataHandler.hasNext()) {
RowData rowData = sheetDataHandler.next();
if (rowData != null) {
Map<String, String> dataRow = parseRowData(rowData);
if (!dataRow.isEmpty()) {
chunkData.add(dataRow);
}
}
}
return chunkData;
}
private InputStream getSheetInputStream() throws IOException {
PackagePart firstSheetPart = getFirstSheetPart();
return firstSheetPart.getInputStream();
}
private PackagePart getFirstSheetPart() throws IOException {
List<PackageRelationship> relationships = pkg.getRelationshipsByType(XSSFReader.SHEET_RELATIONSHIP_TYPE);
if (relationships == null || relationships.isEmpty()) {
throw new IOException("Excel文件中没有工作表");
}
PackageRelationship rel = relationships.get(0);
String sheetPartName = rel.getTargetURI().toString();
return pkg.getPart(PackagePart.create(sheetPartName));
}
private Map<String, String> parseRowData(RowData rowData) {
Map<String, String> rowDataMap = new LinkedHashMap<>();
boolean hasRequiredData = false;
for (int i = 0; i < headers.size(); i++) {
String header = headers.get(i);
if (header != null && !header.isEmpty()) { // Check if header is valid
String cellValue = rowData.getCell(i);
rowDataMap.put(header, cellValue);
if (StringUtils.isNotBlank(cellValue)) {
hasRequiredData = true;
}
}
}
return hasRequiredData ? rowDataMap : new LinkedHashMap<>();
}
private class RowData {
private List<String> cells;
public RowData(List<String> cells) {
this.cells = cells;
}
public String getCell(int index) {
return index < cells.size() ? cells.get(index) : "";
}
}
private class SheetDataHandler extends DefaultHandler {
private List<RowData> rows = new ArrayList<>();
private List<String> currentRow = new ArrayList<>();
private boolean isCell = false;
private boolean isRow = false;
private boolean isSharedString = false;
private StringBuilder cellValue = new StringBuilder();
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) {
if ("row".equals(qName)) {
isRow = true;
currentRow = new ArrayList<>();
} else if ("c".equals(qName)) {
isCell = true;
cellValue.setLength(0);
} else if ("is".equals(qName)) {
isSharedString = true;
}
}
@Override
public void endElement(String uri, String localName, String qName) {
if ("row".equals(qName)) {
isRow = false;
rows.add(new RowData(currentRow));
} else if ("c".equals(qName)) {
isCell = false;
if (isSharedString) {
try {
int idx = Integer.parseInt(cellValue.toString());
String sharedString = String.valueOf(sst.getEntryAt(idx));
currentRow.add(sharedString);
} catch (NumberFormatException e) {
log.warn("无法解析共享字符串索引: {}", cellValue.toString());
currentRow.add(cellValue.toString());
}
} else {
currentRow.add(cellValue.toString());
}
} else if ("is".equals(qName)) {
isSharedString = false;
}
}
@Override
public void characters(char[] ch, int start, int length) {
if (isCell) {
cellValue.append(new String(ch, start, length).trim());
}
}
public boolean hasNext() {
return !rows.isEmpty();
}
public RowData next() {
if (!hasNext()) {
return null;
}
return rows.remove(0);
}
}
public void close() {
if (pkg != null) {
try {
pkg.close();
} catch (IOException e) {
log.error("关闭OPCPackage时出错", e);
}
}
}
public static void main(String[] args) {
try {
DiskBasedExcelReader reader = new DiskBasedExcelReader("path/to/your/excel.xlsx", 100);
List<Map<String, String>> chunkData;
while ((chunkData = reader.readNextChunk()).size() > 0) {
for (Map<String, String> row : chunkData) {
System.out.println(row);
}
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
// 假设ExcelConstants类如下:
class ExcelConstants {
static final List<String> REQUIRED_HEADERS = List.of("Header1", "Header2", "Header3"); // 示例
}
修改直接再磁盘读取,零内存加载
最新发布