Export background
A previous post, "How to avoid JVM OOM when querying huge data sets: batched query processing", covered the batched-query approach. In enterprise systems, file export is needed in many places, and engineers who are new to the field often write SQL like this:
# query the whole table
select * from table_name;
# a huge query (often effectively the whole table)
select * from tab_name limit 0, 9999999;
Whenever a production machine OOMs and I trace it back to the code, the culprit is very often an export, and in the export code I keep finding a single query with an enormous number after limit. It nearly makes me cough up blood every time.
Problem analysis
The cause and the fix are essentially the same as in the earlier post, "How to avoid JVM OOM when querying huge data sets: batched query processing", so they are not repeated here.
For file export, I would like newcomers to think through the following questions:
- To export all the data, do we need to load all of it into memory? What problems would that cause?
- If we do not load everything into memory, but still need to export a whole table to Excel, how do we do it?
- When exporting in batches, where is the main performance cost?
- What if downloading the file takes so long that the request times out?
These are export problems you will keep running into in everyday development, and with experience you will form your own answers. Here are mine; compare them with yours and feel free to discuss:
- When machines are well provisioned and the data volume is small, exporting the whole table works, but as the business grows and machine resources hit their limits, it can easily OOM the machine
- Work in batches, writing the file one page at a time
- Batched queries keep re-establishing connections and incur a lot of network IO, much like running a query inside a loop
- For file downloads, turn the synchronous flow into an asynchronous one
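The page-by-page idea behind these answers can be sketched as follows. This is a minimal sketch, not the article's implementation: `fetchPage` and `writeBatch` are hypothetical stand-ins for the real DAO query and Excel writer.

```java
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Consumer;

// Minimal sketch of batched export: fetch one page at a time and hand it to a
// writer, so at most one page of rows is ever in memory. fetchPage(offset, size)
// and writeBatch are hypothetical stand-ins for the real query and Excel code.
public class PagedExportSketch {
    public static <T> int export(int total, int pageSize,
                                 BiFunction<Integer, Integer, List<T>> fetchPage,
                                 Consumer<List<T>> writeBatch) {
        int pages = 0;
        for (int offset = 0; offset < total; offset += pageSize) {
            List<T> page = fetchPage.apply(offset, pageSize);
            if (page.isEmpty()) {
                break;
            }
            writeBatch.accept(page); // write the page out, then let it be garbage-collected
            pages++;
        }
        return pages;
    }
}
```

Each iteration holds only `pageSize` rows, so memory use stays flat no matter how large the table is.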
Code implementation
Based on the analysis above, here is an export implementation built with MyBatis-Plus on top of the approach from "How to avoid JVM OOM when querying huge data sets: batched query processing". The BulkExecutorUtil that appeared in that post is not repeated here.
/**
* @author caoyong
* @version 1.0.0
* @since 2022-02-28 10:24:08
* defines the basic parameters
**/
@Data
@Builder
public class SegmentDownParam<T, V> {
/**
* default size
*/
public static final Integer DEFAULT_SIZE = 2000;
/**
* batch size
*/
@Builder.Default
private Integer batchSize = DEFAULT_SIZE;
/**
* query wrapper
*/
private Wrapper<T> queryWrapper;
/**
* query service
*/
private IService<T> service;
/**
* download show name
*/
@NonNull
private String showName;
/**
* transfer T (the entity mybatis-plus generated) to V (the VO the service returns)
*/
@NonNull
private Function<List<T>, List<V>> transFunc;
/**
* write function
*/
@NonNull
BiConsumer<List<V>, ExcelWriter> writeFunc;
/**
* export VO class
*/
@NonNull
private Class<?> exportClass;
/**
* the class for service query response VO
*/
@NonNull
private Class<V> voClass;
/**
* count: if present, use it; otherwise use service.count()
*/
@Builder.Default
private Integer count = 0;
/**
* query function
*/
private Function<Integer, List<T>> queryFunc;
}
Core implementation:
/**
* @author caoyong
* @version 1.0.0
* @since 2022-02-28 10:05:23
**/
@Slf4j
public class SegmentDownUtil {
/**
* generate download file
*
* @param param param need
* @param <T> type queried from mybatis-plus
* @param <V> type for transferred
* @return download file path
*/
public static <T, V> String generateFile(SegmentDownParam<T, V> param) {
{
//download file path
ExcelWriter excelWriter = null;
String downloadFilePath = AttachmentHelper.tempFilePatch() + Common.SPLIT + param.getShowName() + ".xlsx";
try {
excelWriter = EasyExcel.write(downloadFilePath, param.getExportClass()).build();
BiConsumer<List<V>, ExcelWriter> writeFunc = param.getWriteFunc();
//get total count
IService<T> service = param.getService();
//get available processors
int parallelism = Runtime.getRuntime().availableProcessors() * 2 + 1;
BulkExecutorParam<T> exeParam = BulkExecutorParam.<T>builder()
.queryWrapper(param.getQueryWrapper())
.batchSize(param.getBatchSize())
.count(param.getCount())
.queryFunc(param.getQueryFunc())
.service(service)
.parallelism(parallelism)
.build();
// batch write list json to local temp files
List<Future<File>> futures = BulkExecutorUtil.submit(exeParam, list -> {
File file = new File(AttachmentHelper.tempFilePatch() + IdWorker.getId());
List<V> transferredRows = param.getTransFunc().apply(list);
byte[] bytes = JSONArray.toJSONBytes(transferredRows, SerializerFeature.WriteDateUseDateFormat,
SerializerFeature.WriteBigDecimalAsPlain);
try (FileOutputStream outputStream = new FileOutputStream(file)) {
IOUtils.write(bytes, outputStream);
} catch (IOException e) {
log.error("write temp json file error, path:{}", file.getAbsolutePath(), e);
}
return file;
});
if (CollectionUtil.isEmpty(futures)) {
writeFunc.accept(new ArrayList<>(), excelWriter);
return downloadFilePath;
}
//iterate all file and merge
for (Future<File> future : futures) {
try {
File file = future.get();
try (FileInputStream in = new FileInputStream(file)) {
String dataString = IOUtils.toString(in, StandardCharsets.UTF_8);
List<V> list = JSONArray.parseArray(dataString, param.getVoClass());
writeFunc.accept(list, excelWriter);
} catch (Exception e) {
log.error("merge temp file error, path:{}", file.getAbsolutePath(), e);
}
boolean isDeleted = file.delete();
if (!isDeleted) {
log.warn("delete temp json file error");
}
} catch (Exception e) {
log.error("get future File error:{}", e.getMessage(), e);
}
}
} finally {
if (excelWriter != null) {
excelWriter.finish();
}
}
return downloadFilePath;
}
}
}
Usage example
LambdaQueryWrapper<T> wrapper = Wrappers.lambdaQuery(T.class);
SegmentDownUtil.generateFile(SegmentDownParam
.<T, V>builder()
.showName("display name")
.service(this)
.transFunc(this::fillRows)
.writeFunc(this::writeExcelFile)
//VO class used for the export
.exportClass(ExportVo.class)
.queryWrapper(wrapper)
.voClass(V.class)
.build());
/**
* filled rows and transfer T -> V
*
* @param rows entity rows
* @return transferred rows and filled
*/
private List<V> fillRows(List<T> rows) {
List<V> filledRows = rows.stream().map(value -> {
//conversion logic; in real code V is a concrete VO class (a bare type parameter cannot be instantiated with new)
return new V();
}).collect(Collectors.toList());
return filledRows;
}
/**
* write Excel file
*
* @param originalData origin data
* @param writer excel writer
*/
private void writeExcelFile(List<V> originalData, ExcelWriter writer) {
WriteSheet writeSheet = EasyExcel.writerSheet("export sample").build();
List<ExportVo> exportList = originalData.parallelStream()
.map(vo -> {
ExportVo exportVo = new ExportVo();
BeanUtils.copyProperties(vo, exportVo);
exportVo.setOpTime(LocalDateTimeUtil.format(vo.getOpTime(), DateUtil.YYYY_MM_DD_HH_MM_SS));
return exportVo;
}).collect(Collectors.toList());
writer.write(exportList, writeSheet);
}
Optimizing for deep pagination
What is deep pagination? When a table is large, consider SQL like:
select * from table_name limit 100000000, 10
MySQL performs very poorly on such a statement: it has to read and discard the first 100,000,000 rows before it can return the 10 you asked for. For export scenarios this optimization is well worth making.
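The keyset ("seek") idea used in the example below can be sketched in memory, assuming an id-ordered walk. The list stands in for the table; in real SQL the `id < finalId` predicate hits the primary-key index, so each page costs the same regardless of depth.

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

// Keyset ("seek") pagination: instead of OFFSET, remember the last id of the
// previous page and ask for "id < finalId ORDER BY id DESC LIMIT n".
public class KeysetPagination {
    // One page: the equivalent of "where id < #{finalId} order by id desc limit #{limit}"
    public static List<Long> nextPage(List<Long> ids, long finalId, int limit) {
        return ids.stream()
                .filter(id -> id < finalId)
                .sorted(Comparator.reverseOrder())
                .limit(limit)
                .collect(Collectors.toList());
    }

    // Walk all pages the way the export loop does: seed finalId with a value
    // larger than any id, then advance it to the last row of each page.
    public static List<Long> readAll(List<Long> ids, int limit) {
        List<Long> out = new ArrayList<>();
        long finalId = Long.MAX_VALUE;
        while (true) {
            List<Long> page = nextPage(ids, finalId, limit);
            if (page.isEmpty()) {
                break;
            }
            out.addAll(page);
            finalId = page.get(page.size() - 1);
        }
        return out;
    }
}
```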
Below is an order-export example the author built in an enterprise system; some of the utility classes come from hutool.
Some thread-pool constants we will need
public interface CommonConstant {
/**
* global export-file worker pool
*/
ExecutorService downloadExecutor = Executors.newWorkStealingPool();
/**
* global export-file query pool
*/
ExecutorService downloadQueryExecutor = Executors.newWorkStealingPool(20);
}
Download code
public R<Void> download(BaseOrderQueryVO baseOrderQueryVO) {
    CommonConstant.downloadExecutor.execute(() -> {
        File downloadTempFile = generateDownloadFile(baseOrderQueryVO);
        // record the download; adapt this bookkeeping to your own system
        BaseDownload baseDownload = new BaseDownload();
        baseDownload.setBizType(DownloadBizTypeEnum.ORDER.name());
        baseDownload.setState(DownloadStateEnum.DOING.name());
        baseDownload.setFileName(downloadTempFile.getName());
        baseDownload.setCreateUser(HeadersContextHolder.getUserId());
        baseDownload.setUpdateUser(HeadersContextHolder.getUserId());
        baseDownloadService.save(baseDownload);
        try {
            List<Future<File>> fileFutures = writeTempFile(baseOrderQueryVO);
            doDownload(fileFutures, downloadTempFile);
        } catch (Exception e) {
            String errMsg = ExceptionUtil.stacktraceToString(e, 400);
            baseDownload.setErrMsg(errMsg);
            baseDownload.setState(DownloadStateEnum.FAILED.name());
            // persist the failed state so callers can see it
            baseDownloadService.updateById(baseDownload);
        }
    });
    return R.success();
}
Creating the file
private static File generateDownloadFile(BaseOrderQueryVO baseOrderQueryVO) {
DateTimeFormatter formatter = DatePattern.createFormatter("yyMMdd");
String startOrderDate = LocalDateTimeUtil.format(baseOrderQueryVO.getStartOrderTime(), formatter);
String endOrderDate = LocalDateTimeUtil.format(baseOrderQueryVO.getEndOrderTime(), formatter);
String time = DateTimeUtil.formatPureTime(LocalDateTime.now());
File tempFile = new File(FileUtil.getTmpDir(), StrUtil.concat(true, "order",
        // generate a per-user temp directory if your system needs one
        File.separator, "download", HeadersContextHolder.getUserDir(), File.separator,
        startOrderDate, "-", endOrderDate, "points-orders", time, ".xlsx"));
FileUtil.touch(tempFile);
return tempFile;
}
Core logic for writing the temp files; the comments call out the key points
private List<Future<File>> writeTempFile(BaseOrderQueryVO baseOrderQueryVO) {
baseOrderQueryVO.setSize(downloadSize);
// export does not need a count query
baseOrderQueryVO.setSearchCount(false);
List<Future<File>> fileFutures = Lists.newArrayList();
for (;;) {
    // query through the common paging API; the query must be ordered by id descending
IPage<OrderDTO> orderPage = getOrderListPage(baseOrderQueryVO);
// noinspection all
List<OrderDTO> records = orderPage.getRecords();
records = records == null ? Lists.newArrayList() : records;
if (CollUtil.isEmpty(records)) {
break;
}
// convert the query DTOs into export DTOs
List<OrderDownloadDTO> orderDownloads = records.stream()
.map(OrderDownloadDTO::new)
.collect(Collectors.toList());
// the key to solving deep pagination:
// record the starting id for the next page. Because results are ordered by id descending, the last
// element is the smallest id on this page; the SQL then applies "and id < #{finalId}", so the next
// page excludes every id already seen, and "limit 0, 10" returns exactly the next page's rows
OrderDTO lastOrder = CollUtil.getLast(records);
baseOrderQueryVO.setFinalId(lastOrder == null ? StringPool.ZERO : lastOrder.getId());
// the key to export speed:
// by default 20 shared query threads are used. Since the queries are IO-bound, querying with
// multiple threads and writing the results to temp files speeds the export up considerably.
// Future<File> handles are appended to the list in order, so row ordering is preserved.
// Writing to disk releases memory and avoids OOM
Future<File> fileFuture = CommonConstant.downloadQueryExecutor.submit(() -> {
File file = new File(FileUtil.getTmpDir(), StrUtil.concat(true,
"order", File.separator, "download", HeadersContextHolder.getUserDir(),
File.separator, String.valueOf(IdWorker.getId())) + ".json");
FileUtil.touch(file);
// noinspection all
byte[] bytes = JSONArray.toJSONBytes(orderDownloads, SerializerFeature.WriteDateUseDateFormat,
SerializerFeature.WriteBigDecimalAsPlain);
try (FileOutputStream outputStream = new FileOutputStream(file)) {
IOUtils.write(bytes, outputStream);
} catch (IOException e) {
log.error("write temp json file error, path:{}", file.getAbsolutePath(), e);
}
return file;
});
fileFutures.add(fileFuture);
}
return fileFutures;
}
The export (merge) code
private void doDownload(List<Future<File>> fileFutures, File downloadTempFile) throws Exception {
int index = 0;
// cap each sheet at 1,000,000 rows; Excel's actual hard limit is 1,048,576
int sheetMaxNum = 1000000;
log.info("Write temp file: {}", downloadTempFile);
try (ExcelWriter excelWriter = EasyExcel.write(downloadTempFile, OrderDownloadDTO.class).build()) {
if (CollUtil.isEmpty(fileFutures)) {
// write an empty sheet with just the header
WriteSheet writeSheet = EasyExcel.writerSheet("order_list_0").build();
excelWriter.write(Lists.newArrayList(), writeSheet);
return;
}
for (Future<File> fileFuture : fileFutures) {
File file = fileFuture.get();
try (FileInputStream in = new FileInputStream(file)) {
String dataString = IOUtils.toString(in, StandardCharsets.UTF_8);
// noinspection all
List<OrderDownloadDTO> orderDownloads = JSONArray.parseArray(dataString, OrderDownloadDTO.class);
int sheetNum = index / sheetMaxNum;
WriteSheet writeSheet = EasyExcel.writerSheet(sheetNum, "order_list_" + sheetNum).build();
excelWriter.write(orderDownloads, writeSheet);
index = index + orderDownloads.size();
} catch (Exception e) {
log.error("get future File error:{}", e.getMessage(), e);
}
boolean isDeleted = FileUtil.del(file);
if (!isDeleted) {
log.warn("delete temp json file error");
}
}
} catch (Exception e) {
log.error("Download order list error: {}", e.getMessage(), e);
throw e;
}
}
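The sheet rollover in doDownload is just integer division of the running row index; it can be checked in isolation. `SHEET_MAX` below mirrors the sheetMaxNum constant above.

```java
// Rows 0..999999 land in sheet 0, 1000000..1999999 in sheet 1, and so on,
// keeping every sheet safely under Excel's 1,048,576-row hard limit.
public class SheetPartition {
    static final int SHEET_MAX = 1_000_000;

    public static int sheetFor(int rowIndex) {
        return rowIndex / SHEET_MAX;
    }
}
```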