MinerU Docker镜像
打包了英伟达GPU、CPU、DCU三个版本的镜像,并封装了FastAPI服务供调用。
镜像打包完成后可在离线环境下使用。
英伟达GPU版本
Dockerfile
# MinerU (magic-pdf) image for NVIDIA GPUs; installs the cu118 builds of
# PyTorch and PaddlePaddle.
FROM python:3.10-bookworm
LABEL authors="chunf"

# Swap the stock apt source definition for the sources.list shipped in the
# build context (a Tsinghua University Debian 12 mirror).
RUN rm -f /etc/apt/sources.list.d/debian.sources
COPY ./sources.list /etc/apt/sources.list

# Install LibreOffice with its Chinese localisation, CJK fonts, and libGL
# (needed by OpenCV and similar libraries).
# - `update` and `install` share one layer so a cached, stale package index
#   can never be reused by a later install step.
# - DEBIAN_FRONTEND is set inline so debconf cannot prompt during the build
#   and the variable does not leak into the runtime environment.
# - The apt lists are removed in the same layer to keep them out of the image.
# - NOTE(review): libgl1-mesa-glx is a transitional package that Debian 12
#   no longer appears to ship — confirm against the mirror. libgl1 provides
#   libGL.so.1 directly and is used instead.
# - Recommended packages are intentionally still installed; LibreOffice
#   behaviour without them has not been verified.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        fonts-noto-cjk \
        fonts-wqy-zenhei \
        libgl1 \
        libreoffice \
        libreoffice-l10n-zh-cn && \
    rm -rf /var/lib/apt/lists/*

# Fonts.zip contains all font files from the author's Windows machine.
# `unzip -n` never overwrites fonts that are already present.
COPY ./Fonts.zip /tmp/Fonts.zip
RUN unzip -n -d /usr/share/fonts /tmp/Fonts.zip && fc-cache -fv && rm -f /tmp/Fonts.zip

# Python packages. The Aliyun mirror becomes the default index; individual
# installs below override it where a dedicated wheel index is required.
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple
RUN pip install --no-cache-dir torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
RUN pip install --no-cache-dir -U "magic-pdf[full]" -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --extra-index-url https://wheels.myhloli.com && \
    pip install --no-cache-dir fastapi uvicorn python-multipart xlrd modelscope
# Replace any installed paddlepaddle package with the GPU build.
RUN pip uninstall paddlepaddle -y && \
    pip install --no-cache-dir paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/

# Configuration and model files used during recognition; download them
# yourself and place them in the build context before building.
COPY ./magic-pdf-cuda.json /root/magic-pdf.json
COPY ./modelscope /root/.cache/modelscope
COPY ./.paddleocr /root/.paddleocr
- 模型和相关配置文件也可以使用挂载的方式,不必打包到镜像。
docker-compose
# Compose definition for the GPU image: runs the FastAPI entry point from the
# bind-mounted ./app directory and reserves one NVIDIA GPU for the container.
version: "3"
services:
  mineru:
    image: hcf/mineru-gpu
    container_name: mineru-gpu
    command:
      - python
      - /app/main.py
    volumes:
      - ./app:/app
      - ./output:/output
      # Overrides the magic-pdf.json baked into the image.
      - ./magic-pdf-cuda.json:/root/magic-pdf.json
    environment:
      - MAGIC_FILE_OUTPUT_DIR=/output
    ports:
      - "8000:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [ gpu ]
- app目录为FastAPI服务源代码。
CPU版本
Dockerfile
# MinerU (magic-pdf) image for CPU-only hosts; installs the CPU builds of
# PyTorch.
FROM python:3.10-bookworm
LABEL authors="chunf"

# Swap the stock apt source definition for the sources.list shipped in the
# build context.
RUN rm -f /etc/apt/sources.list.d/debian.sources
COPY ./sources.list /etc/apt/sources.list

# Install LibreOffice with its Chinese localisation, CJK fonts, and libGL.
# - `update` and `install` share one layer so a cached, stale package index
#   can never be reused by a later install step.
# - DEBIAN_FRONTEND is set inline so debconf cannot prompt during the build
#   and the variable does not leak into the runtime environment.
# - The apt lists are removed in the same layer to keep them out of the image.
# - NOTE(review): libgl1-mesa-glx is a transitional package that Debian 12
#   no longer appears to ship — confirm against the mirror. libgl1 provides
#   libGL.so.1 directly and is used instead.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        fonts-noto-cjk \
        fonts-wqy-zenhei \
        libgl1 \
        libreoffice \
        libreoffice-l10n-zh-cn && \
    rm -rf /var/lib/apt/lists/*

# Extra fonts from the build context; `unzip -n` never overwrites fonts that
# are already present.
COPY ./Fonts.zip /tmp/Fonts.zip
RUN unzip -n -d /usr/share/fonts /tmp/Fonts.zip && fc-cache -fv && rm -f /tmp/Fonts.zip

# Python packages. The Aliyun mirror becomes the default index; the PyTorch
# install overrides it to fetch the CPU wheels.
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple
RUN pip install --no-cache-dir torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
RUN pip install --no-cache-dir fastapi uvicorn python-multipart xlrd modelscope

# Configuration and model files; they must be present in the build context.
COPY ./magic-pdf-cpu.json /root/magic-pdf.json
COPY ./modelscope /root/.cache/modelscope
COPY ./.paddleocr /root/.paddleocr
docker-compose
# Compose definition for the CPU image: runs the FastAPI entry point from the
# bind-mounted ./app directory and publishes it on port 8000.
version: "3"
services:
  mineru:
    image: hcf/mineru
    container_name: mineru-cpu
    command:
      - python
      - /app/main.py
    volumes:
      - ./app:/app
      - ./output:/output
      # Overrides the magic-pdf.json baked into the image.
      - ./magic-pdf-cpu.json:/root/magic-pdf.json
    environment:
      - MAGIC_FILE_OUTPUT_DIR=/output
    ports:
      - "8000:8000"
DCU版本
# MinerU (magic-pdf) image for DCU accelerators, built on the vendor's
# PyTorch 2.3.0 / Ubuntu 22.04 / DTK 24.04.3 base image.
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-ubuntu22.04-dtk24.04.3-py3.10
LABEL authors="HuangChunfeng"

# Install LibreOffice with its Chinese localisation, CJK fonts, and libGL.
# - `update` and `install` share one layer so a cached, stale package index
#   can never be reused by a later install step.
# - DEBIAN_FRONTEND is set inline so debconf (e.g. tzdata) cannot prompt and
#   stall the build; it does not leak into the runtime environment.
# - libgl1 provides libGL.so.1; it replaces the libgl1-mesa-glx package name.
# - unzip is installed explicitly because the font step below needs it.
#   NOTE(review): the base image may already ship it — harmless if so.
# - The apt lists are removed in the same layer to keep them out of the image.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        fonts-noto-cjk \
        fonts-wqy-zenhei \
        libgl1 \
        libreoffice \
        libreoffice-l10n-zh-cn \
        unzip && \
    rm -rf /var/lib/apt/lists/*

# Add extra fonts; `unzip -n` never overwrites fonts that are already present.
COPY ./Fonts.zip /tmp/Fonts.zip
RUN unzip -n -d /usr/share/fonts /tmp/Fonts.zip && fc-cache -f -v && rm -rf /tmp/Fonts.zip

RUN pip install --no-cache-dir -U "magic-pdf[full]" --index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --extra-index-url https://wheels.myhloli.com
RUN pip install --no-cache-dir fastapi uvicorn python-multipart modelscope xlrd

# Replace any installed paddlepaddle package with the DTK-specific wheel
# supplied in the build context.
RUN pip uninstall paddlepaddle -y
COPY ./paddlepaddle-2.6.1+das.opt1.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl /tmp/paddlepaddle-2.6.1+das.opt1.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl
RUN pip install --no-cache-dir /tmp/paddlepaddle-2.6.1+das.opt1.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl && rm -f /tmp/paddlepaddle-2.6.1+das.opt1.dtk24043-cp310-cp310-manylinux_2_28_x86_64.whl

# Configuration and model files; they must be present in the build context.
# The DCU image reuses the CUDA configuration file.
COPY ./magic-pdf-cuda.json /root/magic-pdf.json
COPY ./modelscope /root/.cache/modelscope
COPY ./.paddleocr /root/.paddleocr
GPU版本离线测试
已在 Windows 的 Docker Desktop 上测试,运行正常。