基于vllm大模型引擎的毕业论文语法检查程序

原创

已于 2025-04-08 22:11:08 修改 · 369 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python #开发语言

于 2025-04-06 19:42:19 首次发布

程序直接读取 Word（.docx）文档并对其中的文本进行检查。

公式等非文本内容无法识别。

import json
import re
import time
from typing import List, Optional, Tuple

import tiktoken
import torch
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from transformers import AutoTokenizer

# import torch._inductor.lowering
# torch._inductor.lowering.should_fallback_max_pool2d_with_indices

from vllm import LLM, SamplingParams

class WordContentExtractor:
    """
    Word文档内容提取器，精确提取指定章节内容
    
    功能：
    1. 根据完整标题匹配提取指定章节
    2. 完全保留标题原始格式（包括空格）
    3. 返回两个合并后的列表：
       - final_result_paragraph: 按段落合并后的内容列表
       - final_result_sentence: 按句子合并后的内容列表
    
    使用示例：
    >>> extractor = WordContentExtractor("example.docx")
    >>> paragraphs, sentences = extractor.extract_contents()
    """
    
    def __init__(
        self,
        file_path: str,
        target_titles: Optional[List[str]] = None,
        exclude: Optional[List[str]] = None,
    ):
        """
        Load a Word document and record which section titles to look for.

        Args:
            file_path: Path of the Word document to open.
            target_titles: Exact title texts that start a section to
                extract. They are compared against each paragraph's
                stripped text, so inner spacing is significant. Defaults
                to the built-in thesis chapter list below.
            exclude: Exact paragraph texts that ``extract_contents``
                checks against ``self.exclude``. Defaults to the built-in
                back-matter headings below.

        Raises:
            ValueError: If the document cannot be loaded; the underlying
                error is chained as ``__cause__``.
        """
        self.file_path = file_path
        try:
            self.doc = Document(file_path)
        except Exception as e:
            # Keep the replacement field on a single line: an f-string
            # field split across lines is a SyntaxError before Python 3.12.
            raise ValueError(f"无法加载Word文档: {e}") from e

        if target_titles is None:
            # Full section titles to extract (original spacing preserved).
            target_titles = [
                '摘    要',
                'Abstract',
                '1  绪论',
                '2  相关理论技术与评估指标',
                '3  面向药物相互作用中复杂关系推理的逻辑查询模型',
                '4  面向人类基因合成致死预测的可解释多跳推理模型',
                '5  面向药物相互作用预测的模糊逻辑查询模型',
                '结    论',
                '致    谢',
            ]
        if exclude is None:
            exclude = [
                '参 考 文 献',
                '作者简历及攻读硕士学位期间的科研成果',
                '大连海事大学学位论文授权使用声明',
            ]

        # Copy so later changes to the caller's lists do not leak in here.
        self.target_titles = list(target_titles)
        self.exclude = list(exclude)

    def is_target_title(self, text: str) -> bool:
        """
        精确判断段落是否为目标章节标题
        
        参数：
        text -- 段落文本
        
        返回：
        如果是目标章节标题返回True，否则False
        """
        return text in self.target_titles
    
    def extract_contents(self) -> Tuple[List[str], List[str]]:
        """
        精确提取指定章节内容
        
        返回：
        元组，包含两个列表：
        - final_result_paragraph: 按段落合并后的内容列表
        - final_result_sentence: 按句子合并后的内容列表
        """
        final_result_paragraph = []
        final_result_sentence = []
        
        current_section = None
        current_section_paragraphs = []

        exclude = 0
        for i, paragraph in enumerate(self.doc.paragraphs):
            
            text = paragraph.text.strip()
            if text in self.exclude:
                exclude = 1
                continue
            
            # 检查是否是目标章节标题
            if self.is_target_title(text):
                exclude = 0
                # 处理之前章节的内容
                if current_section is not None and current_section_paragraphs:
                    # 添加到段落结果
                    final_result_paragraph.extend(current_section_paragraphs)
                    
                    # 处理为句子
                    section_text = ' '.join(current_section_paragraphs)
                    sentences = re.split(r'(?<=[。！？!?])', section_text)
                    sentences = [s.strip() for s in sentences if s.strip()]
                    final_result_sentence.extend(sentences)
                
                # 开始新章节
                current_section = text
                current_section_paragraphs = []
                continue
                
    
            # 如果是当前章节的正文内容
            if current_section is not