import camelot
import pdfplumber
import json
import os
import traceback
import ctypes
import sys
import platform
import logging
# Configure logging: set the root logger to INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Pre-flight check for the libGL shared library.
def check_lib_dependencies():
    """Verify that ``libGL.so.1`` can be loaded and explain how to fix it if not.

    Raises:
        RuntimeError: On Linux, when the library cannot be loaded. The message
            combines the loader error with an install hint.
        OSError: On any other platform, the original loader error is re-raised
            after a warning has been logged.
    """
    try:
        # Attempt to load the GL library by its Linux soname.
        ctypes.CDLL("libGL.so.1")
        logger.info("libGL.so.1 已找到")
    except OSError as load_err:
        on_linux = platform.system() == "Linux"
        if not on_linux:
            # NOTE(review): "libGL.so.1" is a Linux-style library name, so this
            # path is presumably taken on every non-Linux host - confirm.
            logger.warning(f"非Linux系统依赖问题: {str(load_err)}")
            raise

        # Linux: log the problem and the fix, then raise one explicit error.
        error_msg = f"警告: 缺少libGL.so.1库,PDF处理需要此库: {str(load_err)}"
        solution = "解决方案: sudo apt-get update && sudo apt-get install libgl1-mesa-glx"
        for line in (error_msg, solution):
            logger.error(line)
        raise RuntimeError("\n".join((error_msg, solution)))
# Run the dependency check once at import time. A failure is only logged, so
# the module still loads and handler() stays callable.
# NOTE: camelot and pdfplumber are imported at the top of this file, so this
# check actually runs after those imports, not before them.
try:
    check_lib_dependencies()
except Exception as dep_err:
    logger.critical(f"关键依赖检查失败: {str(dep_err)}")
def handler(event, context=None):
    """Plugin entry point: validate the request, run the task, build the reply.

    Args:
        event: Mapping of input parameters. ``file_url_id``, ``task_type`` and
            ``question`` are required; ``pages``, ``language`` and
            ``target_types`` are optional.
        context: Runtime context (optional; not used by this function).

    Returns:
        dict: ``{"status": "success", "result": ..., "qa_answer": ...}`` on
        success. On failure ``{"status": "error", "error": ...}``, optionally
        with a ``solution`` hint (dependency problems) or a ``traceback``
        string (unexpected exceptions). This function does not raise.
    """
    libgl_solution = "请安装libgl1-mesa-glx: sudo apt-get update && sudo apt-get install libgl1-mesa-glx"

    try:
        # Validate required parameters, reporting the first one that is missing.
        for param in ('file_url_id', 'task_type', 'question'):
            if param not in event:
                error_msg = f"缺少必要参数: {param}"
                logger.error(error_msg)
                return {"status": "error", "error": error_msg}

        # Unpack parameters.
        file_url_id = event['file_url_id']
        task_type = event['task_type']
        question = event['question']
        pages = event.get('pages', [])
        language = event.get('language', 'zh-CN')

        # The input file must exist before any task is attempted.
        if not os.path.exists(file_url_id):
            error_msg = f"文件不存在: {file_url_id}"
            logger.error(error_msg)
            return {"status": "error", "error": error_msg}

        # Dispatch table: task name -> zero-argument callable. Lambdas keep
        # argument evaluation lazy, so only the selected task touches `event`.
        task_runners = {
            'parse_overview': lambda: parse_overview(file_url_id),
            'parse_text': lambda: parse_text(file_url_id, pages),
            'parse_tables': lambda: parse_tables(file_url_id, pages),
            'summarize_drawing': lambda: summarize_drawing(file_url_id, language),
            'extract_rebar': lambda: extract_rebar(file_url_id),
            'extract_components': lambda: extract_components(
                file_url_id, event.get('target_types', [])
            ),
        }
        # Non-string task types (possibly unhashable) are simply unsupported.
        runner = task_runners.get(task_type) if isinstance(task_type, str) else None

        logger.info(f"开始处理任务: {task_type} 文件: {file_url_id}")
        if runner is None:
            error_msg = f"不支持的任务类型: {task_type}"
            logger.error(error_msg)
            return {"status": "error", "error": error_msg}

        try:
            result = runner()
        except RuntimeError as e:
            # Any RuntimeError from a task is reported as a dependency problem
            # together with the libGL install hint (existing behavior, kept).
            logger.error(f"运行时依赖错误: {str(e)}")
            return {
                "status": "error",
                "error": str(e),
                "solution": libgl_solution,
            }
        except Exception as e:
            logger.error(f"处理任务时出错: {str(e)}", exc_info=True)
            # Only treat the failure as a missing graphics library when the
            # message really names libGL. A bare "GL" substring also matches
            # unrelated text (e.g. a file path embedded in the error message)
            # and would hide the real cause behind a misleading install hint.
            if "libGL" in str(e):
                return {
                    "status": "error",
                    "error": "系统缺少必要的图形处理库",
                    "solution": libgl_solution,
                }
            raise

        # Build the success response.
        logger.info(f"任务 {task_type} 处理成功")
        return {
            "status": "success",
            "result": result,
            "qa_answer": generate_answer(question, result),
        }
    except Exception as e:
        # Last-resort boundary: return the error with a stack trace for debugging.
        logger.critical(f"处理请求时发生未捕获异常: {str(e)}", exc_info=True)
        return {
            "status": "error",
            "error": str(e),
            "traceback": traceback.format_exc(),
        }
# Helper: build the answer text for a question.
def generate_answer(question, result, max_len=200):
    """Return a short answer string that embeds a preview of *result*.

    Args:
        question: The user's question, echoed back in the answer.
        result: Task result; its ``str()`` form is used as the preview.
        max_len: Maximum number of preview characters to keep (default 200).

    Returns:
        str: The answer text. An ellipsis is appended only when the preview
        was actually truncated, so short results are not shown as cut off.
    """
    preview = str(result)
    if len(preview) > max_len:
        preview = preview[:max_len] + "..."
    return f"针对问题「{question}」的处理结果:{preview}"
# Parse document overview information.
def parse_overview(file_url_id):
    """Open the PDF and return basic facts about it.

    Args:
        file_url_id: Path of the PDF file to open.

    Returns:
        dict: Document id, page count, size of the first page (in pt), whether
        the first page yielded any text, plus placeholder values for
        ``has_tables``, ``has_vectors`` and ``probable_types``.

    Raises:
        Exception: Any failure is logged and re-raised as a generic
            ``Exception`` whose message is prefixed with "解析概览失败".
    """
    try:
        logger.info(f"解析文件概览: {file_url_id}")

        # Set OpenCV-related environment flags; the original author intended
        # this to avoid GUI dependencies - TODO confirm it has that effect.
        for env_key in ("OPENCV_IO_ENABLE_OPENEXR", "CV_IO_ENABLE_OPENEXR"):
            os.environ[env_key] = "1"

        with pdfplumber.open(file_url_id) as pdf:
            pages_count = len(pdf.pages)
            first_page = pdf.pages[0] if pages_count > 0 else None

            first_page_text = ""
            page_sizes = []
            if first_page:
                # Text extraction is best-effort: a failure only logs a warning.
                try:
                    first_page_text = first_page.extract_text() or ""
                except Exception as text_err:
                    logger.warning(f"提取第一页文本失败: {str(text_err)}")
                    first_page_text = ""
                page_sizes.append({
                    "width": first_page.width,
                    "height": first_page.height,
                    "unit": "pt",
                })

            overview = {
                "document_id": file_url_id,
                "pages": pages_count,
                "page_sizes": page_sizes,
                "has_text": bool(first_page_text),
                "has_tables": False,  # simplified: not actually detected
                "has_vectors": False,  # simplified: not actually detected
                "probable_types": ["结构平面图", "建筑详图"],
            }
            return overview
    except Exception as err:
        logger.error(f"解析概览失败: {str(err)}", exc_info=True)
        raise Exception(f"解析概览失败: {str(err)}")
# Parse page text.
def parse_text(file_url_id, pages):
    """Extract the text of the requested pages, one block per page.

    Args:
        file_url_id: Path of the PDF file to open.
        pages: Zero-based page indices to process; an empty/falsy value means
            every page. Indices outside the document are skipped.

    Returns:
        dict: ``{"document_id": ..., "items": [...]}`` where each item holds
        the one-based page number, a full-page bbox, the text, and
        placeholder ``font``/``size`` values. Pages without text are omitted.

    Raises:
        Exception: Any failure is logged and re-raised as a generic
            ``Exception`` whose message is prefixed with "解析文本失败".
    """
    try:
        logger.info(f"解析文本内容: {file_url_id}, 页数: {pages}")
        text_blocks = []

        # Set OpenCV-related environment flags; the original author intended
        # this to avoid GUI dependencies - TODO confirm it has that effect.
        for env_key in ("OPENCV_IO_ENABLE_OPENEXR", "CV_IO_ENABLE_OPENEXR"):
            os.environ[env_key] = "1"

        with pdfplumber.open(file_url_id) as pdf:
            total_pages = len(pdf.pages)
            # Fall back to all pages when no explicit selection was given.
            for page_num in (pages or range(total_pages)):
                if not (0 <= page_num < total_pages):
                    continue

                page = pdf.pages[page_num]
                page_no = page_num + 1  # page numbers are reported one-based
                try:
                    text = page.extract_text() or ""
                except Exception as text_err:
                    logger.warning(f"提取第 {page_no} 页文本失败: {str(text_err)}")
                    text = ""

                if not text:
                    continue
                text_blocks.append({
                    "page": page_no,
                    "bbox": [0, 0, page.width, page.height],
                    "text": text,
                    "font": "unknown",
                    "size": 12,
                })

        return {
            "document_id": file_url_id,
            "items": text_blocks,
        }
    except Exception as err:
        logger.error(f"解析文本失败: {str(err)}", exc_info=True)
        raise Exception(f"解析文本失败: {str(err)}")
# Parse tables.
def parse_tables(file_url_id, pages):
    """Extract tables with camelot, trying ``lattice`` first and then ``stream``.

    Args:
        file_url_id: Path of the PDF file to read.
        pages: Zero-based page indices; an empty/falsy value means all pages.

    Returns:
        dict: ``{"document_id": ..., "items": [...]}`` with one item per table
        (page, generated title, and a flat list of cells). Every cell carries
        the bbox of its whole table, not a per-cell box.

    Raises:
        Exception: Any failure is logged and re-raised as a generic
            ``Exception``; a missing Ghostscript gets a dedicated install hint.
    """
    try:
        logger.info(f"解析表格: {file_url_id}, 页数: {pages}")

        # Set OpenCV-related environment flags; the original author intended
        # this to avoid GUI dependencies - TODO confirm it has that effect.
        for env_key in ("OPENCV_IO_ENABLE_OPENEXR", "CV_IO_ENABLE_OPENEXR"):
            os.environ[env_key] = "1"

        # camelot expects one-based page numbers as a comma-separated string.
        pages_str = ','.join(str(p + 1) for p in pages) if pages else 'all'

        # The stream flavor is only tried when the lattice flavor raises.
        try:
            logger.info("尝试使用lattice模式解析表格")
            tables = camelot.read_pdf(file_url_id, pages=pages_str, flavor='lattice')
        except Exception as lattice_err:
            logger.warning(f"lattice模式失败,尝试stream模式: {str(lattice_err)}")
            try:
                tables = camelot.read_pdf(file_url_id, pages=pages_str, flavor='stream')
            except Exception as stream_err:
                logger.error(f"两种表格解析模式均失败: {str(stream_err)}")
                raise

        result_tables = []
        for table in tables:
            cells = [
                {
                    "row": row_idx,
                    "col": col_idx,
                    "text": cell_text,
                    "bbox": table._bbox,
                }
                for row_idx, row in enumerate(table.df.values)
                for col_idx, cell_text in enumerate(row)
            ]
            result_tables.append({
                "page": table.page,
                "title": f"Table on page {table.page}",
                "cells": cells,
            })

        return {
            "document_id": file_url_id,
            "items": result_tables,
        }
    except Exception as err:
        logger.error(f"解析表格失败: {str(err)}", exc_info=True)
        # A missing Ghostscript install gets its own actionable message.
        if "Ghostscript is not installed" in str(err):
            error_msg = "未安装Ghostscript,请执行: sudo apt-get install ghostscript"
            logger.error(error_msg)
            raise Exception(error_msg)
        raise Exception(f"解析表格失败: {str(err)}")
# Drawing summary generation (simplified version).
def summarize_drawing(file_url_id, language):
    """Build a short summary of the drawing from its overview.

    Args:
        file_url_id: Path of the PDF file to summarize.
        language: Requested language; only logged, it does not affect the
            returned content.

    Returns:
        dict: Document id, a one-sentence text summary, and a ``structured``
        section whose levels, grids and notes are hard-coded placeholders.
    """
    logger.info(f"生成图纸摘要: {file_url_id}, 语言: {language}")

    overview = parse_overview(file_url_id)
    page_count = overview['pages']
    drawing_type = overview['probable_types'][0]

    structured = {
        "drawing_type": drawing_type,
        "levels": ["一层", "二层"],
        "grids": ["1-6", "A-D"],
        "notes": ["混凝土强度等级为C30"],
    }
    return {
        "document_id": file_url_id,
        "text_summary": f"该图纸包含{page_count}页,可能是{drawing_type}",
        "structured": structured,
    }
# Rebar information extraction (simplified version).
def extract_rebar(file_url_id):
    """Return a placeholder rebar schedule for the given document.

    The file is not read; apart from ``document_id`` the summary and item
    values are hard-coded sample data.

    Args:
        file_url_id: Path/id of the document, echoed back as ``document_id``.

    Returns:
        dict: ``document_id``, a ``summary`` (total weight and count) and a
        one-element ``items`` list describing a sample rebar mark.
    """
    logger.info(f"提取钢筋信息: {file_url_id}")

    # Set OpenCV-related environment flags; the original author intended
    # this to avoid GUI dependencies - TODO confirm it has that effect.
    for env_key in ("OPENCV_IO_ENABLE_OPENEXR", "CV_IO_ENABLE_OPENEXR"):
        os.environ[env_key] = "1"

    summary = {
        "total_weight_kg": 1250.5,
        "total_count": 25,
    }
    sample_item = {
        "mark": "GJ1",
        "grade": "HRB400",
        "diameter": "Φ20",
        "spacing": "200mm",
        "length": 6500,
        "quantity": 10,
    }
    return {
        "document_id": file_url_id,
        "summary": summary,
        "items": [sample_item],
    }
# Component extraction (simplified version).
def extract_components(file_url_id, target_types):
    """Return a placeholder component list for the given document.

    The file is not read and ``target_types`` is only logged; the single
    returned component is hard-coded sample data.

    Args:
        file_url_id: Path/id of the document, echoed back as ``document_id``.
        target_types: Requested component types (logged, not used to filter).

    Returns:
        dict: ``document_id`` plus a one-element ``items`` list describing a
        sample beam.
    """
    logger.info(f"提取构件: {file_url_id}, 目标类型: {target_types}")

    # Set OpenCV-related environment flags; the original author intended
    # this to avoid GUI dependencies - TODO confirm it has that effect.
    for env_key in ("OPENCV_IO_ENABLE_OPENEXR", "CV_IO_ENABLE_OPENEXR"):
        os.environ[env_key] = "1"

    sample_beam = {
        "id": "B1",
        "type": "beam",
        "grid": "2-3/B-C",
        "elevation": "+5.400",
        "size": {
            "width": 300,
            "height": 500,
        },
        "rebar_spec": "上部 2Φ20@200, 下部 2Φ18@200",
    }
    return {
        "document_id": file_url_id,
        "items": [sample_beam],
    }
请修改以上代码:改用 PyMuPDF 依赖包,不再使用 OpenGL(libGL),并将修改完的完整代码发给我。
最新发布