import csv
import io
import json
import os
import re
import shutil
import sys
import tempfile
import time
from collections import defaultdict
from datetime import datetime

import cv2
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
from PyQt5.QtCore import Qt, QSize, QTimer, QThread, pyqtSignal
from PyQt5.QtGui import QIcon, QFont, QColor, QPixmap, QBrush, QPainter
from PyQt5.QtWidgets import (
    QApplication, QMainWindow, QWidget, QStackedWidget, QVBoxLayout, QHBoxLayout,
    QLabel, QPushButton, QFileDialog, QListWidget, QListWidgetItem, QTableWidget,
    QTableWidgetItem, QAbstractItemView, QHeaderView, QLineEdit, QFrame,
    QSizePolicy, QProgressBar, QMessageBox, QComboBox, QGridLayout, QTextEdit,
    QDialog, QDialogButtonBox
)

# Integrate PaddleOCR
try:
    from paddleocr import PaddleOCR
except ImportError:
    print("PaddleOCR not installed. Please install with: pip install paddlepaddle paddleocr")
    sys.exit(1)
class OCRWorker(QThread):
    """Background thread that optionally masks a watermark and then OCRs a PDF.

    Signals:
        progress_updated(int, str): percentage (0-100) and a status message.
        extraction_complete(list, str): extracted records and the file name.
        watermark_removed(str, str): path of the processed PDF and the file name.
    """

    progress_updated = pyqtSignal(int, str)
    extraction_complete = pyqtSignal(list, str)
    watermark_removed = pyqtSignal(str, str)

    # Zoom factor used when rasterising a page for OCR.
    RENDER_ZOOM = 2
    # Extra pixels painted around every detected watermark box.
    WATERMARK_PADDING = 5

    # Patterns used by extract_structured_data, compiled once.
    _INVOICE_RE = re.compile(r'Invoice\s+Number\s*:\s*(\w+)', re.IGNORECASE)
    _DATE_RE = re.compile(r'Date\s*:\s*(\d{2}/\d{2}/\d{4})', re.IGNORECASE)
    _TOTAL_RE = re.compile(r'Total\s+Amount\s*:\s*([\d,]+\.\d{2})', re.IGNORECASE)
    _REPORT_RE = re.compile(r'Report\s+Title\s*:\s*(.+)', re.IGNORECASE)
    _AUTHOR_RE = re.compile(r'Author\s*:\s*(.+)', re.IGNORECASE)

    # OCR engine shared by all workers so the models are loaded only once.
    _shared_ocr = None

    def __init__(self, pdf_path, output_dir, watermark_text=None, parent=None):
        """Store the job parameters.

        Args:
            pdf_path: path of the PDF to process.
            output_dir: directory that receives the watermark-free copy.
            watermark_text: text to mask (case-insensitive); None/"" skips masking.
            parent: optional Qt parent object.
        """
        super().__init__(parent)
        self.pdf_path = pdf_path
        self.output_dir = output_dir
        self.watermark_text = watermark_text
        self.file_name = os.path.basename(pdf_path)
        self.canceled = False
        # The engine is created lazily (see _ensure_ocr) so that loading the
        # models does not block the thread that constructs the worker.
        self.ocr = OCRWorker._shared_ocr

    def _ensure_ocr(self):
        """Create the PaddleOCR engine on first use and cache it on the class."""
        if self.ocr is None:
            if OCRWorker._shared_ocr is None:
                OCRWorker._shared_ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
            self.ocr = OCRWorker._shared_ocr
        return self.ocr

    def _render_page(self, page):
        """Rasterise a PDF page into an RGB PIL image at RENDER_ZOOM."""
        zoom = self.RENDER_ZOOM
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    @staticmethod
    def _iter_detections(result):
        """Yield (box, text) pairs from a PaddleOCR result.

        The result holds one entry per image; an entry may be None when no
        text was detected, so both levels are guarded.
        """
        for page_result in result or []:
            for detection in page_result or []:
                yield detection[0], detection[1][0]

    def run(self):
        """Thread entry point: mask the watermark if requested, then extract."""
        try:
            self._ensure_ocr()
            # Step 1: remove the watermark (if requested)
            processed_path = self.pdf_path
            if self.watermark_text:
                processed_path = self.remove_watermark()
                if self.canceled:
                    # A cancel is not a failure; stop silently.
                    return
                if not processed_path:
                    self.progress_updated.emit(100, "水印去除失败")
                    return
                self.watermark_removed.emit(processed_path, self.file_name)
            # Step 2: extract the content
            self.extract_content(processed_path)
        except Exception as e:
            self.progress_updated.emit(100, f"处理失败: {str(e)}")

    def remove_watermark(self):
        """Detect the watermark text with OCR and paint over it in white.

        Every page is rasterised, so the result is an image-only PDF.

        Returns:
            Path of the processed PDF, or None on cancel or error.
        """
        doc = None
        new_doc = None
        try:
            self._ensure_ocr()
            doc = fitz.open(self.pdf_path)
            new_doc = fitz.open()
            output_path = os.path.join(self.output_dir, f"processed_{self.file_name}")
            total_pages = len(doc)
            needle = self.watermark_text.lower()
            pad = self.WATERMARK_PADDING
            for page_num in range(total_pages):
                if self.canceled:
                    return None
                self.progress_updated.emit(int(30 * page_num / total_pages),
                                           f"正在处理第 {page_num+1} 页水印")
                page = doc.load_page(page_num)
                img = self._render_page(page)
                # Detect text with PaddleOCR
                result = self.ocr.ocr(np.array(img), cls=True)
                # Cover every box whose text contains the watermark
                draw = ImageDraw.Draw(img)
                for box, text in self._iter_detections(result):
                    if needle not in text.lower():
                        continue
                    xs = [int(x) for x, _ in box]
                    ys = [int(y) for _, y in box]
                    # Pad the rectangle so the text is fully covered
                    draw.rectangle(
                        [min(xs) - pad, min(ys) - pad, max(xs) + pad, max(ys) + pad],
                        fill=(255, 255, 255)
                    )
                # insert_image needs an encoded image, not raw pixel bytes
                buffer = io.BytesIO()
                img.save(buffer, format="PNG")
                # Keep the original page size in points; the image is scaled to fit
                new_page = new_doc.new_page(width=page.rect.width, height=page.rect.height)
                new_page.insert_image(new_page.rect, stream=buffer.getvalue())
            new_doc.save(output_path)
            return output_path
        except Exception as e:
            print(f"Error removing watermark: {e}")
            return None
        finally:
            # Close both documents on every path (success, cancel, error)
            if new_doc is not None:
                new_doc.close()
            if doc is not None:
                doc.close()

    def extract_content(self, pdf_path):
        """OCR every page of *pdf_path* and emit the structured records.

        Emits extraction_complete on success; on error emits progress_updated
        with a failure message. Returns early without emitting when canceled.
        """
        doc = None
        try:
            self._ensure_ocr()
            doc = fitz.open(pdf_path)
            extracted_data = []
            total_pages = len(doc)
            for page_num in range(total_pages):
                if self.canceled:
                    return
                self.progress_updated.emit(30 + int(70 * page_num / total_pages),
                                           f"正在提取第 {page_num+1} 页内容")
                page = doc.load_page(page_num)
                img_np = np.array(self._render_page(page))
                # Extract text with PaddleOCR
                result = self.ocr.ocr(img_np, cls=True)
                # One detected text line per output line, so that the
                # "rest of line" patterns do not swallow the whole page.
                page_text = "\n".join(text for _, text in self._iter_detections(result))
                extracted_data.extend(self.extract_structured_data(page_text, page_num + 1))
            self.progress_updated.emit(100, "内容提取完成")
            self.extraction_complete.emit(extracted_data, self.file_name)
        except Exception as e:
            self.progress_updated.emit(100, f"内容提取失败: {str(e)}")
        finally:
            if doc is not None:
                doc.close()

    def extract_structured_data(self, text, page_num):
        """Extract structured records from page text (sample implementation).

        Args:
            text: OCR text of one page.
            page_num: 1-based page number stored in each record.

        Returns:
            A list of dicts: an "Invoice" and/or "Report" record when the
            corresponding fields are found, otherwise one "General" record
            holding the whole page text.
        """
        extracted = []
        # Invoice fields
        invoice_match = self._INVOICE_RE.search(text)
        date_match = self._DATE_RE.search(text)
        total_match = self._TOTAL_RE.search(text)
        if invoice_match or date_match or total_match:
            extracted.append({
                "Document": self.file_name,
                "Page": page_num,
                "Type": "Invoice",
                "Invoice Number": invoice_match.group(1) if invoice_match else "N/A",
                "Date": date_match.group(1) if date_match else "N/A",
                "Amount": f"${total_match.group(1)}" if total_match else "N/A"
            })
        # Report fields
        report_match = self._REPORT_RE.search(text)
        author_match = self._AUTHOR_RE.search(text)
        if report_match or author_match:
            extracted.append({
                "Document": self.file_name,
                "Page": page_num,
                "Type": "Report",
                "Report Title": report_match.group(1) if report_match else "N/A",
                "Author": author_match.group(1) if author_match else "N/A",
                "Summary": text[:200] + "..." if len(text) > 200 else text
            })
        # Nothing matched: return the whole page text
        if not extracted:
            extracted.append({
                "Document": self.file_name,
                "Page": page_num,
                "Type": "General",
                "Content": text
            })
        return extracted

    # Backward-compatible alias for the original (misspelled) method name.
    extact_structured_data = extract_structured_data

    def cancel(self):
        """Ask the worker to stop before it processes the next page."""
        self.canceled = True
class MainWindow(QMainWindow):
    """Main window with four pages: home, upload/extract, history and analysis."""

    def __init__(self):
        """Build the whole UI, seed demo history and show the home page."""
        super().__init__()
        self.setWindowTitle("PDF智能处理工具")
        self.setGeometry(100, 100, 1200, 800)
        self.setMinimumSize(1000, 700)
        # Application palette
        self.primary_color = "#2c3e50"
        self.secondary_color = "#3498db"
        self.accent_color = "#e67e22"
        self.light_color = "#ecf0f1"
        self.dark_color = "#34495e"
        # State
        self.current_files = []
        self.extracted_data = []
        self.history_data = []
        # Monotonic counter so that history ids stay unique after deletions
        self._next_history_id = 0
        self.ocr_worker = None
        self.temp_dir = tempfile.mkdtemp()
        # Central widget and main layout
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        main_layout = QVBoxLayout(central_widget)
        main_layout.setContentsMargins(0, 0, 0, 0)
        main_layout.setSpacing(0)
        # Header
        self.create_header(main_layout)
        # Main content area
        self.content_stack = QStackedWidget()
        main_layout.addWidget(self.content_stack, 1)
        # Pages (stack index order matters: navigate_to uses these indices)
        self.home_page = self.create_home_page()
        self.upload_page = self.create_upload_page()
        self.history_page = self.create_history_page()
        self.analysis_page = self.create_analysis_page()
        self.content_stack.addWidget(self.home_page)
        self.content_stack.addWidget(self.upload_page)
        self.content_stack.addWidget(self.history_page)
        self.content_stack.addWidget(self.analysis_page)
        # Footer navigation bar
        self.create_footer(main_layout)
        # Demo history data
        self.simulate_history_data()
        # Home page is the default page
        self.content_stack.setCurrentIndex(0)
        self.update_home_page()

    def _new_history_id(self):
        """Return a history id that has never been handed out before."""
        new_id = self._next_history_id
        self._next_history_id += 1
        return new_id

    def _find_history(self, record_id):
        """Return the history record with the given id, or None."""
        return next((rec for rec in self.history_data if rec["id"] == record_id), None)

    @staticmethod
    def _ensure_suffix(file_path, suffix):
        """Append *suffix* unless the path already ends with it (any case)."""
        if file_path.lower().endswith(suffix):
            return file_path
        return file_path + suffix

    def closeEvent(self, event):
        """Stop a running worker, then delete the temporary directory."""
        # Destroying a QThread that is still running is not safe, so wait for it.
        if self.ocr_worker is not None and self.ocr_worker.isRunning():
            self.ocr_worker.cancel()
            self.ocr_worker.wait()
        shutil.rmtree(self.temp_dir, ignore_errors=True)
        event.accept()

    def simulate_history_data(self):
        """Append five demo history records for display purposes."""
        for i in range(5):
            self.history_data.append({
                "id": self._new_history_id(),
                "file_name": f"document_{i+1}.pdf",
                "date": f"2023-0{i+1}-15",
                "status": "Completed",
                "pages": i+3,
                "type": "Invoice" if i % 2 == 0 else "Report",
                "extracted_data": [
                    {
                        "Document": f"document_{i+1}.pdf",
                        "Page": 1,
                        "Type": "Invoice",
                        "Invoice Number": f"INV-2023-{i+1:04d}",
                        "Date": f"2023-0{i+1}-15",
                        "Amount": f"${(i+1)*250:.2f}"
                    }
                ]
            })

    def create_header(self, main_layout):
        """Create the application header and add it to *main_layout*."""
        header = QWidget()
        header.setStyleSheet(f"background-color: {self.primary_color}; padding: 15px;")
        header_layout = QHBoxLayout(header)
        header_layout.setContentsMargins(20, 10, 20, 10)
        # Application title
        title_label = QLabel("PDF智能处理工具")
        title_label.setStyleSheet("color: white; font-size: 24px; font-weight: bold;")
        header_layout.addWidget(title_label)
        # User area on the right
        user_widget = QWidget()
        user_layout = QHBoxLayout(user_widget)
        user_layout.setSpacing(15)
        user_icon = QLabel()
        user_icon.setPixmap(self.create_icon("👤", 40))
        user_layout.addWidget(user_icon)
        user_name = QLabel("管理员")
        user_name.setStyleSheet("color: white; font-size: 16px;")
        user_layout.addWidget(user_name)
        header_layout.addWidget(user_widget)
        main_layout.addWidget(header)

    def create_footer(self, main_layout):
        """Create the bottom navigation bar and add it to *main_layout*."""
        footer = QWidget()
        footer.setStyleSheet(f"background-color: {self.dark_color};")
        footer.setFixedHeight(60)
        footer_layout = QHBoxLayout(footer)
        footer_layout.setContentsMargins(0, 0, 0, 0)
        footer_layout.setSpacing(0)
        # (label, emoji icon, stack index)
        nav_items = [
            ("首页", "🏠", 0),
            ("上传与提取", "📤", 1),
            ("历史记录", "📋", 2),
            ("数据分析", "📊", 3)
        ]
        for text, emoji, index in nav_items:
            btn = QPushButton(text)
            btn.setIcon(QIcon(self.create_icon(emoji, 24)))
            btn.setIconSize(QSize(24, 24))
            btn.setFixedHeight(60)
            btn.setStyleSheet(f"""
                QPushButton {{
                    color: {self.light_color};
                    font-size: 14px;
                    font-weight: bold;
                    border: none;
                    background-color: {self.dark_color};
                }}
                QPushButton:hover {{
                    background-color: {self.primary_color};
                }}
            """)
            # Bind the index as a default so each button keeps its own value
            btn.clicked.connect(lambda _, idx=index: self.navigate_to(idx))
            footer_layout.addWidget(btn)
        main_layout.addWidget(footer)

    def create_icon(self, emoji, size=24):
        """Render *emoji* into a transparent square pixmap of *size* pixels."""
        pixmap = QPixmap(size, size)
        pixmap.fill(Qt.transparent)
        painter = QPainter(pixmap)
        painter.setFont(QFont("Arial", size - 4))
        painter.drawText(pixmap.rect(), Qt.AlignCenter, emoji)
        painter.end()
        return pixmap

    def navigate_to(self, index):
        """Show the page at stack *index* and refresh its content."""
        self.content_stack.setCurrentIndex(index)
        if index == 0:  # home
            self.update_home_page()
        elif index == 2:  # history
            self.update_history_page()
        elif index == 3:  # analysis
            self.update_analysis_page()
        # index 1 (upload) has nothing to refresh

    def create_home_page(self):
        """Create the home page widget."""
        page = QWidget()
        layout = QVBoxLayout(page)
        layout.setContentsMargins(30, 20, 30, 20)
        layout.setSpacing(20)
        # Welcome card
        welcome_card = QFrame()
        welcome_card.setStyleSheet("""
            QFrame {
                background-color: white;
                border-radius: 10px;
                padding: 20px;
            }
        """)
        welcome_layout = QVBoxLayout(welcome_card)
        welcome_title = QLabel("欢迎使用PDF智能处理工具")
        welcome_title.setStyleSheet("font-size: 24px; font-weight: bold; color: #2c3e50;")
        welcome_layout.addWidget(welcome_title)
        welcome_text = QLabel("本工具提供PDF水印去除、内容提取和数据分析功能,支持批量处理PDF文件,快速提取结构化数据并导出为Excel或CSV格式。")
        welcome_text.setStyleSheet("font-size: 16px; color: #7f8c8d;")
        welcome_text.setWordWrap(True)
        welcome_layout.addWidget(welcome_text)
        # OCR engine information
        ocr_info = QLabel("当前使用PaddleOCR引擎,支持中英文识别")
        ocr_info.setStyleSheet("font-size: 14px; color: #3498db; font-weight: bold;")
        welcome_layout.addWidget(ocr_info)
        layout.addWidget(welcome_card)
        # "Recent uploads" section
        recent_label = QLabel("最近上传")
        recent_label.setStyleSheet("font-size: 20px; font-weight: bold; color: #2c3e50;")
        layout.addWidget(recent_label)
        self.recent_list = QListWidget()
        self.recent_list.setStyleSheet("""
            QListWidget {
                background-color: white;
                border-radius: 10px;
                border: 1px solid #ddd;
            }
            QListWidget::item {
                padding: 15px;
                border-bottom: 1px solid #eee;
            }
            QListWidget::item:selected {
                background-color: #e6f7ff;
            }
        """)
        self.recent_list.setAlternatingRowColors(True)
        layout.addWidget(self.recent_list, 1)
        # Quick-action buttons
        quick_actions = QWidget()
        quick_layout = QHBoxLayout(quick_actions)
        quick_layout.setSpacing(15)
        # (label, emoji icon, slot)
        actions = [
            ("上传新文件", "📤", self.navigate_to_upload),
            ("查看历史记录", "📋", lambda: self.navigate_to(2)),
            ("数据分析", "📊", lambda: self.navigate_to(3))
        ]
        for text, emoji, action in actions:
            btn = QPushButton(text)
            btn.setIcon(QIcon(self.create_icon(emoji, 24)))
            btn.setIconSize(QSize(24, 24))
            btn.setStyleSheet(f"""
                QPushButton {{
                    background-color: {self.secondary_color};
                    color: white;
                    font-size: 16px;
                    font-weight: bold;
                    padding: 12px 20px;
                    border-radius: 8px;
                }}
                QPushButton:hover {{
                    background-color: #2980b9;
                }}
            """)
            btn.clicked.connect(action)
            quick_layout.addWidget(btn)
        layout.addWidget(quick_actions)
        return page

    def update_home_page(self):
        """Refresh the recent-uploads list with the first five history records."""
        self.recent_list.clear()
        for item in self.history_data[:5]:
            row_label = QLabel(f"""
                <div style="font-size: 16px; font-weight: bold;">{item['file_name']}</div>
                <div style="color: #7f8c8d; font-size: 14px;">
                上传时间: {item['date']} | 状态: <span style="color: #27ae60;">{item['status']}</span> | 类型: {item['type']}
                </div>
            """)
            # Create the item without a parent list: an item may be added to a
            # QListWidget only once, and addItem below does exactly that.
            list_item = QListWidgetItem()
            list_item.setSizeHint(row_label.sizeHint())
            self.recent_list.addItem(list_item)
            self.recent_list.setItemWidget(list_item, row_label)

    def navigate_to_upload(self):
        """Show the upload page."""
        self.navigate_to(1)

    def create_upload_page(self):
        """Create the upload-and-extract page widget."""
        page = QWidget()
        layout = QVBoxLayout(page)
        layout.setContentsMargins(30, 20, 30, 20)
        layout.setSpacing(20)
        # Title
        title = QLabel("上传与提取")
        title.setStyleSheet("font-size: 24px; font-weight: bold; color: #2c3e50;")
        layout.addWidget(title)
        # Upload area
        upload_card = QFrame()
        upload_card.setStyleSheet(f"""
            QFrame {{
                background-color: white;
                border-radius: 10px;
                border: 2px dashed {self.secondary_color};
                padding: 40px;
            }}
        """)
        upload_layout = QVBoxLayout(upload_card)
        upload_layout.setAlignment(Qt.AlignCenter)
        upload_icon = QLabel()
        upload_icon.setPixmap(self.create_icon("📤", 80))
        upload_icon.setAlignment(Qt.AlignCenter)
        upload_layout.addWidget(upload_icon)
        upload_text = QLabel("拖放PDF文件到此处,或点击选择文件")
        upload_text.setStyleSheet("font-size: 18px; color: #7f8c8d; margin-top: 20px;")
        upload_text.setAlignment(Qt.AlignCenter)
        upload_layout.addWidget(upload_text)
        upload_btn = QPushButton("选择PDF文件")
        upload_btn.setStyleSheet(f"""
            QPushButton {{
                background-color: {self.secondary_color};
                color: white;
                font-size: 16px;
                padding: 10px 20px;
                border-radius: 5px;
                margin-top: 20px;
            }}
            QPushButton:hover {{
                background-color: #2980b9;
            }}
        """)
        upload_btn.clicked.connect(self.select_pdf_files)
        upload_layout.addWidget(upload_btn, alignment=Qt.AlignCenter)
        # Watermark option
        watermark_layout = QHBoxLayout()
        watermark_layout.setSpacing(10)
        watermark_label = QLabel("水印文本(可选):")
        watermark_label.setStyleSheet("font-size: 14px;")
        watermark_layout.addWidget(watermark_label)
        self.watermark_input = QLineEdit()
        self.watermark_input.setPlaceholderText("输入要删除的水印文本")
        self.watermark_input.setStyleSheet("padding: 5px; border: 1px solid #ddd; border-radius: 3px;")
        watermark_layout.addWidget(self.watermark_input, 1)
        upload_layout.addLayout(watermark_layout)
        layout.addWidget(upload_card, 1)
        # Progress area
        progress_layout = QVBoxLayout()
        progress_layout.setSpacing(10)
        progress_label = QLabel("处理进度")
        progress_label.setStyleSheet("font-size: 18px; font-weight: bold; color: #2c3e50;")
        progress_layout.addWidget(progress_label)
        self.progress_bar = QProgressBar()
        self.progress_bar.setStyleSheet("""
            QProgressBar {
                border: 1px solid #ddd;
                border-radius: 5px;
                text-align: center;
                height: 25px;
            }
            QProgressBar::chunk {
                background-color: #3498db;
                width: 10px;
            }
        """)
        progress_layout.addWidget(self.progress_bar)
        self.progress_text = QLabel("等待处理文件...")
        self.progress_text.setStyleSheet("font-size: 14px; color: #7f8c8d;")
        progress_layout.addWidget(self.progress_text)
        # Cancel button (only visible while a job is running)
        self.cancel_btn = QPushButton("取消处理")
        self.cancel_btn.setStyleSheet("""
            QPushButton {
                background-color: #e74c3c;
                color: white;
                padding: 8px 20px;
                border-radius: 5px;
                font-weight: bold;
            }
            QPushButton:hover {
                background-color: #c0392b;
            }
        """)
        self.cancel_btn.clicked.connect(self.cancel_processing)
        self.cancel_btn.setVisible(False)
        progress_layout.addWidget(self.cancel_btn, alignment=Qt.AlignRight)
        layout.addLayout(progress_layout)
        return page

    def select_pdf_files(self):
        """Let the user pick PDF files and start processing them as one batch."""
        # Starting a second worker would drop the reference to a running QThread.
        if self.ocr_worker is not None and self.ocr_worker.isRunning():
            QMessageBox.warning(self, "正在处理", "当前仍有文件正在处理,请稍候或先取消处理")
            return
        files, _ = QFileDialog.getOpenFileNames(
            self, "选择PDF文件", "", "PDF文件 (*.pdf)"
        )
        if files:
            self.current_files = files
            # Reset once per batch so results of all files accumulate
            self.extracted_data = []
            self.process_files()

    def process_files(self):
        """Start a worker for the first file in the queue (no-op when empty)."""
        if not self.current_files:
            return
        # The previous worker may still be winding down after its last signal
        if self.ocr_worker is not None and self.ocr_worker.isRunning():
            self.ocr_worker.wait()
        self.cancel_btn.setVisible(True)
        file_path = self.current_files[0]
        # Empty input means "no watermark removal"
        watermark_text = self.watermark_input.text().strip() or None
        self.ocr_worker = OCRWorker(
            file_path,
            self.temp_dir,
            watermark_text
        )
        self.ocr_worker.progress_updated.connect(self.update_progress)
        self.ocr_worker.watermark_removed.connect(self.on_watermark_removed)
        self.ocr_worker.extraction_complete.connect(self.on_extraction_complete)
        self.ocr_worker.start()

    def update_progress(self, progress, message):
        """Show the worker's progress percentage and status message."""
        self.progress_bar.setValue(progress)
        self.progress_text.setText(message)

    def on_watermark_removed(self, output_path, file_name):
        """Report that the watermark of *file_name* has been removed."""
        self.progress_text.setText(f"水印已移除: {file_name}")

    def on_extraction_complete(self, extracted_data, file_name):
        """Store the results, show a preview, then process the next file."""
        self.extracted_data.extend(extracted_data)
        # A page can yield several records, so count distinct pages
        page_count = len({rec.get("Page") for rec in extracted_data})
        found_types = {rec.get("Type") for rec in extracted_data}
        if "Invoice" in found_types:
            doc_type = "Invoice"
        elif "Report" in found_types:
            doc_type = "Report"
        else:
            doc_type = "General"
        # Newest record first
        self.history_data.insert(0, {
            "id": self._new_history_id(),
            "file_name": file_name,
            "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
            "status": "Completed",
            "pages": page_count,
            "type": doc_type,
            "extracted_data": extracted_data
        })
        # Modal preview; the next file starts after it is closed
        self.show_preview(extracted_data, file_name)
        if len(self.current_files) > 1:
            self.current_files.pop(0)
            self.process_files()
        else:
            self.current_files = []
            self.cancel_btn.setVisible(False)

    def cancel_processing(self):
        """Cancel the running worker and drop the remaining queue."""
        if self.ocr_worker and self.ocr_worker.isRunning():
            self.ocr_worker.cancel()
            self.ocr_worker.wait()
        # A canceled worker never reports completion, so the queue would stall
        self.current_files = []
        self.progress_text.setText("处理已取消")
        self.cancel_btn.setVisible(False)

    def show_preview(self, data, file_name):
        """Show a modal dialog with *data* (list of dicts) as a table.

        The export buttons of the dialog export exactly the previewed *data*.
        """
        preview_dialog = QDialog(self)
        preview_dialog.setWindowTitle(f"预览 - {file_name}")
        preview_dialog.resize(1000, 700)
        layout = QVBoxLayout(preview_dialog)
        layout.setContentsMargins(20, 20, 20, 20)
        layout.setSpacing(15)
        # Title
        title = QLabel(f"文件内容提取结果: {file_name}")
        title.setStyleSheet("font-size: 20px; font-weight: bold; color: #2c3e50;")
        layout.addWidget(title)
        # OCR engine information
        ocr_info = QLabel("使用PaddleOCR引擎提取内容")
        ocr_info.setStyleSheet("font-size: 14px; color: #3498db; font-weight: bold;")
        layout.addWidget(ocr_info)
        # Data table
        if data:
            table = QTableWidget()
            table.setRowCount(len(data))
            # Records may have different keys; show the sorted union
            columns = sorted({key for record in data for key in record})
            table.setColumnCount(len(columns))
            table.setHorizontalHeaderLabels(columns)
            for row_idx, row_data in enumerate(data):
                for col_idx, col_name in enumerate(columns):
                    value = str(row_data.get(col_name, ""))
                    table.setItem(row_idx, col_idx, QTableWidgetItem(value))
            table.setStyleSheet("""
                QTableWidget {
                    background-color: white;
                    border: 1px solid #ddd;
                    gridline-color: #eee;
                }
                QHeaderView::section {
                    background-color: #f8f9fa;
                    padding: 8px;
                    border: none;
                    font-weight: bold;
                }
            """)
            table.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)
            table.setEditTriggers(QAbstractItemView.DoubleClicked | QAbstractItemView.SelectedClicked)
            table.setSelectionMode(QAbstractItemView.SingleSelection)
            table.setSelectionBehavior(QAbstractItemView.SelectRows)
            layout.addWidget(table, 1)
        else:
            no_data_label = QLabel("未提取到有效数据")
            no_data_label.setStyleSheet("font-size: 16px; color: #7f8c8d;")
            no_data_label.setAlignment(Qt.AlignCenter)
            layout.addWidget(no_data_label, 1)
        # Action buttons
        btn_layout = QHBoxLayout()
        btn_layout.setSpacing(15)
        export_excel_btn = QPushButton("导出为Excel")
        export_excel_btn.setStyleSheet(f"""
            QPushButton {{
                background-color: {self.secondary_color};
                color: white;
                padding: 10px 20px;
                border-radius: 5px;
                font-weight: bold;
            }}
            QPushButton:hover {{
                background-color: #2980b9;
            }}
        """)
        # The lambda swallows the "checked" argument of clicked
        export_excel_btn.clicked.connect(lambda _checked=False: self.export_to_excel(data))
        btn_layout.addWidget(export_excel_btn)
        export_csv_btn = QPushButton("导出为CSV")
        export_csv_btn.setStyleSheet(f"""
            QPushButton {{
                background-color: {self.accent_color};
                color: white;
                padding: 10px 20px;
                border-radius: 5px;
                font-weight: bold;
            }}
            QPushButton:hover {{
                background-color: #d35400;
            }}
        """)
        export_csv_btn.clicked.connect(lambda _checked=False: self.export_to_csv(data))
        btn_layout.addWidget(export_csv_btn)
        close_btn = QPushButton("关闭")
        close_btn.setStyleSheet("""
            QPushButton {
                background-color: #95a5a6;
                color: white;
                padding: 10px 20px;
                border-radius: 5px;
                font-weight: bold;
            }
            QPushButton:hover {
                background-color: #7f8c8d;
            }
        """)
        close_btn.clicked.connect(preview_dialog.accept)
        btn_layout.addWidget(close_btn)
        layout.addLayout(btn_layout)
        preview_dialog.exec_()

    def export_to_excel(self, data=None):
        """Export records to an .xlsx file chosen by the user.

        Args:
            data: list of record dicts; anything else (including the bool a
                directly connected clicked signal passes) falls back to
                self.extracted_data.
        """
        records = data if isinstance(data, list) else self.extracted_data
        if not records:
            QMessageBox.warning(self, "导出失败", "没有可导出的数据")
            return
        file_path, _ = QFileDialog.getSaveFileName(
            self, "保存Excel文件", "", "Excel文件 (*.xlsx)"
        )
        if not file_path:
            return
        file_path = self._ensure_suffix(file_path, '.xlsx')
        try:
            pd.DataFrame(records).to_excel(file_path, index=False)
            QMessageBox.information(self, "导出成功", f"数据已成功导出到: {file_path}")
        except Exception as e:
            QMessageBox.critical(self, "导出失败", f"导出过程中发生错误: {str(e)}")

    def export_to_csv(self, data=None):
        """Export records to a .csv file chosen by the user.

        Args:
            data: list of record dicts; anything else falls back to
                self.extracted_data (see export_to_excel).
        """
        records = data if isinstance(data, list) else self.extracted_data
        if not records:
            QMessageBox.warning(self, "导出失败", "没有可导出的数据")
            return
        file_path, _ = QFileDialog.getSaveFileName(
            self, "保存CSV文件", "", "CSV文件 (*.csv)"
        )
        if not file_path:
            return
        file_path = self._ensure_suffix(file_path, '.csv')
        try:
            # Records may have different keys; the header is the sorted union
            all_fields = sorted({key for record in records for key in record})
            with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=all_fields)
                writer.writeheader()
                writer.writerows(records)
            QMessageBox.information(self, "导出成功", f"数据已成功导出到: {file_path}")
        except Exception as e:
            QMessageBox.critical(self, "导出失败", f"导出过程中发生错误: {str(e)}")

    def create_history_page(self):
        """Create the history page widget."""
        page = QWidget()
        layout = QVBoxLayout(page)
        layout.setContentsMargins(30, 20, 30, 20)
        layout.setSpacing(20)
        # Title and search box
        header_layout = QHBoxLayout()
        title = QLabel("历史记录")
        title.setStyleSheet("font-size: 24px; font-weight: bold; color: #2c3e50;")
        header_layout.addWidget(title)
        search_layout = QHBoxLayout()
        search_layout.setSpacing(10)
        self.search_input = QLineEdit()
        self.search_input.setPlaceholderText("搜索文件名...")
        self.search_input.setStyleSheet("""
            QLineEdit {
                padding: 8px 15px;
                border: 1px solid #ddd;
                border-radius: 5px;
                font-size: 14px;
            }
        """)
        self.search_input.setFixedWidth(300)
        self.search_input.returnPressed.connect(self.search_history)
        search_layout.addWidget(self.search_input)
        search_btn = QPushButton("搜索")
        search_btn.setStyleSheet(f"""
            QPushButton {{
                background-color: {self.secondary_color};
                color: white;
                padding: 8px 20px;
                border-radius: 5px;
            }}
        """)
        search_btn.clicked.connect(self.search_history)
        search_layout.addWidget(search_btn)
        header_layout.addLayout(search_layout)
        layout.addLayout(header_layout)
        # History table
        self.history_table = QTableWidget()
        self.history_table.setColumnCount(5)
        self.history_table.setHorizontalHeaderLabels(["文件名", "上传时间", "状态", "页数", "类型"])
        self.history_table.setStyleSheet("""
            QTableWidget {
                background-color: white;
                border: 1px solid #ddd;
                gridline-color: #eee;
            }
            QHeaderView::section {
                background-color: #f8f9fa;
                padding: 12px;
                border: none;
                font-weight: bold;
            }
        """)
        self.history_table.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)
        self.history_table.verticalHeader().setVisible(False)
        self.history_table.setSelectionBehavior(QAbstractItemView.SelectRows)
        self.history_table.setEditTriggers(QAbstractItemView.NoEditTriggers)
        self.history_table.setSortingEnabled(True)
        layout.addWidget(self.history_table, 1)
        # Action buttons
        btn_layout = QHBoxLayout()
        btn_layout.setSpacing(15)
        view_btn = QPushButton("查看详情")
        view_btn.setStyleSheet(f"""
            QPushButton {{
                background-color: {self.secondary_color};
                color: white;
                padding: 10px 20px;
                border-radius: 5px;
                font-weight: bold;
            }}
        """)
        view_btn.clicked.connect(self.view_history_detail)
        btn_layout.addWidget(view_btn)
        export_btn = QPushButton("导出记录")
        export_btn.setStyleSheet(f"""
            QPushButton {{
                background-color: {self.accent_color};
                color: white;
                padding: 10px 20px;
                border-radius: 5px;
                font-weight: bold;
            }}
        """)
        export_btn.clicked.connect(self.export_history)
        btn_layout.addWidget(export_btn)
        delete_btn = QPushButton("删除记录")
        delete_btn.setStyleSheet("""
            QPushButton {
                background-color: #e74c3c;
                color: white;
                padding: 10px 20px;
                border-radius: 5px;
                font-weight: bold;
            }
        """)
        delete_btn.clicked.connect(self.delete_history)
        btn_layout.addWidget(delete_btn)
        layout.addLayout(btn_layout)
        return page

    def _populate_history_table(self, records):
        """Fill the history table with *records* (list of history dicts)."""
        table = self.history_table
        # With sorting enabled, rows move while they are being filled, so
        # sorting is switched off during population and restored afterwards.
        was_sorting = table.isSortingEnabled()
        table.setSortingEnabled(False)
        table.setRowCount(len(records))
        for row_idx, item in enumerate(records):
            name_item = QTableWidgetItem(item["file_name"])
            # Keep the record id on the row so lookups do not depend on the name
            name_item.setData(Qt.UserRole, item["id"])
            table.setItem(row_idx, 0, name_item)
            table.setItem(row_idx, 1, QTableWidgetItem(item["date"]))
            status_item = QTableWidgetItem(item["status"])
            status_color = "#27ae60" if item["status"] == "Completed" else "#e74c3c"
            status_item.setForeground(QBrush(QColor(status_color)))
            table.setItem(row_idx, 2, status_item)
            table.setItem(row_idx, 3, QTableWidgetItem(str(item["pages"])))
            table.setItem(row_idx, 4, QTableWidgetItem(item["type"]))
        table.setSortingEnabled(was_sorting)

    def update_history_page(self):
        """Show all history records in the history table."""
        self._populate_history_table(self.history_data)

    def search_history(self):
        """Filter the history table by file name (case-insensitive substring)."""
        search_text = self.search_input.text().lower()
        if not search_text:
            self.update_history_page()
            return
        filtered_data = [item for item in self.history_data if search_text in item["file_name"].lower()]
        self._populate_history_table(filtered_data)

    def _selected_history_record(self):
        """Return the record of the selected table row, or None.

        Shows a warning when no row is selected.
        """
        row = self.history_table.currentRow()
        name_item = self.history_table.item(row, 0) if row >= 0 else None
        if name_item is None:
            QMessageBox.warning(self, "选择记录", "请先选择一条历史记录")
            return None
        return self._find_history(name_item.data(Qt.UserRole))

    def view_history_detail(self):
        """Open the preview dialog for the selected history record."""
        history_item = self._selected_history_record()
        if history_item:
            self.show_preview(history_item["extracted_data"], history_item["file_name"])

    def export_history(self):
        """Export the history list (without extracted data) to a CSV file."""
        if not self.history_data:
            QMessageBox.warning(self, "导出失败", "没有可导出的历史记录")
            return
        file_path, _ = QFileDialog.getSaveFileName(
            self, "保存历史记录", "", "CSV文件 (*.csv)"
        )
        if not file_path:
            return
        file_path = self._ensure_suffix(file_path, '.csv')
        try:
            with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['id', 'file_name', 'date', 'status', 'pages', 'type']
                # extrasaction="ignore" drops the bulky "extracted_data" key
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction="ignore")
                writer.writeheader()
                writer.writerows(self.history_data)
            QMessageBox.information(self, "导出成功", f"历史记录已成功导出到: {file_path}")
        except Exception as e:
            QMessageBox.critical(self, "导出失败", f"导出过程中发生错误: {str(e)}")

    def delete_history(self):
        """Delete the selected history record after confirmation."""
        history_item = self._selected_history_record()
        if not history_item:
            return
        reply = QMessageBox.question(
            self, '确认删除',
            f"确定要删除 '{history_item['file_name']}' 的记录吗?",
            QMessageBox.Yes | QMessageBox.No, QMessageBox.No
        )
        if reply == QMessageBox.Yes:
            # Remove only this record, not every record sharing its file name
            record_id = history_item["id"]
            self.history_data = [item for item in self.history_data if item["id"] != record_id]
            # Refresh while keeping the current search filter
            self.search_history()

    def create_analysis_page(self):
        """Create the data-analysis page widget."""
        page = QWidget()
        layout = QVBoxLayout(page)
        layout.setContentsMargins(30, 20, 30, 20)
        layout.setSpacing(20)
        # Title
        title = QLabel("数据分析")
        title.setStyleSheet("font-size: 24px; font-weight: bold; color: #2c3e50;")
        layout.addWidget(title)
        # History record selector
        select_layout = QHBoxLayout()
        select_label = QLabel("选择历史记录:")
        select_label.setStyleSheet("font-size: 16px;")
        select_layout.addWidget(select_label)
        self.history_combo = QComboBox()
        self.history_combo.setFixedWidth(300)
        self.history_combo.setStyleSheet("padding: 5px; border: 1px solid #ddd; border-radius: 3px;")
        select_layout.addWidget(self.history_combo)
        analyze_btn = QPushButton("分析数据")
        analyze_btn.setStyleSheet(f"""
            QPushButton {{
                background-color: {self.secondary_color};
                color: white;
                padding: 8px 20px;
                border-radius: 5px;
            }}
        """)
        analyze_btn.clicked.connect(self.analyze_data)
        select_layout.addWidget(analyze_btn)
        layout.addLayout(select_layout)
        # Chart area: four labels in a 2x2 grid
        chart_container = QWidget()
        chart_layout = QGridLayout(chart_container)
        chart_labels = []
        for row, col in ((0, 0), (0, 1), (1, 0), (1, 1)):
            chart_label = QLabel()
            chart_label.setStyleSheet("background-color: white; border-radius: 10px; padding: 10px;")
            chart_label.setAlignment(Qt.AlignCenter)
            chart_layout.addWidget(chart_label, row, col)
            chart_labels.append(chart_label)
        (self.chart1_label, self.chart2_label,
         self.chart3_label, self.chart4_label) = chart_labels
        layout.addWidget(chart_container, 1)
        # Action buttons
        btn_layout = QHBoxLayout()
        btn_layout.setSpacing(15)
        refresh_btn = QPushButton("刷新数据")
        refresh_btn.setStyleSheet(f"""
            QPushButton {{
                background-color: {self.secondary_color};
                color: white;
                padding: 10px 20px;
                border-radius: 5px;
                font-weight: bold;
            }}
        """)
        refresh_btn.clicked.connect(self.update_analysis_page)
        btn_layout.addWidget(refresh_btn)
        export_btn = QPushButton("导出报告")
        export_btn.setStyleSheet(f"""
            QPushButton {{
                background-color: {self.accent_color};
                color: white;
                padding: 10px 20px;
                border-radius: 5px;
                font-weight: bold;
            }}
        """)
        export_btn.clicked.connect(self.export_report)
        btn_layout.addWidget(export_btn)
        layout.addLayout(btn_layout)
        return page

    def update_analysis_page(self):
        """Refresh the record selector and draw the placeholder charts."""
        self.history_combo.clear()
        for item in self.history_data:
            # Store only the id; the record is looked up when needed
            self.history_combo.addItem(f"{item['file_name']} - {item['date']}", item["id"])
        # Placeholder charts
        self.show_chart(self.chart1_label, "📊", "数据统计")
        self.show_chart(self.chart2_label, "📈", "趋势分析")
        self.show_chart(self.chart3_label, "📉", "比较分析")
        self.show_chart(self.chart4_label, "🧮", "类型分布")

    def show_chart(self, label, emoji, title):
        """Draw a mock chart (title, emoji and border) into *label*."""
        pixmap = QPixmap(400, 250)
        pixmap.fill(Qt.white)
        painter = QPainter(pixmap)
        painter.setRenderHint(QPainter.Antialiasing)
        # Title
        painter.setFont(QFont("Arial", 14, QFont.Bold))
        painter.drawText(pixmap.rect().adjusted(0, 10, 0, 0), Qt.AlignTop | Qt.AlignHCenter, title)
        # Chart icon
        painter.setFont(QFont("Arial", 80))
        painter.drawText(pixmap.rect(), Qt.AlignCenter, emoji)
        # Border
        painter.setPen(QColor("#ddd"))
        painter.drawRect(pixmap.rect().adjusted(0, 0, -1, -1))
        painter.end()
        label.setPixmap(pixmap)

    def _selected_analysis_record(self):
        """Return the record chosen in the selector, or None.

        Shows a warning when nothing usable is selected.
        """
        selected_item = None
        if self.history_combo.currentIndex() >= 0:
            selected_item = self._find_history(self.history_combo.currentData())
        if selected_item is None:
            QMessageBox.warning(self, "选择记录", "请先选择一条历史记录")
        return selected_item

    def analyze_data(self):
        """Analyse the selected record (placeholder: only shows a summary)."""
        selected_item = self._selected_analysis_record()
        if selected_item is None:
            return
        extracted_data = selected_item.get('extracted_data', [])
        if not extracted_data:
            QMessageBox.warning(self, "分析失败", "所选记录没有可分析的数据")
            return
        # A real implementation would run the analysis here; for now only a
        # message is shown.
        QMessageBox.information(
            self,
            "数据分析",
            f"已对 '{selected_item['file_name']}' 进行数据分析\n"
            f"包含 {len(extracted_data)} 条记录"
        )

    def export_report(self):
        """Export the analysis report (placeholder).

        NOTE(review): no file is written yet; the method only simulates the
        export and then reports success.
        """
        selected_item = self._selected_analysis_record()
        if selected_item is None:
            return
        file_path, _ = QFileDialog.getSaveFileName(
            self, "保存分析报告", "", "PDF文件 (*.pdf)"
        )
        if not file_path:
            return
        file_path = self._ensure_suffix(file_path, '.pdf')
        try:
            # A real implementation would generate the PDF report here
            time.sleep(1)  # simulates the time needed to build the report
            QMessageBox.information(
                self,
                "导出成功",
                f"分析报告已成功导出到: {file_path}\n"
                f"包含对 '{selected_item['file_name']}' 的分析结果"
            )
        except Exception as e:
            QMessageBox.critical(self, "导出失败", f"导出过程中发生错误: {str(e)}")
if __name__ == "__main__":
    # Set up the application
    app = QApplication(sys.argv)
    # Use the Fusion widget style
    app.setStyle("Fusion")
    # Create and show the main window
    window = MainWindow()
    window.show()
    # Run the event loop and exit with its return code
    sys.exit(app.exec_())
# TODO: change the data-analysis feature to use a locally deployed AI large
# model for the analysis and for drawing the charts.
# Stray page text from the original listing ("Latest release"); not part of the program.