import extract_msg
import os
import re
import logging
from pathlib import Path
from datetime import datetime
import html2text
# Configure logging once at import time: INFO level, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def _read_html_body(msg):
    """Return the HTML body of *msg* as text, or '' when it is unavailable."""
    try:
        html_body = getattr(msg, 'htmlBody', None)
    except Exception as e:
        logger.warning(f"读取HTML正文失败: {e}")
        return ''
    # NOTE(review): extract_msg is expected to expose htmlBody as bytes;
    # the charset is not known here, so undecodable bytes are replaced.
    if isinstance(html_body, bytes):
        return html_body.decode('utf-8', errors='replace')
    return html_body or ''


def extract_email_content(msg_path, sharepoint_base_url=""):
    """Extract one .msg file and render it as a formatted text report.

    Args:
        msg_path: Path of the Outlook .msg file to read.
        sharepoint_base_url: Optional base URL; when given, the report's
            storage address is "<base>/<file name>", otherwise the local path.

    Returns:
        The formatted report as a string, or None when the message could not
        be read (the error is logged).
    """
    msg = None
    try:
        msg = extract_msg.Message(msg_path)

        # Basic header fields, with placeholders for missing/empty values.
        subject = getattr(msg, 'subject', None) or '无主题'
        sender = getattr(msg, 'sender', None) or '未知发件人'
        to_recipients = getattr(msg, 'to', None) or '未知收件人'
        cc_recipients = getattr(msg, 'cc', None) or '无抄送'

        # extract_msg exposes the send time as `date`; `sent` is kept only as
        # a fallback for objects that happen to provide it.
        date_sent = getattr(msg, 'date', None) or getattr(msg, 'sent', None)
        if hasattr(date_sent, 'strftime'):
            date_str = date_sent.strftime('%Y-%m-%d %H:%M:%S')
        elif date_sent:
            date_str = str(date_sent)
        else:
            date_str = '未知时间'

        # Attachment names: a missing long name must not break the join below.
        attachment_names = []
        for attachment in getattr(msg, 'attachments', None) or []:
            name = (getattr(attachment, 'longFilename', None)
                    or getattr(attachment, 'shortFilename', None))
            attachment_names.append(str(name) if name else '未命名附件')
        attachments_str = ', '.join(attachment_names) if attachment_names else '无附件'

        # Prefer the plain-text body; fall back to the HTML body when empty.
        body = getattr(msg, 'body', '') or ''
        body_is_html = False
        if not body.strip():
            body = _read_html_body(msg)
            body_is_html = bool(body)

        lowered = body.lower()
        if body_is_html or '<html' in lowered or '<body' in lowered:
            try:
                converter = html2text.HTML2Text()
                converter.ignore_links = False
                converter.ignore_images = True
                body = converter.handle(body)
            except Exception as e:
                logger.warning(f"HTML转换失败: {e}")

        # Mark quoted/forwarded history inside the body.
        processed_body = process_email_body(body)

        # Storage address: avoid a double slash when the base URL ends in '/'.
        file_name = os.path.basename(msg_path)
        if sharepoint_base_url:
            sharepoint_url = f"{sharepoint_base_url.rstrip('/')}/{file_name}"
        else:
            sharepoint_url = f"本地路径: {msg_path}"

        # Leading and trailing empty items reproduce the blank first/last line
        # of the report.
        report_lines = [
            "",
            f"主题: {subject}",
            f"发件人: {sender}",
            f"收件人: {to_recipients}",
            f"抄送人: {cc_recipients}",
            f"发送时间: {date_str}",
            f"附件名: {attachments_str}",
            "邮件正文:",
            f"{processed_body}",
            f"邮件保存地址: {sharepoint_url}",
            "",
        ]
        return "\n".join(report_lines)
    except Exception as e:
        logger.error(f"提取邮件内容时出错 {msg_path}: {e}")
        return None
    finally:
        # Always release the file handle, including on the error path.
        if msg is not None:
            try:
                msg.close()
            except Exception as e:
                logger.warning(f"关闭邮件文件失败 {msg_path}: {e}")
# Lines that start a quoted/forwarded message. CJK labels accept both the
# ASCII and the full-width colon.
_HISTORY_HEADER_RE = re.compile(
    '|'.join([
        r'^From:\s*',
        r'^发件人[::]\s*',
        r'^差出人[::]\s*',
        r'^寄件者[::]\s*',
        r'^Date:\s*',
        r'^日期[::]\s*',
        r'^Sent:\s*',
        r'^发送时间[::]\s*',
        r'^-----原始邮件-----',
        r'^--- Original Message ---',
    ]),
    flags=re.MULTILINE | re.IGNORECASE,
)
# A blank (whitespace-only) line between two non-empty stretches of text.
_BLANK_LINE_RE = re.compile(r'\n[ \t\r]*\n')
_SUB_EMAIL_MARKER = "\n\n== Sub Email ==\n\n"


def process_email_body(body):
    """Insert a "== Sub Email ==" marker before each quoted historical mail.

    A historical mail starts at a line matching one of the known header or
    "original message" patterns. Header lines that directly follow another
    header line (no blank line in between) belong to the same quoted mail and
    do not start a new section.

    Args:
        body: Plain-text mail body; may be empty or None.

    Returns:
        "无正文内容" for an empty body, the body unchanged when no history is
        found, otherwise the body with a marker before each historical mail.
    """
    if not body:
        return "无正文内容"

    # Offsets at which a new historical mail begins.
    split_points = []
    for match in _HISTORY_HEADER_RE.finditer(body):
        start = match.start()
        # Same header block as the previous split: keep it in one section.
        if split_points and not _BLANK_LINE_RE.search(body, split_points[-1], start):
            continue
        split_points.append(start)

    if not split_points:
        return body

    # Text before the first split is the current mail; each later slice keeps
    # its own separator line because slicing starts at the match itself.
    pieces = [body[:split_points[0]]]
    bounds = split_points + [len(body)]
    for begin, end in zip(bounds, bounds[1:]):
        pieces.append(_SUB_EMAIL_MARKER + body[begin:end])
    return ''.join(pieces)
def process_single_email_folder(folder_path, sharepoint_base_url=""):
    """Write one .txt report next to every .msg file directly inside a folder.

    Args:
        folder_path: Folder whose direct .msg children are processed.
        sharepoint_base_url: Passed through to extract_email_content.
    """
    logger.info(f"开始处理单独邮件文件夹: {folder_path}")

    # Collect the .msg files (case-insensitive extension match).
    found = []
    for entry in os.listdir(folder_path):
        if not entry.lower().endswith('.msg'):
            continue
        found.append(os.path.join(folder_path, entry))
        logger.info(f"找到邮件文件: {entry}")

    if len(found) == 0:
        logger.warning(f"在文件夹 {folder_path} 中没有找到邮件文件")
        return
    logger.info(f"在单独邮件文件夹中找到 {len(found)} 个邮件文件")

    for source_path in found:
        report = extract_email_content(source_path, sharepoint_base_url)
        if not report:
            logger.warning(f"无法提取邮件内容: {source_path}")
            continue

        # The report keeps the mail's base name and lives in the same folder.
        source_name = os.path.basename(source_path)
        stem, _extension = os.path.splitext(source_name)
        target_path = os.path.join(folder_path, f"{stem}.txt")

        try:
            with open(target_path, 'w', encoding='utf-8') as handle:
                handle.write(report)
            logger.info(f"已提取: {os.path.basename(source_path)} -> {os.path.basename(target_path)}")
        except Exception as e:
            logger.error(f"写入文件失败 {target_path}: {e}")
def _thread_sort_key(msg_file):
    """Return a sort key for *msg_file*: undated mails first, then by send time.

    Dated mails sort as (1, POSIX timestamp). Mails whose send time cannot be
    read sort before them as (0, file modification time).
    """
    from email.utils import parsedate_to_datetime

    try:
        msg = extract_msg.Message(msg_file)
        try:
            # extract_msg exposes the send time as `date`; `sent` is a fallback.
            sent_time = getattr(msg, 'date', None) or getattr(msg, 'sent', None)
        finally:
            msg.close()
        # NOTE(review): some extract_msg versions appear to return the date as
        # an RFC 2822 string rather than a datetime — confirm for the
        # installed version.
        if isinstance(sent_time, str):
            sent_time = parsedate_to_datetime(sent_time)
        if hasattr(sent_time, 'timestamp'):
            # Timestamps compare safely even when naive and aware datetimes mix.
            return (1, sent_time.timestamp())
    except Exception as e:
        logger.warning(f"获取邮件时间失败 {msg_file}: {e}")

    try:
        modified = os.path.getmtime(msg_file)
    except OSError:
        modified = 0.0
    return (0, modified)


def _write_thread_part(folder_path, folder_name, part_index, pieces):
    """Write one merged output part; return True on success, False on failure."""
    output_filename = os.path.join(folder_path, f"{folder_name}_合并_{part_index}.txt")
    try:
        with open(output_filename, 'w', encoding='utf-8') as output_file:
            output_file.writelines(pieces)
    except OSError as e:
        logger.error(f"写入文件失败 {output_filename}: {e}")
        return False
    logger.info(f"创建输出文件: {output_filename}")
    return True


def process_thread_folder(folder_path, sharepoint_base_url="", max_chars=50000):
    """Merge all .msg files of a thread folder into numbered .txt parts.

    Mails are ordered by send time (undated mails first) and concatenated
    into "<folder>_合并_<n>.txt" files inside the same folder. A new part is
    started when adding the next mail would exceed *max_chars*; a single mail
    larger than the limit still gets a part of its own.

    Args:
        folder_path: Folder whose direct .msg children are merged.
        sharepoint_base_url: Passed through to extract_email_content.
        max_chars: Soft character limit per output part.
    """
    logger.info(f"开始处理线程文件夹: {folder_path}")

    msg_files = []
    for file in sorted(os.listdir(folder_path)):
        if file.lower().endswith('.msg'):
            msg_files.append(os.path.join(folder_path, file))
            logger.info(f"找到邮件文件: {file}")
    if not msg_files:
        logger.warning(f"在线程文件夹 {folder_path} 中没有找到邮件文件")
        return
    logger.info(f"在线程文件夹中找到 {len(msg_files)} 个邮件文件")

    msg_files.sort(key=_thread_sort_key)
    logger.info("已按发送时间排序邮件")

    # normpath keeps the folder name non-empty when the path ends in a slash.
    folder_name = os.path.basename(os.path.normpath(folder_path))
    separator = "\n\n===another Email====\n\n"
    header = (
        f"邮件线程提取报告 - {folder_name}\n"
        f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
    )

    parts_written = 0
    pending = []        # pieces of the part currently being assembled
    pending_chars = 0
    processed_count = 0
    write_failed = False

    for i, msg_file in enumerate(msg_files, start=1):
        logger.info(f"处理文件 {i}/{len(msg_files)}: {os.path.basename(msg_file)}")
        email_content = extract_email_content(msg_file, sharepoint_base_url)
        if not email_content:
            continue
        processed_count += 1

        # Flush only a non-empty part, so an oversized first mail cannot skip
        # part number 1.
        if pending and pending_chars + len(separator) + len(email_content) > max_chars:
            logger.info("达到字符限制,创建新文件部分")
            if not _write_thread_part(folder_path, folder_name, parts_written + 1, pending):
                write_failed = True
                break
            parts_written += 1
            pending, pending_chars = [], 0

        if pending:
            piece = separator + email_content
        elif parts_written == 0:
            # Very first mail of the thread: prepend the report header.
            piece = header + email_content
        else:
            # First mail of a later part: no leading separator.
            piece = email_content
        pending.append(piece)
        pending_chars += len(piece)

    if pending and not write_failed:
        if _write_thread_part(folder_path, folder_name, parts_written + 1, pending):
            parts_written += 1

    logger.info(f"线程文件夹处理完成! 共处理 {processed_count} 个邮件,生成 {parts_written} 个输出文件")
def _classify_email_folder(relative_parts):
    """Classify a folder as 'thread' or 'single' from its path below the root.

    The nearest path component that carries a keyword decides, so sub-folders
    inherit the type of a "单独邮件" or "线程" ancestor. Folders without any
    keyword are treated as 'single'.
    """
    for part in reversed(relative_parts):
        if "单独邮件" in part:
            return "single"
        lowered = part.lower()
        if "线程" in part or "thread" in lowered or "conversation" in lowered:
            return "thread"
    return "single"


def process_directory(directory, sharepoint_base_url=""):
    """Extract every .msg file under *directory*, including all sub-folders.

    Each folder that directly contains .msg files is processed exactly once.
    Thread folders are merged into combined files by process_thread_folder;
    every other folder gets one .txt per mail from
    process_single_email_folder.

    Args:
        directory: Root directory to scan recursively.
        sharepoint_base_url: Passed through to the folder processors.
    """
    if not os.path.isdir(directory):
        logger.error(f"目录不存在: {directory}")
        return
    logger.info(f"开始处理目录: {directory}")

    single_count = 0
    thread_count = 0
    for current_dir, sub_dirs, file_names in os.walk(directory):
        sub_dirs.sort()  # deterministic traversal order
        if not any(name.lower().endswith('.msg') for name in file_names):
            continue

        relative = os.path.relpath(current_dir, directory)
        relative_parts = [] if relative == os.curdir else relative.split(os.sep)
        folder_kind = _classify_email_folder(relative_parts)

        # One unreadable folder must not abort the whole walk.
        try:
            if folder_kind == "thread":
                logger.info(f"识别为线程文件夹: {current_dir}")
                thread_count += 1
                process_thread_folder(current_dir, sharepoint_base_url)
            else:
                logger.info(f"识别为普通文件夹: {current_dir}")
                single_count += 1
                process_single_email_folder(current_dir, sharepoint_base_url)
        except Exception as e:
            logger.error(f"处理文件夹失败 {current_dir}: {e}")

    if single_count == 0:
        logger.warning("未找到单独邮件文件夹")
    if thread_count == 0:
        logger.warning("未找到线程文件夹")
    logger.info("所有处理完成! 提取的txt文件保存在各自原目录中")
if __name__ == "__main__":
    # Root directory that holds the exported .msg files.
    directory = r'C:\code\PCGKB_test01\msg\lixia'
    # SharePoint base URL used for the storage address in each report (if any).
    sharepoint_base_url = "https://your-sharepoint-site.com/documents"
    # Requirement (translated from the note that was pasted after the call,
    # where it made this line a syntax error): extraction must cover every
    # directory and sub-directory under the path. The "单独邮件" (single mail)
    # and "线程" (thread) folders in particular must be extracted according
    # to their type. Reported problem: nothing was being extracted.
    process_directory(directory, sharepoint_base_url)