import extract_msg
import os
import re
import shutil
import base64
import uuid
from collections import defaultdict
from datetime import datetime, timezone
import logging
from email.utils import parsedate_to_datetime
from pathlib import Path
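# Group Outlook .msg files into conversation threads, one subdirectory at a time.
# Messages are bucketed primarily by the Thread-Index (ConversationIndex) header,
# refined by In-Reply-To matching and normalized subjects, then copied into an
# "email_thread_classification" folder together with a per-directory report.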
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(os.path.join(r'C:\code\PCGKB_test01\msg\lixia', 'email_classification.log'),
encoding='utf-8'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
def find_all_msg_files(directory):
"""递归查找目录中的所有msg文件"""
msg_files = []
for root, dirs, files in os.walk(directory):
        # Skip thread-classification folders created by a previous run
        if "email_thread_classification" in root:
continue
for file in files:
if file.lower().endswith('.msg'):
full_path = os.path.join(root, file)
msg_files.append(full_path)
return msg_files
def extract_guid_from_index(conversation_index):
"""从ConversationIndex中提取GUID"""
try:
# ConversationIndex通常是base64编码的二进制数据
if isinstance(conversation_index, str):
# 尝试解码base64
binary_data = base64.b64decode(conversation_index)
# 根据Microsoft文档,GUID通常位于偏移量6处,长度为16字节
if len(binary_data) >= 22: # 6 + 16 = 22
guid_bytes = binary_data[6:22]
return str(uuid.UUID(bytes_le=guid_bytes))
except Exception as e:
logger.warning(f"提取GUID时出错: {e}")
return None
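# Assumed ConversationIndex layout (per MS-OXOMSG) that the offsets above rely on:
# a 22-byte header made of a 6-byte field (a reserved byte plus a truncated FILETIME)
# followed by the 16-byte conversation GUID; every reply appends a further 5-byte
# block, so bytes 6..22 stay identical for all messages in the same thread.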
def normalize_subject(subject):
"""归一化邮件主题"""
if not subject:
return "无主题"
# 去除常见的回复和转发前缀
prefixes = [
r'^(Re|Fwd|Fw|Aw|WG|VS|SV|TR|转发|回复|答复|回覆)[:\s\[\]]*',
r'^\$\$.*?\$\$[\s]*'
]
normalized = subject.strip()
for pattern in prefixes:
normalized = re.sub(pattern, '', normalized, flags=re.IGNORECASE)
# 去除多余空格并转换为小写
normalized = re.sub(r'\s+', ' ', normalized).strip().lower()
return normalized if normalized else "无主题"
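# Illustrative examples of the normalization above (hypothetical subjects):
#   normalize_subject("Re: Fwd: Quarterly Report")     -> "quarterly report"
#   normalize_subject("[External] Server Maintenance") -> "server maintenance"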
def parse_msg_files_in_directory(directory):
"""解析指定目录中的所有msg文件"""
msg_files = find_all_msg_files(directory)
messages = []
for file_path in msg_files:
try:
msg = extract_msg.Message(file_path)
            # Basic message metadata
            subject = getattr(msg, 'subject', 'no_subject') or 'no_subject'
            sender = getattr(msg, 'sender', 'unknown_sender') or 'unknown_sender'
            # Sent date: prefer the value parsed by extract_msg, fall back to the Date header
            date_val = getattr(msg, 'date', None)
            if not date_val and hasattr(msg, 'header') and msg.header:
                date_val = msg.header.get('Date')
            date_obj = None
            if isinstance(date_val, datetime):
                date_obj = date_val
            elif date_val:
                try:
                    date_obj = parsedate_to_datetime(str(date_val))
                except (TypeError, ValueError):
                    try:
                        date_obj = datetime.strptime(str(date_val).split(' (')[0], '%a, %d %b %Y %H:%M:%S %z')
                    except ValueError:
                        date_obj = None
            if date_obj is not None and date_obj.tzinfo is not None:
                # Normalize to naive UTC so dates from different sources compare and sort consistently
                date_obj = date_obj.astimezone(timezone.utc).replace(tzinfo=None)
            # ConversationIndex (Thread-Index header)
conversation_index = None
if hasattr(msg, 'header') and msg.header:
conversation_index = msg.header.get('Thread-Index')
if not conversation_index:
conversation_index = msg.header.get('Conversation-Index')
            # In-Reply-To and References headers
in_reply_to = None
references = None
if hasattr(msg, 'header') and msg.header:
in_reply_to = msg.header.get('In-Reply-To')
references = msg.header.get('References')
messages.append({
'file_path': file_path,
'filename': os.path.basename(file_path),
'relative_path': os.path.relpath(file_path, directory),
'subject': subject,
'sender': sender,
'date': date_obj,
'conversation_index': conversation_index,
'in_reply_to': in_reply_to,
'references': references,
'normalized_subject': normalize_subject(subject)
})
msg.close()
except Exception as e:
            logger.error(f"Failed to parse message file {file_path}: {e}")
return messages
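# Each parsed record is a plain dict; the values below are illustrative only:
# {
#     'file_path': r'C:\mail\inbox\reply.msg', 'filename': 'reply.msg',
#     'relative_path': 'inbox\\reply.msg', 'subject': 'Re: Project plan',
#     'sender': 'a@example.com', 'date': datetime(2024, 5, 6, 9, 30),
#     'conversation_index': 'AdX...', 'in_reply_to': '<id1@example.com>',
#     'references': '<id0@example.com>', 'normalized_subject': 'project plan',
# }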
def classify_emails_in_directory(directory):
"""对指定目录中的邮件进行线程分类"""
# 检查是否已经处理过
threads_dir = os.path.join(directory, "邮件线程分类")
if os.path.exists(threads_dir):
logger.info(f"目录 {directory} 已经处理过,跳过")
return 0, 0, 0
# 解析所有邮件
messages = parse_msg_files_in_directory(directory)
if not messages:
logger.info(f"目录 {directory} 中没有找到邮件文件")
return 0, 0, 0
logger.info(f"在目录 {directory} 中成功解析 {len(messages)} 封邮件")
    # Create the main thread-classification folder
    os.makedirs(threads_dir, exist_ok=True)
    # Thread buckets
    threads = defaultdict(list)
    # First pass: group by ConversationIndex
for msg in messages:
thread_key = None
        # Prefer the conversation index when it is available
        if msg['conversation_index']:
            # Extract the conversation GUID from the ConversationIndex
            conv_id = extract_guid_from_index(msg['conversation_index'])
            if conv_id:
                thread_key = f"conv_{conv_id}"
            else:
                # If the GUID cannot be extracted, hash the whole ConversationIndex instead
                thread_key = f"conv_{hash(msg['conversation_index']) & 0xFFFFFFFF}"
        else:
            # Otherwise fall back to the normalized subject
            thread_key = f"subj_{hash(msg['normalized_subject']) & 0xFFFFFFFF}"
        # Add the message to its thread
        threads[thread_key].append(msg)
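    # Note: hash() is salted per interpreter process (PYTHONHASHSEED), so the fallback
    # conv_/subj_ keys above are not stable across runs. A deterministic alternative
    # (sketch only, not wired in) would be e.g.:
    #   hashlib.md5(msg['normalized_subject'].encode('utf-8')).hexdigest()[:8]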
    # Second pass: refine the grouping using In-Reply-To and References
    # Build a lookup map for quick access to messages by path
    message_map = {msg['file_path']: msg for msg in messages}
    for msg in messages:
        # If the message carries In-Reply-To, try to locate the message it answers
        if msg['in_reply_to']:
            # Look for a plausible parent: same normalized subject and an earlier date
            for other_msg in messages:
                if (other_msg['file_path'] != msg['file_path'] and
                        other_msg['normalized_subject'] == msg['normalized_subject'] and
                        other_msg['date'] and msg['date'] and other_msg['date'] < msg['date']):
                    # Found a likely parent message
                    for thread_key, thread_msgs in threads.items():
                        if other_msg in thread_msgs:
                            # Move the current message into the parent's thread
                            for key in list(threads.keys()):
                                if msg in threads[key]:
                                    if key != thread_key:
                                        threads[key].remove(msg)
                                        threads[thread_key].append(msg)
                                    break
                            break
                    break
    # Statistics
multi_email_threads = {tid: msgs for tid, msgs in threads.items() if len(msgs) > 1}
multi_email_thread_count = len(multi_email_threads)
single_email_threads = len(threads) - multi_email_thread_count
    # Handle threads that contain more than one message
    for thread_id, thread_msgs in multi_email_threads.items():
        # Create a folder for the thread
        thread_folder = os.path.join(threads_dir, f"thread_{thread_id}")
        os.makedirs(thread_folder, exist_ok=True)
        # Sort by date
        sorted_messages = sorted(thread_msgs, key=lambda x: x['date'] if x['date'] else datetime.min)
        # Copy the messages into the thread folder
        for msg in sorted_messages:
            src_path = msg['file_path']
            dst_path = os.path.join(thread_folder, msg['filename'])
            # Skip if the target file already exists
            if not os.path.exists(dst_path):
                shutil.copy2(src_path, dst_path)
            else:
                logger.warning(f"File already exists, skipping copy: {dst_path}")
    # Folder for single-message threads
    single_emails_dir = os.path.join(threads_dir, "single_emails")
    os.makedirs(single_emails_dir, exist_ok=True)
    # Handle single-message threads
    for thread_id, thread_msgs in threads.items():
        if len(thread_msgs) == 1:
            msg = thread_msgs[0]
            src_path = msg['file_path']
            dst_path = os.path.join(single_emails_dir, msg['filename'])
            # Skip if the target file already exists
            if not os.path.exists(dst_path):
                shutil.copy2(src_path, dst_path)
            else:
                logger.warning(f"File already exists, skipping copy: {dst_path}")
    # Write the per-directory report
    report_file = os.path.join(threads_dir, "email_thread_classification_report.txt")
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("Email thread classification report\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"Source directory: {directory}\n")
        f.write(f"Total messages: {len(messages)}\n")
        f.write(f"Total threads: {len(threads)}\n")
        f.write(f"Multi-message threads: {multi_email_thread_count}\n")
        f.write(f"Single-message threads: {single_email_threads}\n\n")
        f.write("Multi-message thread details:\n")
        f.write("-" * 30 + "\n")
        for thread_id, thread_msgs in multi_email_threads.items():
            f.write(f"Thread {thread_id} ({len(thread_msgs)} messages):\n")
            # Sort by date
            sorted_messages = sorted(thread_msgs, key=lambda x: x['date'] if x['date'] else datetime.min)
            for i, msg in enumerate(sorted_messages):
                marker = " (latest)" if i == len(sorted_messages) - 1 else ""
                date_str = msg['date'].strftime("%Y-%m-%d %H:%M:%S") if msg['date'] else "no date"
                f.write(f"  - {msg['filename']} [{date_str}]{marker}\n")
            f.write("\n")
    logger.info(f"Thread classification for directory {directory} finished")
    logger.info(f"Results saved in: {threads_dir}")
    logger.info(f"Report file: {report_file}")
    logger.info(f"Total messages: {len(messages)}")
    logger.info(f"Multi-message threads: {multi_email_thread_count}")
    logger.info(f"Single-message threads: {single_email_threads}")
return len(messages), multi_email_thread_count, single_email_threads
def process_all_directories(base_directory):
"""处理基础目录下的所有子目录"""
base_path = Path(base_directory)
if not base_path.exists() or not base_path.is_dir():
        logger.error(f"Base directory does not exist or is not a directory: {base_directory}")
return
total_emails = 0
total_multi_threads = 0
total_single_threads = 0
processed_dirs = 0
    # Process only the immediate subdirectories; do not recurse
for item in os.listdir(base_directory):
item_path = os.path.join(base_directory, item)
if os.path.isdir(item_path):
            logger.info(f"Processing subdirectory: {item_path}")
            # Check whether the directory contains any .msg files
msg_files = find_all_msg_files(item_path)
if msg_files:
emails, multi_threads, single_threads = classify_emails_in_directory(item_path)
total_emails += emails
total_multi_threads += multi_threads
total_single_threads += single_threads
processed_dirs += 1
            else:
                logger.info(f"No .msg files in directory {item_path}, skipping")
    # Write the overall summary report
    summary_file = os.path.join(base_directory, "overall_email_thread_classification_report.txt")
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("Overall email thread classification report\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"Base directory: {base_directory}\n")
        f.write(f"Directories processed: {processed_dirs}\n")
        f.write(f"Total messages: {total_emails}\n")
        f.write(f"Total multi-message threads: {total_multi_threads}\n")
        f.write(f"Total single-message threads: {total_single_threads}\n\n")
        f.write("Processed directories:\n")
        f.write("-" * 30 + "\n")
        for item in os.listdir(base_directory):
            item_path = os.path.join(base_directory, item)
            if os.path.isdir(item_path) and os.path.exists(os.path.join(item_path, "email_thread_classification")):
                f.write(f"- {item_path}\n")
    logger.info("All directories processed")
    logger.info(f"Directories processed: {processed_dirs}")
    logger.info(f"Total messages: {total_emails}")
    logger.info(f"Total multi-message threads: {total_multi_threads}")
    logger.info(f"Total single-message threads: {total_single_threads}")
    logger.info(f"Summary report saved at: {summary_file}")
if __name__ == '__main__':
base_directory = r'C:\code\PCGKB_test01\msg\lixia'
    process_all_directories(base_directory)