import extract_msg
import os
import re
import shutil
import hashlib
import difflib
from collections import defaultdict
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def preprocess_text(text):
    """Preprocess text: strip markup, signatures, and extra whitespace."""
    if not text:
        return ""
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Truncate at the standard "--" signature delimiter; everything below it
    # is signature (the original pattern r'--\s*$.*' could not match past the
    # newline, so it removed nothing)
    text = re.split(r'^\s*--\s*$', text, maxsplit=1, flags=re.MULTILINE)[0]
    # Remove common signature and closing lines
    signature_patterns = [
        r'^_{8,}.*$',                    # underscore separators
        r'^.*发自我的.*$',                # Chinese mobile signatures
        r'^.*Sent from my.*$',           # English mobile signatures
        r'^.*Best regards,.*$',          # English closings
        r'^.*此致敬礼.*$',                # Chinese closings
        r'^.*联系方式:.*$',              # contact info (Chinese)
        r'^.*Contact information:.*$',   # contact info (English)
    ]
    for pattern in signature_patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)
    # Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text
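# Reply chains are a common source of false "unique" results: the same message
# appears once on its own and again quoted inside a reply. Below is an optional
# helper (a minimal sketch, not called by the pipeline) that drops '>'-quoted
# lines and common reply headers before hashing, making thread deduplication
# stricter:
def strip_quoted_reply(text):
    """Remove '>'-quoted lines and common reply headers from an email body."""
    lines = []
    for line in text.splitlines():
        if line.lstrip().startswith('>'):
            continue  # quoted content from an earlier message
        if re.match(r'^On .+ wrote:\s*$', line.strip()):
            continue  # English reply header
        if re.match(r'^-+\s*(原始邮件|Original Message)\s*-+', line.strip(), re.IGNORECASE):
            continue  # forwarded/original-message separator
        lines.append(line)
    return '\n'.join(lines)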
def extract_email_content(msg_path, base_dir):
    """Extract and preprocess the content of a single .msg file."""
    try:
        msg = extract_msg.Message(msg_path)
        # Basic header fields, with placeholders when missing or empty
        sender = getattr(msg, 'sender', None) or "Unknown sender"
        to_recipients = getattr(msg, 'to', None) or "Unknown recipient"
        subject = getattr(msg, 'subject', None) or "No subject"
        body = getattr(msg, 'body', "") or ""
        # Preprocess the body text
        processed_body = preprocess_text(body)
        # Hash headers plus body for fast exact-duplicate comparison
        content_for_hash = f"{sender}|{to_recipients}|{subject}|{processed_body}"
        content_hash = hashlib.md5(content_for_hash.encode('utf-8', errors='ignore')).hexdigest()
        msg.close()
        return {
            'sender': sender,
            'to': to_recipients,
            'subject': subject,
            'body': processed_body,
            'body_hash': content_hash,
            'file_path': msg_path,
            'filename': os.path.basename(msg_path),
            # base_dir is passed in explicitly; the original relied on a
            # module-level global `directory` that only exists when run as a script
            'relative_path': os.path.relpath(msg_path, base_dir)
        }
    except Exception as e:
        logger.error(f"Error extracting email content from {msg_path}: {e}")
        return None
def find_all_msg_files(directory):
    """Recursively find all .msg files under a directory."""
    msg_files = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.msg'):
                msg_files.append(os.path.join(root, file))
    return msg_files
def group_emails_by_sender_subject(emails):
    """Pre-group emails by sender and subject to limit pairwise comparisons."""
    groups = defaultdict(list)
    for email in emails:
        if not email:
            continue
        # Key on sender + subject
        key = f"{email['sender']}|{email['subject']}"
        groups[key].append(email)
    return groups
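# Grouping on the raw subject misses replies and forwards of the same thread
# ("Re: X" vs "X"), so near-duplicates across a thread never land in the same
# group. A small normalization step (a sketch, not wired into the grouping
# above) strips common reply/forward prefixes before building the group key:
def normalize_subject(subject):
    """Strip 'Re:'/'Fwd:'/'回复:'/'转发:' prefixes and collapse whitespace."""
    subject = subject or ""
    # Repeatedly remove leading reply/forward markers (English and Chinese,
    # ASCII or fullwidth colon)
    pattern = r'^\s*((re|fw|fwd|答复|回复|转发)\s*[::]\s*)+'
    subject = re.sub(pattern, '', subject, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', subject).strip()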
def calculate_similarity(text1, text2):
    """Compute the similarity ratio between two texts with difflib."""
    if not text1 or not text2:
        return 0.0
    matcher = difflib.SequenceMatcher(None, text1, text2)
    return matcher.ratio()
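# SequenceMatcher.ratio() is quadratic in text length, which gets slow on long
# bodies. A common optimization (a minimal sketch, not part of the original
# pipeline) is to gate the expensive call behind difflib's cheaper estimates:
# real_quick_ratio() and quick_ratio() never underestimate ratio(), so if either
# is already below the threshold the pair can be rejected without the full pass.
def calculate_similarity_fast(text1, text2, threshold=0.85):
    """Return ratio() only when the cheap upper bounds clear the threshold."""
    if not text1 or not text2:
        return 0.0
    matcher = difflib.SequenceMatcher(None, text1, text2)
    # Both methods are documented upper bounds on ratio()
    if matcher.real_quick_ratio() < threshold or matcher.quick_ratio() < threshold:
        return 0.0
    return matcher.ratio()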
def find_similar_emails(emails, similarity_threshold=0.85):
    """Greedily cluster similar emails within a group."""
    similar_groups = []
    processed_hashes = set()
    for i, email1 in enumerate(emails):
        if email1['body_hash'] in processed_hashes:
            continue
        similar_emails = [email1]
        processed_hashes.add(email1['body_hash'])
        for email2 in emails[i + 1:]:
            if email2['body_hash'] in processed_hashes:
                continue
            # Compare body contents
            similarity = calculate_similarity(email1['body'], email2['body'])
            if similarity >= similarity_threshold:
                similar_emails.append(email2)
                processed_hashes.add(email2['body_hash'])
        if len(similar_emails) > 1:
            similar_groups.append(similar_emails)
    return similar_groups
def find_duplicate_emails(emails):
    """Find exact duplicates (same content hash)."""
    hash_to_emails = defaultdict(list)
    for email in emails:
        hash_to_emails[email['body_hash']].append(email)
    # Return only groups with more than one member
    return [group for group in hash_to_emails.values() if len(group) > 1]
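# The hash above is computed from parsed headers and body, so it also catches
# duplicates that were re-saved with different file metadata. If only
# byte-identical copies matter, hashing the raw file is cheaper because it
# skips MSG parsing entirely (a minimal alternative sketch, not used by the
# pipeline below):
def file_md5(path, chunk_size=1 << 20):
    """MD5 of the raw file bytes, read in chunks to bound memory use."""
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()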
def classify_emails_by_content_similarity(directory):
    """Classify and deduplicate emails by content similarity."""
    # Recursively collect all .msg files
    msg_files = find_all_msg_files(directory)
    total_emails = len(msg_files)
    logger.info(f"Found {total_emails} email files")
    # Extract content from every email
    emails = []
    for file_path in msg_files:
        email_content = extract_email_content(file_path, directory)
        if email_content:
            emails.append(email_content)
    # Pre-group by sender and subject
    sender_subject_groups = group_emails_by_sender_subject(emails)
    logger.info(f"Created {len(sender_subject_groups)} preliminary groups by sender and subject")
    # Top-level classification folder
    classification_dir = os.path.join(directory, "classified_by_content")
    os.makedirs(classification_dir, exist_ok=True)
    # Folder for exact duplicates
    duplicates_dir = os.path.join(classification_dir, "duplicates")
    os.makedirs(duplicates_dir, exist_ok=True)
    # Folder for near-duplicates
    similar_dir = os.path.join(classification_dir, "similar")
    os.makedirs(similar_dir, exist_ok=True)
    # Folder for unique emails
    unique_dir = os.path.join(classification_dir, "unique")
    os.makedirs(unique_dir, exist_ok=True)
    total_duplicates = 0
    total_similar = 0
    total_unique = 0
    # Process each sender-subject group
    for group_key, group_emails in sender_subject_groups.items():
        if len(group_emails) == 1:
            # Single-email group: classify directly as unique
            email = group_emails[0]
            dst_path = os.path.join(unique_dir, email['relative_path'])
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)
            shutil.copy2(email['file_path'], dst_path)
            total_unique += 1
            continue
        # Exact duplicates within the group
        duplicate_groups = find_duplicate_emails(group_emails)
        # Near-duplicates within the group
        similar_groups = find_similar_emails(group_emails)
        # Handle exact duplicates
        processed_hashes = set()
        for dup_group in duplicate_groups:
            # Sort by filename and keep a single copy
            sorted_duplicates = sorted(dup_group, key=lambda x: x['filename'])
            # Keep the first email in the unique folder
            kept_email = sorted_duplicates[0]
            dst_path = os.path.join(unique_dir, kept_email['relative_path'])
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)
            shutil.copy2(kept_email['file_path'], dst_path)
            total_unique += 1
            processed_hashes.add(kept_email['body_hash'])
            # Copy the remaining duplicates into the duplicates folder
            # (copy2, not move: the source tree is left untouched)
            for duplicate in sorted_duplicates[1:]:
                dst_path = os.path.join(duplicates_dir, duplicate['relative_path'])
                os.makedirs(os.path.dirname(dst_path), exist_ok=True)
                shutil.copy2(duplicate['file_path'], dst_path)
                total_duplicates += 1
                processed_hashes.add(duplicate['body_hash'])
        # Handle near-duplicates
        for sim_group in similar_groups:
            # Skip emails already handled as exact duplicates
            sim_group = [email for email in sim_group if email['body_hash'] not in processed_hashes]
            if len(sim_group) <= 1:
                continue
            # Sort by body length and keep the longest email
            sorted_similar = sorted(sim_group, key=lambda x: len(x['body']), reverse=True)
            # Keep the first email in the unique folder
            kept_email = sorted_similar[0]
            dst_path = os.path.join(unique_dir, kept_email['relative_path'])
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)
            shutil.copy2(kept_email['file_path'], dst_path)
            total_unique += 1
            processed_hashes.add(kept_email['body_hash'])
            # Copy the remaining similar emails into the similar folder
            for similar in sorted_similar[1:]:
                dst_path = os.path.join(similar_dir, similar['relative_path'])
                os.makedirs(os.path.dirname(dst_path), exist_ok=True)
                shutil.copy2(similar['file_path'], dst_path)
                total_similar += 1
                processed_hashes.add(similar['body_hash'])
        # Remaining emails in the group (neither duplicates nor similar)
        for email in group_emails:
            if email['body_hash'] not in processed_hashes:
                dst_path = os.path.join(unique_dir, email['relative_path'])
                os.makedirs(os.path.dirname(dst_path), exist_ok=True)
                shutil.copy2(email['file_path'], dst_path)
                total_unique += 1
    # Write the report
    report_file = os.path.join(classification_dir, "classification_report.txt")
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("Email Content Classification Report\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"Source directory: {directory}\n")
        f.write(f"Total emails: {total_emails}\n")
        f.write(f"Emails processed: {len(emails)}\n")
        f.write(f"Preliminary groups: {len(sender_subject_groups)}\n")
        f.write(f"Unique emails: {total_unique}\n")
        f.write(f"Duplicate emails: {total_duplicates}\n")
        f.write(f"Similar emails: {total_similar}\n\n")
        f.write("Category definitions:\n")
        f.write("- Unique: emails whose content is fully distinct\n")
        f.write("- Duplicate: emails with identical content\n")
        f.write("- Similar: emails with highly similar but not identical content\n")
    print("Email content classification finished!")
    print(f"Results saved under: {classification_dir}")
    print(f"Unique emails saved in: {unique_dir}")
    print(f"Duplicate emails saved in: {duplicates_dir}")
    print(f"Similar emails saved in: {similar_dir}")
    print(f"Report file: {report_file}")
    print(f"Total emails: {total_emails}")
    print(f"Emails processed: {len(emails)}")
    print(f"Unique email count: {total_unique}")
    print(f"Duplicate email count: {total_duplicates}")
    print(f"Similar email count: {total_similar}")
# Alternative using TF-IDF and cosine similarity (requires scikit-learn)
def advanced_similarity_analysis(emails, similarity_threshold=0.8):
    """Cluster similar emails with TF-IDF vectors and cosine similarity."""
    if not emails:
        return []
    # Collect the body texts
    texts = [email['body'] for email in emails]
    # Build the TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        max_features=10000,
        min_df=2,
        max_df=0.8,
        stop_words=None,    # a stop-word list can be supplied here
        ngram_range=(1, 2)  # unigrams and bigrams
    )
    # Compute the TF-IDF matrix
    try:
        tfidf_matrix = vectorizer.fit_transform(texts)
    except ValueError:
        # Vectorization can fail (e.g. empty vocabulary on tiny corpora);
        # fall back to the simple difflib method
        return find_similar_emails(emails, similarity_threshold)
    # Pairwise cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix)
    # Greedily cluster similar emails
    similar_groups = []
    processed_indices = set()
    for i in range(len(emails)):
        if i in processed_indices:
            continue
        similar_indices = [i]
        processed_indices.add(i)
        for j in range(i + 1, len(emails)):
            if j in processed_indices:
                continue
            if cosine_sim[i][j] >= similarity_threshold:
                similar_indices.append(j)
                processed_indices.add(j)
        if len(similar_indices) > 1:
            similar_groups.append([emails[idx] for idx in similar_indices])
    return similar_groups
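# For large mailboxes, all-pairs comparison (difflib, or the dense cosine
# matrix above) scales as O(n^2). A standard scalable approach to
# near-duplicate detection is MinHash with locality-sensitive hashing. The
# sketch below is an assumption, not part of the original script: it relies on
# the third-party `datasketch` package (pip install datasketch) and indexes
# word shingles so only likely-similar pairs are ever compared. For Chinese
# text, character n-grams would be a better shingle choice than whitespace
# tokens, since Chinese has no word-delimiting spaces.
def minhash_near_duplicates(emails, threshold=0.85, num_perm=128):
    """Group near-duplicate emails via MinHash LSH over word shingles."""
    from datasketch import MinHash, MinHashLSH  # third-party dependency
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}
    for idx, email in enumerate(emails):
        m = MinHash(num_perm=num_perm)
        for word in set(email['body'].split()):
            m.update(word.encode('utf-8'))
        key = str(idx)
        minhashes[key] = m
        lsh.insert(key, m)
    # Query each email; candidates share enough shingles to exceed the threshold
    groups, seen = [], set()
    for key, m in minhashes.items():
        if key in seen:
            continue
        candidates = [k for k in lsh.query(m) if k not in seen]
        seen.update(candidates)
        if len(candidates) > 1:
            groups.append([emails[int(k)] for k in candidates])
    return groups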
if __name__ == '__main__':
    directory = r'C:\code\PCGKB_test01\msg'
    # Set the log level (INFO is already the basicConfig default above)
    logger.setLevel(logging.INFO)
    # Run the content-similarity-based classification
    classify_emails_by_content_similarity(directory)

Help me adjust this code, and search the web for an effective way to deduplicate these emails.