182. Duplicate Emails

LeetCode: the Duplicate Emails problem
This post covers the LeetCode problem of finding duplicate email addresses and gives two MySQL solutions: one counts occurrences in a subquery (a derived temporary table) and filters on that count; the other groups by email and filters the groups with a HAVING clause.

Original problem: https://leetcode.com/problems/duplicate-emails/description/
Such a simple problem, and I still couldn't solve it... my SQL clearly isn't fluent enough. So I went and looked at the official solution:
Two approaches:
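
The task, in short: given a Person table with Id and Email columns, return every email address that appears more than once. A minimal sketch for reproducing it locally (the schema and sample rows below are assumptions based on the problem statement, not copied from it):

# Assumed schema and sample data for local testing
create table Person (Id int primary key, Email varchar(255));
insert into Person values (1, 'a@b.com'), (2, 'c@d.com'), (3, 'a@b.com');
# expected output: a@b.com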

# Write your MySQL query statement below

# Approach 1: count per email in a derived (temporary) table, then filter on the count
select Email from
(select Email, count(Email) as num from Person group by Email) temp
where num > 1;

# Approach 2: group by email and filter the groups directly with HAVING
select Email from Person group by Email having count(Email) > 1;
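
For comparison, two other ways the same result can be expressed, sketched under the assumption of the standard Person(Id, Email) schema (the window-function version requires MySQL 8.0+):

# Self-join: an email is a duplicate if it appears on two rows with different Ids
select distinct p1.Email
from Person p1
join Person p2 on p1.Email = p2.Email and p1.Id <> p2.Id;

# Window function (MySQL 8.0+): attach each email's occurrence count to every row, then filter
select distinct Email
from (select Email, count(*) over (partition by Email) as num from Person) t
where num > 1;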

Reposted from: https://www.cnblogs.com/optor/p/8622784.html
