Threads and Collections

本文探讨了在Java中使用不同类型的集合类时所遇到的线程安全性问题。详细介绍了线程安全的集合类如Vector、Stack、Hashtable等,并提到了通过同步机制使非线程安全的集合类如HashSet、HashMap等变得线程安全的方法。

 

Ever since JDK 1.2, collections have been widely used to store data in our programs. These Collection classes are often used to store data which is shared by different threads. Hence comes the question of which collections are thread-safe, i.e. which collection classes have synchronized operations.
If a collection class is not thread-safe and we intend to use it in a multithreaded program, then we need to synchronize it explicitly (or apply a synchronized wrapper class).
Some thread safe collection classes are:

  • Vector
  • Stack
  • Hashtable
  • ConcurrentHashMap
  • ConcurrentLinkedQueue
  • CopyOnWriteArrayList
  • CopyOnWriteArraySet


Thread Unsafe Collection Classes: HashSet, TreeSet, HashMap, TreeMap, WeakHashMap, IdentityHashMap, LinkedHashMap etc.
Synchronization when working with Collections in multithreaded programs can be achieved by:

  • Usage of a thread safe collection (:-) Seems Simple Huh...)
  • Managing synchronization explicitly by providing synchronized access to certain blocks of code.
  • Usage of a synchronized version of a Thread-Unsafe collection class
    • Map m = Collections.synchronizedMap(new HashMap());
    • Look into the synchronizedXxx methods (e.g. synchronizedMap, synchronizedList) of the Collections class for more details.
import extract_msg
import os
import re
import shutil
import base64
import uuid
from collections import defaultdict
from datetime import datetime
import logging
from email.utils import parsedate_to_datetime
from pathlib import Path

# Log to both a UTF-8 file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(
            os.path.join(r'C:\code\PCGKB_test01\msg\lixia', 'email_classification.log'),
            encoding='utf-8'
        ),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def find_all_msg_files(directory):
    """Recursively collect the paths of all .msg files under *directory*.

    Any directory whose path contains the output folder name "邮件线程分类"
    is skipped, so previously classified copies are not picked up again.

    Returns a list of absolute file paths.
    """
    msg_files = []
    for root, dirs, files in os.walk(directory):
        # Skip the thread-classification output directories created earlier.
        if "邮件线程分类" in root:
            continue
        for file in files:
            if file.lower().endswith('.msg'):
                msg_files.append(os.path.join(root, file))
    return msg_files


def extract_guid_from_index(conversation_index):
    """Extract the conversation GUID from a base64-encoded ConversationIndex.

    Per the MS-OXOMSG ConversationIndex layout, the 16-byte GUID starts at
    byte offset 6 of the decoded value. Returns the GUID as a string, or
    None when the value cannot be decoded or is too short.
    """
    try:
        if isinstance(conversation_index, str):
            binary_data = base64.b64decode(conversation_index)
            if len(binary_data) >= 22:  # 6-byte header + 16-byte GUID
                guid_bytes = binary_data[6:22]
                return str(uuid.UUID(bytes_le=guid_bytes))
    except Exception as e:
        # Best-effort extraction: callers fall back to hashing the raw index.
        logger.warning(f"提取GUID时出错: {e}")
    return None


def normalize_subject(subject):
    """Normalize an email subject for thread grouping.

    Strips common reply/forward prefixes (Re:, Fwd:, 转发, 回复, ...),
    collapses whitespace and lower-cases the result. Empty or all-prefix
    subjects become "无主题".
    """
    if not subject:
        return "无主题"
    prefixes = [
        r'^(Re|Fwd|Fw|Aw|WG|VS|SV|TR|转发|回复|答复|回覆)[:\s\[\]]*',
        r'^\$\$.*?\$\$[\s]*'
    ]
    normalized = subject.strip()
    for pattern in prefixes:
        normalized = re.sub(pattern, '', normalized, flags=re.IGNORECASE)
    # Collapse runs of whitespace and lower-case for case-insensitive matching.
    normalized = re.sub(r'\s+', ' ', normalized).strip().lower()
    return normalized if normalized else "无主题"


def parse_msg_files_in_directory(directory):
    """Parse every .msg file under *directory* into a metadata dict.

    Each dict holds path info, subject, sender, parsed date, the
    Thread-Index/Conversation-Index header, In-Reply-To / References
    headers and the normalized subject. Files that fail to parse are
    logged and skipped. Returns the list of dicts.
    """
    messages = []
    for file_path in find_all_msg_files(directory):
        try:
            msg = extract_msg.Message(file_path)
            subject = getattr(msg, 'subject', '无主题') or '无主题'
            sender = getattr(msg, 'sender', '未知发件人') or '未知发件人'

            # Sent date: prefer the library's parsed 'sent' attribute, then
            # fall back to the raw Date header.
            date_str = getattr(msg, 'sent', None)
            if not date_str and hasattr(msg, 'header') and msg.header:
                date_str = msg.header.get('Date')
            date_obj = None
            if date_str:
                try:
                    date_obj = parsedate_to_datetime(str(date_str))
                except (TypeError, ValueError):
                    try:
                        # Drop a trailing " (TZ comment)" before strptime.
                        date_obj = datetime.strptime(
                            str(date_str).split(' (')[0],
                            '%a, %d %b %Y %H:%M:%S %z'
                        )
                    except ValueError:
                        date_obj = None

            # Threading headers, when the raw header object is available.
            conversation_index = None
            in_reply_to = None
            references = None
            if hasattr(msg, 'header') and msg.header:
                conversation_index = (msg.header.get('Thread-Index')
                                      or msg.header.get('Conversation-Index'))
                in_reply_to = msg.header.get('In-Reply-To')
                references = msg.header.get('References')

            messages.append({
                'file_path': file_path,
                'filename': os.path.basename(file_path),
                'relative_path': os.path.relpath(file_path, directory),
                'subject': subject,
                'sender': sender,
                'date': date_obj,
                'conversation_index': conversation_index,
                'in_reply_to': in_reply_to,
                'references': references,
                'normalized_subject': normalize_subject(subject)
            })
            msg.close()
        except Exception as e:
            logger.error(f"解析邮件文件 {file_path} 时出错: {e}")
    return messages


def classify_emails_in_directory(directory):
    """Group the .msg files in *directory* into conversation threads.

    Threads are keyed primarily by the GUID extracted from the
    ConversationIndex header, falling back to a hash of the raw index,
    then to a hash of the normalized subject. A heuristic second pass
    re-attaches replies (messages with In-Reply-To) to the thread of an
    earlier message with the same normalized subject. Multi-message
    threads are copied into per-thread folders under "邮件线程分类";
    single-message threads go to "单独邮件"; a text report is written
    alongside.

    Returns (total_emails, multi_thread_count, single_thread_count);
    (0, 0, 0) when the directory was already processed or holds no mail.
    """
    threads_dir = os.path.join(directory, "邮件线程分类")
    if os.path.exists(threads_dir):
        logger.info(f"目录 {directory} 已经处理过,跳过")
        return 0, 0, 0

    messages = parse_msg_files_in_directory(directory)
    if not messages:
        logger.info(f"目录 {directory} 中没有找到邮件文件")
        return 0, 0, 0
    logger.info(f"在目录 {directory} 中成功解析 {len(messages)} 封邮件")

    os.makedirs(threads_dir, exist_ok=True)

    # Pass 1: bucket by conversation id (normalized subject as fallback).
    threads = defaultdict(list)
    for msg in messages:
        if msg['conversation_index']:
            conv_id = extract_guid_from_index(msg['conversation_index'])
            if conv_id:
                thread_key = f"conv_{conv_id}"
            else:
                # GUID not extractable: hash the raw index instead.
                thread_key = f"conv_{hash(msg['conversation_index']) & 0xFFFFFFFF}"
        else:
            thread_key = f"subj_{hash(msg['normalized_subject']) & 0xFFFFFFFF}"
        threads[thread_key].append(msg)

    # Pass 2: merge replies into the thread of a plausible parent message
    # (same normalized subject, strictly earlier date).
    for msg in messages:
        if not msg['in_reply_to']:
            continue
        for other_msg in messages:
            if (other_msg['file_path'] != msg['file_path']
                    and other_msg['normalized_subject'] == msg['normalized_subject']
                    and other_msg['date'] and msg['date']
                    and other_msg['date'] < msg['date']):
                for thread_key, thread_msgs in threads.items():
                    if other_msg in thread_msgs:
                        # Move msg out of whatever thread it landed in.
                        for key in list(threads.keys()):
                            if msg in threads[key]:
                                if key != thread_key:
                                    threads[key].remove(msg)
                                    threads[thread_key].append(msg)
                                break
                        break
                break

    multi_email_threads = {tid: msgs for tid, msgs in threads.items() if len(msgs) > 1}
    multi_email_thread_count = len(multi_email_threads)
    single_email_threads = len(threads) - multi_email_thread_count

    # Copy each multi-message thread into its own folder, ordered by date.
    # NOTE: loop variable is thread_msgs, NOT `messages` — the original
    # shadowed `messages` here, corrupting the totals reported below.
    for thread_id, thread_msgs in multi_email_threads.items():
        thread_folder = os.path.join(threads_dir, f"线程_{thread_id}")
        os.makedirs(thread_folder, exist_ok=True)
        # Dateless messages sort first via datetime.min.
        sorted_messages = sorted(thread_msgs,
                                 key=lambda x: x['date'] if x['date'] else datetime.min)
        for msg in sorted_messages:
            dst_path = os.path.join(thread_folder, msg['filename'])
            if not os.path.exists(dst_path):
                shutil.copy2(msg['file_path'], dst_path)
            else:
                logger.warning(f"文件已存在,跳过复制: {dst_path}")

    # Single-message threads are collected in one shared folder.
    single_emails_dir = os.path.join(threads_dir, "单独邮件")
    os.makedirs(single_emails_dir, exist_ok=True)
    for thread_id, thread_msgs in threads.items():
        if len(thread_msgs) == 1:
            msg = thread_msgs[0]
            dst_path = os.path.join(single_emails_dir, msg['filename'])
            if not os.path.exists(dst_path):
                shutil.copy2(msg['file_path'], dst_path)
            else:
                logger.warning(f"文件已存在,跳过复制: {dst_path}")

    # Write the per-directory report. `messages` still holds ALL parsed
    # mail here (shadowing bug fixed), so the totals are correct.
    report_file = os.path.join(threads_dir, "邮件线程分类报告.txt")
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(f"邮件线程分类报告\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"源目录: {directory}\n")
        f.write(f"总邮件数: {len(messages)}\n")
        f.write(f"总线程数: {len(threads)}\n")
        f.write(f"多邮件线程数: {multi_email_thread_count}\n")
        f.write(f"单邮件线程数: {single_email_threads}\n\n")
        f.write("多邮件线程详情:\n")
        f.write("-" * 30 + "\n")
        for thread_id, thread_msgs in multi_email_threads.items():
            f.write(f"线程 {thread_id} (共 {len(thread_msgs)} 封邮件):\n")
            sorted_messages = sorted(thread_msgs,
                                     key=lambda x: x['date'] if x['date'] else datetime.min)
            for i, msg in enumerate(sorted_messages):
                marker = " (最后发送)" if i == len(sorted_messages) - 1 else ""
                date_str = msg['date'].strftime("%Y-%m-%d %H:%M:%S") if msg['date'] else "无日期信息"
                f.write(f" - {msg['filename']} [{date_str}]{marker}\n")
            f.write("\n")

    logger.info(f"目录 {directory} 邮件线程分类完成!")
    logger.info(f"分类结果保存在: {threads_dir}")
    logger.info(f"报告文件: {report_file}")
    logger.info(f"总邮件数: {len(messages)}")
    logger.info(f"多邮件线程数: {multi_email_thread_count}")
    logger.info(f"单邮件线程数: {single_email_threads}")

    return len(messages), multi_email_thread_count, single_email_threads


def process_all_directories(base_directory):
    """Run thread classification for every direct subdirectory of *base_directory*.

    Subdirectories without any .msg file are skipped. Writes an overall
    summary report into *base_directory* and logs aggregate counts.
    """
    base_path = Path(base_directory)
    if not base_path.exists() or not base_path.is_dir():
        logger.error(f"基础目录不存在或不是目录: {base_directory}")
        return

    total_emails = 0
    total_multi_threads = 0
    total_single_threads = 0
    processed_dirs = 0

    # Only direct children are iterated; classification itself recurses.
    for item in os.listdir(base_directory):
        item_path = os.path.join(base_directory, item)
        if os.path.isdir(item_path):
            logger.info(f"处理子目录: {item_path}")
            msg_files = find_all_msg_files(item_path)
            if msg_files:
                emails, multi_threads, single_threads = classify_emails_in_directory(item_path)
                total_emails += emails
                total_multi_threads += multi_threads
                total_single_threads += single_threads
                processed_dirs += 1
            else:
                logger.info(f"目录 {item_path} 中没有msg文件,跳过")

    summary_file = os.path.join(base_directory, "总体邮件线程分类报告.txt")
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write(f"总体邮件线程分类报告\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"基础目录: {base_directory}\n")
        f.write(f"处理的目录数量: {processed_dirs}\n")
        f.write(f"总邮件数: {total_emails}\n")
        f.write(f"总多邮件线程数: {total_multi_threads}\n")
        f.write(f"总单邮件线程数: {total_single_threads}\n\n")
        f.write("处理完成的目录:\n")
        f.write("-" * 30 + "\n")
        for item in os.listdir(base_directory):
            item_path = os.path.join(base_directory, item)
            if os.path.isdir(item_path) and os.path.exists(os.path.join(item_path, "邮件线程分类")):
                f.write(f"- {item_path}\n")

    logger.info(f"所有目录处理完成!")
    logger.info(f"处理的目录数量: {processed_dirs}")
    logger.info(f"总邮件数: {total_emails}")
    logger.info(f"总多邮件线程数: {total_multi_threads}")
    logger.info(f"总单邮件线程数: {total_single_threads}")
    logger.info(f"总体报告保存在: {summary_file}")


if __name__ == '__main__':
    base_directory = r'C:\code\PCGKB_test01\msg\lixia'
    process_all_directories(base_directory)
09-06
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值