thread msg 修正

本文详细阐述了线程池任务管理和消息循环机制的实现方式,通过实例展示了如何利用线程池进行任务分配和消息处理,提高了程序的并发处理能力。

DWORD WINAPI Thread_List_In(LPVOID lpParam)
{
 MSG msg;
 PeekMessage(&msg, NULL, WM_USER, WM_USER, PM_NOREMOVE);
 while(1)
 {
  if(GetMessage(&msg,0,0,0)) //get msg from message queue
  {
   switch(msg.message)
   {
   case MY_MSG:
    char * pInfo = (char *)msg.wParam;
    printf("recv %s\n",pInfo);
    WaitForSingleObject(hMutex,INFINITE);
    task_list_out = task_list;
    task_list.clear();
    ReleaseMutex(hMutex);
    Sleep(100);
    delete[] pInfo;
    break;
   }
  }
  for(int i=0;i<30000;i++)
  {
   task_info ti;
   ti.set_task_id("1");
   
   task_list.push_back(ti);
  }
  //WaitForSingleObject(hMutex,INFINITE);
  /*task_pool tp = get_free_pool();
  for(int i=0;i<300;i++)
  {
   task_info *ti = new task_info();
   ti->set_task_id("1");

   tp.task_list->push_back(*ti);
  }
  tp.list_state = 1;*/
  //ReleaseMutex(hMutex);
 }

 //WaitForSingleObject(hMutex,INFINITE);

 ////while(true)
 //{
 // for(int i=0;i<30000;i++)
 // {
 //  task_info *ti = new task_info();
 //  ti->set_task_id("1");

 //  task_list->push_back(*ti);
 // }
 // if(task_list->size()>8000)
 // {
 //  Sleep(1000);
 // }
 //}

 //ReleaseMutex(hMutex);

 return 0;
}

// Sender thread: once per second prints the size of task_list_out and posts a
// MY_MSG thread message carrying a freshly allocated text ("msg_<n>") to the
// thread identified by the global ThreadID.
//
// Ownership of the text buffer passes to the receiver when the post succeeds;
// if PostThreadMessage fails the buffer is freed here.
//
// lpParam is unused. The loop never exits, so the trailing return is only
// there to satisfy the thread-procedure signature.
DWORD WINAPI Thread_List_Out(LPVOID lpParam)
{
    (void)lpParam; // not used by this thread

    // "msg_" (4) + widest 32-bit int "-2147483648" (11) + NUL (1) = 16.
    // The previous 10-byte buffer overflowed once the counter reached 100000.
    const size_t kMsgBufLen = 16;

    int i = 0;
    while (1)
    {
        // Thread_List_In rewrites task_list_out while holding hMutex, so the
        // size is sampled under the same mutex and printed after releasing it.
        WaitForSingleObject(hMutex, INFINITE);
        size_t pending = task_list_out.size();
        ReleaseMutex(hMutex);

        cout << "size--------- :" << pending << endl;

        char *pInfo = new char[kMsgBufLen]; // dynamic payload for the message
        sprintf(pInfo, "msg_%d", i++);
        if (!PostThreadMessage(ThreadID, MY_MSG, (WPARAM)pInfo, 0)) // post thread msg
        {
            // GetLastError returns a DWORD (unsigned long), hence %lu.
            printf("post message failed,errno:%lu\n", ::GetLastError());
            delete[] pInfo; // the receiver never saw the buffer; free it here
        }

        Sleep(1000);
    }

    return 0;
}

import base64
import hashlib
import logging
import os
import re
import shutil
import uuid
from collections import defaultdict
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path

# Logging setup.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Folder created inside every classified directory. It is excluded from all
# scans so that copies written by one pass are never picked up as input again.
OUTPUT_DIR_NAME = "邮件线程分类"
# Sub-folder of the output folder that collects single-message threads.
SINGLE_DIR_NAME = "单独邮件"
# Placeholder used when a message has no usable subject.
NO_SUBJECT = "无主题"

# One or more leading reply/forward markers ("Re:", "Fwd:", "Re[2]:", ...) or
# bracketed tags ("[list]"). A colon is required after a marker so that
# ordinary words such as "Report" or "Transfer" are left untouched.
_SUBJECT_PREFIX_RE = re.compile(
    r"^(?:\s*(?:(?:re|fwd|fw|aw|wg|vs|sv|tr|转发|回复|答复|回覆)"
    r"\s*(?:\[\d+\])?\s*[::]|\[[^\]]*\]))+\s*",
    re.IGNORECASE,
)
_MESSAGE_ID_RE = re.compile(r"<[^<>\s]+>")


def find_all_msg_files(directory):
    """Recursively collect the paths of all ``.msg`` files under *directory*.

    Folders named ``OUTPUT_DIR_NAME`` are skipped. The result is sorted per
    directory so repeated runs see the files in the same order.
    """
    msg_files = []
    for root, dirs, files in os.walk(directory):
        # In-place pruning stops os.walk from descending into output folders.
        dirs[:] = sorted(d for d in dirs if d != OUTPUT_DIR_NAME)
        msg_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.lower().endswith('.msg')
        )
    return msg_files


def extract_guid_from_index(conversation_index):
    """Extract the conversation GUID from a base64 ConversationIndex string.

    The decoded value is expected to start with a 22-byte header whose bytes
    6..21 hold the GUID. Returns the GUID as a string, or ``None`` when the
    input is not a string, cannot be decoded, or is too short.
    """
    if not isinstance(conversation_index, str):
        return None
    try:
        binary_data = base64.b64decode(conversation_index)
    except ValueError as e:  # binascii.Error is a ValueError subclass
        logger.warning(f"提取GUID时出错: {e}")
        return None
    if len(binary_data) < 22:  # 6 + 16 = 22
        return None
    return str(uuid.UUID(bytes_le=binary_data[6:22]))


def normalize_subject(subject):
    """Normalize a subject line for thread comparison.

    Strips every leading reply/forward marker and bracketed tag, collapses
    whitespace and lower-cases the rest. Returns ``NO_SUBJECT`` when nothing
    is left.
    """
    if not subject:
        return NO_SUBJECT
    stripped = _SUBJECT_PREFIX_RE.sub('', subject.strip())
    normalized = re.sub(r'\s+', ' ', stripped).strip().lower()
    return normalized if normalized else NO_SUBJECT


def _as_aware(moment):
    """Return *moment* as a timezone-aware datetime (naive values are taken as UTC)."""
    if moment.tzinfo is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment


def _parse_date(value):
    """Convert a datetime or an RFC 2822 date string to an aware datetime, else ``None``."""
    if not value:
        return None
    if isinstance(value, datetime):
        return _as_aware(value)
    text = str(value)
    try:
        parsed = parsedate_to_datetime(text)
    except (TypeError, ValueError):  # the exception type depends on the Python version
        try:
            parsed = datetime.strptime(text.split(' (')[0], '%a, %d %b %Y %H:%M:%S %z')
        except ValueError:
            return None
    return _as_aware(parsed)


def _stable_digest(text):
    """Return a short hex digest of *text* that is identical across runs.

    The built-in ``hash()`` is randomized per process for strings, which would
    give the same thread a different folder name on every run.
    """
    return hashlib.sha256(text.encode('utf-8')).hexdigest()[:12]


def _extract_message_ids(value):
    """Return the ``<...>`` message ids found in a header value, in order."""
    if not value:
        return []
    text = str(value)
    found = _MESSAGE_ID_RE.findall(text)
    if found:
        return found
    bare = text.strip()
    return [bare] if bare else []


def parse_msg_files_in_directory(directory):
    """Parse every ``.msg`` file under *directory* into a list of dicts.

    Each dict has the keys ``file_path``, ``filename``, ``relative_path``,
    ``subject``, ``sender``, ``date`` (aware datetime or ``None``),
    ``conversation_index``, ``in_reply_to``, ``references`` and ``message_id``.
    Files that cannot be parsed are logged and skipped.
    """
    msg_files = find_all_msg_files(directory)
    if not msg_files:
        return []

    # Third-party dependency, imported only when there is something to parse
    # so the pure helpers of this module stay usable without it.
    import extract_msg

    messages = []
    for file_path in msg_files:
        msg = None
        try:
            msg = extract_msg.Message(file_path)
            header = getattr(msg, 'header', None)

            def header_value(name):
                return header.get(name) if header else None

            # NOTE(review): 'sent', 'date' and 'messageId' are read
            # defensively with getattr; confirm which of them the installed
            # extract_msg version actually provides.
            raw_date = (
                getattr(msg, 'sent', None)
                or getattr(msg, 'date', None)
                or header_value('Date')
            )
            messages.append({
                'file_path': file_path,
                'filename': os.path.basename(file_path),
                'relative_path': os.path.relpath(file_path, directory),
                'subject': getattr(msg, 'subject', NO_SUBJECT) or NO_SUBJECT,
                'sender': getattr(msg, 'sender', '未知发件人') or '未知发件人',
                'date': _parse_date(raw_date),
                'conversation_index': (
                    header_value('Thread-Index') or header_value('Conversation-Index')
                ),
                'in_reply_to': header_value('In-Reply-To'),
                'references': header_value('References'),
                'message_id': (
                    header_value('Message-ID') or getattr(msg, 'messageId', None)
                ),
            })
        except Exception as e:  # one broken file must not abort the whole run
            logger.error(f"解析邮件文件 {file_path} 时出错: {e}")
        finally:
            if msg is not None:
                msg.close()  # release the file even when parsing failed
    return messages


def _initial_thread_key(msg, normalized_subject):
    """Return the first-pass thread key: conversation GUID if present, else subject."""
    index = msg.get('conversation_index')
    if index:
        guid = extract_guid_from_index(index)
        if guid:
            return f"conv_{guid}"
        return f"conv_{_stable_digest(str(index))}"
    return f"subj_{_stable_digest(normalized_subject)}"


def _find_parent(msg, messages, by_message_id, subjects):
    """Return the message that *msg* replies to, or ``None``.

    Message ids from In-Reply-To and References are tried first (nearest
    ancestor first). If none of them is present in this batch and the message
    has an In-Reply-To header, fall back to the first earlier message with the
    same normalized subject.
    """
    for header in (msg.get('in_reply_to'), msg.get('references')):
        for message_id in reversed(_extract_message_ids(header)):
            parent = by_message_id.get(message_id)
            if parent is not None and parent is not msg:
                return parent

    if not msg.get('in_reply_to') or not msg.get('date'):
        return None
    subject = subjects[msg['file_path']]
    sent = _as_aware(msg['date'])
    for other in messages:
        if other is msg or not other.get('date'):
            continue
        if subjects[other['file_path']] == subject and _as_aware(other['date']) < sent:
            return other
    return None


def group_messages_into_threads(messages):
    """Group parsed message dicts into threads.

    Pass 1 keys every message by conversation GUID, or by normalized subject
    when it has no conversation index. Pass 2 moves each reply into the
    thread of its parent (see ``_find_parent``).

    Returns a dict mapping thread key to the list of its messages.
    """
    subjects = {m['file_path']: normalize_subject(m['subject']) for m in messages}
    threads = defaultdict(list)
    thread_of = {}  # file_path -> current thread key
    for msg in messages:
        key = _initial_thread_key(msg, subjects[msg['file_path']])
        threads[key].append(msg)
        thread_of[msg['file_path']] = key

    by_message_id = {}
    for msg in messages:
        for message_id in _extract_message_ids(msg.get('message_id')):
            by_message_id.setdefault(message_id, msg)

    for msg in messages:
        parent = _find_parent(msg, messages, by_message_id, subjects)
        if parent is None:
            continue
        source = thread_of[msg['file_path']]
        target = thread_of[parent['file_path']]
        if source == target:
            continue
        threads[source].remove(msg)
        if not threads[source]:
            del threads[source]
        threads[target].append(msg)
        thread_of[msg['file_path']] = target
    return dict(threads)


def _copy_messages(folder, msgs):
    """Copy *msgs* into *folder*, suffixing names that collide within this call."""
    used = set()
    for msg in msgs:
        stem, ext = os.path.splitext(msg['filename'])
        name = msg['filename']
        counter = 1
        # Files from different sub-folders may share a name; compare
        # case-insensitively so the copy is also safe on Windows.
        while name.lower() in used:
            name = f"{stem}_{counter}{ext}"
            counter += 1
        used.add(name.lower())
        shutil.copy2(msg['file_path'], os.path.join(folder, name))


def classify_emails_in_directory(directory):
    """Classify all e-mails under *directory* into thread folders and write a report.

    Output goes to ``<directory>/邮件线程分类``: one folder per multi-message
    thread, a ``单独邮件`` folder for single-message threads, and a text report.

    Returns ``(total_emails, multi_message_threads, single_message_threads)``;
    ``(0, 0, 0)`` when no message could be parsed.
    """
    messages = parse_msg_files_in_directory(directory)
    if not messages:
        logger.info(f"目录 {directory} 中没有找到邮件文件")
        return 0, 0, 0

    logger.info(f"在目录 {directory} 中成功解析 {len(messages)} 封邮件")

    threads_dir = os.path.join(directory, OUTPUT_DIR_NAME)
    os.makedirs(threads_dir, exist_ok=True)

    threads = group_messages_into_threads(messages)
    multi_email_threads = {tid: msgs for tid, msgs in threads.items() if len(msgs) > 1}
    single_messages = [msgs[0] for msgs in threads.values() if len(msgs) == 1]
    multi_email_thread_count = len(multi_email_threads)
    single_email_threads = len(single_messages)

    for thread_id, thread_msgs in multi_email_threads.items():
        thread_folder = os.path.join(threads_dir, f"线程_{thread_id}")
        os.makedirs(thread_folder, exist_ok=True)
        _copy_messages(thread_folder, thread_msgs)

    single_emails_dir = os.path.join(threads_dir, SINGLE_DIR_NAME)
    os.makedirs(single_emails_dir, exist_ok=True)
    _copy_messages(single_emails_dir, single_messages)

    # Undated messages sort before every dated one.
    earliest = datetime.min.replace(tzinfo=timezone.utc)

    def sort_key(item):
        return _as_aware(item['date']) if item['date'] else earliest

    report_file = os.path.join(threads_dir, "邮件线程分类报告.txt")
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("邮件线程分类报告\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"源目录: {directory}\n")
        f.write(f"总邮件数: {len(messages)}\n")
        f.write(f"总线程数: {len(threads)}\n")
        f.write(f"多邮件线程数: {multi_email_thread_count}\n")
        f.write(f"单邮件线程数: {single_email_threads}\n\n")
        f.write("多邮件线程详情:\n")
        f.write("-" * 30 + "\n")
        for thread_id, thread_msgs in multi_email_threads.items():
            f.write(f"线程 {thread_id} (共 {len(thread_msgs)} 封邮件):\n")
            # The most recently sent message of the thread gets a marker.
            last_msg = max(
                (m for m in thread_msgs if m['date']),
                key=sort_key,
                default=None,
            )
            for msg in sorted(thread_msgs, key=sort_key):
                marker = " (最后发送)" if msg is last_msg else ""
                date_str = (
                    msg['date'].strftime("%Y-%m-%d %H:%M:%S")
                    if msg['date'] else "无日期信息"
                )
                f.write(f" - {msg['filename']} [{date_str}]{marker}\n")
            f.write("\n")

    logger.info(f"目录 {directory} 邮件线程分类完成!")
    logger.info(f"分类结果保存在: {threads_dir}")
    logger.info(f"报告文件: {report_file}")
    logger.info(f"总邮件数: {len(messages)}")
    logger.info(f"多邮件线程数: {multi_email_thread_count}")
    logger.info(f"单邮件线程数: {single_email_threads}")

    return len(messages), multi_email_thread_count, single_email_threads


def process_all_directories(base_directory):
    """Classify *base_directory* and each of its sub-directories, then write a summary.

    Every directory is classified over its whole subtree, so a message is
    counted once for each processed ancestor directory.
    NOTE(review): confirm that this per-directory recursion is the intended
    meaning of the totals.
    """
    if not Path(base_directory).is_dir():
        logger.error(f"基础目录不存在或不是目录: {base_directory}")
        return

    # Snapshot the sub-directories BEFORE classifying anything: classification
    # creates new folders, and walking while creating them would keep
    # discovering (and re-classifying) its own output.
    subdirectories = []
    for root, dirs, _files in os.walk(base_directory):
        dirs[:] = sorted(d for d in dirs if d != OUTPUT_DIR_NAME)
        subdirectories.extend(os.path.join(root, d) for d in dirs)

    total_emails = 0
    total_multi_threads = 0
    total_single_threads = 0
    processed = []

    logger.info(f"处理基础目录: {base_directory}")
    emails, multi_threads, single_threads = classify_emails_in_directory(base_directory)
    total_emails += emails
    total_multi_threads += multi_threads
    total_single_threads += single_threads
    processed.append(base_directory)

    for dir_path in subdirectories:
        logger.info(f"处理子目录: {dir_path}")
        if not find_all_msg_files(dir_path):
            logger.info(f"目录 {dir_path} 中没有msg文件,跳过")
            continue
        emails, multi_threads, single_threads = classify_emails_in_directory(dir_path)
        total_emails += emails
        total_multi_threads += multi_threads
        total_single_threads += single_threads
        processed.append(dir_path)

    processed_dirs = len(processed)

    summary_file = os.path.join(base_directory, "总体邮件线程分类报告.txt")
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("总体邮件线程分类报告\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"基础目录: {base_directory}\n")
        f.write(f"处理的目录数量: {processed_dirs}\n")
        f.write(f"总邮件数: {total_emails}\n")
        f.write(f"总多邮件线程数: {total_multi_threads}\n")
        f.write(f"总单邮件线程数: {total_single_threads}\n\n")
        f.write("处理完成的目录:\n")
        f.write("-" * 30 + "\n")
        for dir_path in processed:
            f.write(f"- {dir_path}\n")

    logger.info("所有目录处理完成!")
    logger.info(f"处理的目录数量: {processed_dirs}")
    logger.info(f"总邮件数: {total_emails}")
    logger.info(f"总多邮件线程数: {total_multi_threads}")
    logger.info(f"总单邮件线程数: {total_single_threads}")
    logger.info(f"总体报告保存在: {summary_file}")


if __name__ == '__main__':
    base_directory = r'C:\code\PCGKB_test01\msg'
    process_all_directories(base_directory)

# Poster's original request accompanying this script:
# "The code raises errors, please help me fix it."
09-05
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值