Filemapping 与 Semaphore

本文探讨了在C#中使用Semaphore对象进行线程同步的方法,强调了使用后关闭Semaphore以避免内存泄漏的重要性。同时介绍了如何利用Global\前缀的共享内存对象实现服务进程间的交互。

 m_Read = Semaphore.OpenExisting(spname);

用过之后要记得关闭,否则会造成内存泄漏。

 m_Read.Close();

 

要想在服务进程之间交互,可使用 Global\\Erist.Remote.ShareMemory 这样带 Global\ 前缀的全局共享内存对象。

转载于:https://www.cnblogs.com/erist/p/10142565.html

# -*- coding: utf-8 -*-
"""Watch a directory for flight-data spreadsheets (csv/xls/xlsx), fill in the
aircraft-model column from a callsign-prefix -> model mapping, and save the
files back in place.

Reconstructed from a single-line paste and fixed:
  * bare ``except:`` clauses narrowed to ``except Exception``
  * saving Excel files used ``engine='xlrd'`` — xlrd is a *read-only*
    engine, so every ``.xls`` save raised; the engine argument is now left
    to pandas for ``.xls`` and ``openpyxl`` is used for ``.xlsx``
  * ``processing_files.remove(...)`` replaced with ``discard(...)`` so a
    race between worker threads cannot raise ``KeyError``
"""

import os
import time
import pandas as pd
import csv
import chardet
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import logging
from datetime import datetime
import re
import threading
import queue
from concurrent.futures import ThreadPoolExecutor, as_completed

# Log to both a UTF-8 file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("flight_data_processor.log", encoding='utf-8'),
        logging.StreamHandler()
    ]
)

# Built-in fallback mapping: callsign prefix -> aircraft model.
DEFAULT_AIRCRAFT_MAPPING = {
    "ABC": "F-35",
    "DEF": "Boeing 737",
    "GHI": "Airbus A320",
    "JKL": "Cessna 172",
    "MNO": "F-16",
}

# Mapping actually used at runtime; replaced by the Excel file when present.
AIRCRAFT_MAPPING = {}

# ==================== tuning knobs ====================
MAX_WORKERS = 50          # thread-pool size
MAX_QUEUE_SIZE = 200      # pending-file queue capacity (avoids queue-full stalls)
BATCH_PROCESS_SIZE = 100  # files handled per startup batch

# Thread-safe queue of file paths waiting to be processed.
file_queue = queue.Queue(maxsize=MAX_QUEUE_SIZE)
# Shared worker pool for all file-processing tasks.
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
# Paths currently being processed (guarded by processing_lock).
processing_files = set()
processing_lock = threading.Lock()
# Cap concurrent file reads to limit peak memory use.
file_read_semaphore = threading.Semaphore(20)
# ======================================================


def load_mapping_from_excel(mapping_file_path):
    """Load the callsign -> model mapping from an Excel file.

    Falls back to DEFAULT_AIRCRAFT_MAPPING when the file is missing, the
    expected columns cannot be found, or any read error occurs.
    """
    global AIRCRAFT_MAPPING
    if not os.path.exists(mapping_file_path):
        logging.warning(f"映射文件不存在: {mapping_file_path},使用默认映射")
        AIRCRAFT_MAPPING = DEFAULT_AIRCRAFT_MAPPING.copy()
        return
    try:
        # xlrd only supports the legacy .xls format; openpyxl handles .xlsx.
        if mapping_file_path.endswith('.xls'):
            df_mapping = pd.read_excel(mapping_file_path, engine='xlrd')
        else:
            df_mapping = pd.read_excel(mapping_file_path, engine='openpyxl')

        # Locate the callsign and model columns by (Chinese or English) name.
        callsign_col = None
        model_col = None
        for col in df_mapping.columns:
            col_lower = str(col).lower()
            if '呼号' in col_lower or 'callsign' in col_lower:
                callsign_col = col
            elif '型号' in col_lower or 'model' in col_lower:
                model_col = col
        if callsign_col is None or model_col is None:
            logging.error("在映射文件中找不到呼号列或型号列")
            AIRCRAFT_MAPPING = DEFAULT_AIRCRAFT_MAPPING.copy()
            return

        new_mapping = {}
        for _, row in df_mapping.iterrows():
            callsign = str(row[callsign_col]).strip() if pd.notna(row[callsign_col]) else ""
            model = str(row[model_col]).strip() if pd.notna(row[model_col]) else ""
            if callsign and model:
                new_mapping[callsign] = model

        if new_mapping:
            AIRCRAFT_MAPPING = new_mapping
            logging.info(f"成功加载 {len(AIRCRAFT_MAPPING)} 条映射记录")
        else:
            AIRCRAFT_MAPPING = DEFAULT_AIRCRAFT_MAPPING.copy()
    except Exception as e:
        logging.error(f"加载映射文件失败: {str(e)}")
        AIRCRAFT_MAPPING = DEFAULT_AIRCRAFT_MAPPING.copy()


def reload_mapping_if_updated(mapping_file_path, last_modified_time):
    """Reload the mapping file if its mtime is newer than *last_modified_time*.

    Returns the mtime to remember for the next check (unchanged on error).
    """
    try:
        if os.path.exists(mapping_file_path):
            current_modified_time = os.path.getmtime(mapping_file_path)
            if current_modified_time > last_modified_time:
                logging.info("检测到映射文件更新,重新加载...")
                load_mapping_from_excel(mapping_file_path)
                return current_modified_time
    except Exception as e:
        logging.error(f"检查映射文件更新失败: {str(e)}")
    return last_modified_time


class FileHandler(FileSystemEventHandler):
    """Watchdog handler: queues new/modified spreadsheets for processing and
    hot-reloads the mapping file when it changes."""

    def __init__(self, directory_to_watch, mapping_file_path):
        self.directory_to_watch = directory_to_watch
        self.mapping_file_path = mapping_file_path
        self.mapping_last_modified = (
            os.path.getmtime(mapping_file_path) if os.path.exists(mapping_file_path) else 0
        )
        self.processed_files = set()
        # Process whatever is already in the directory at startup.
        self.process_existing_files_batch()

    def process_existing_files_batch(self):
        """Process files already present in the watched directory, in batches."""
        all_files = []
        for filename in os.listdir(self.directory_to_watch):
            file_path = os.path.join(self.directory_to_watch, filename)
            # Skip directories, unsupported types, Office lock files (~$...)
            # and the mapping file itself.
            if (os.path.isfile(file_path)
                    and self.is_supported_file(filename)
                    and not filename.startswith('~$')
                    and file_path != self.mapping_file_path):
                all_files.append(file_path)

        for i in range(0, len(all_files), BATCH_PROCESS_SIZE):
            batch_files = all_files[i:i + BATCH_PROCESS_SIZE]
            logging.info(f"开始处理第 {i//BATCH_PROCESS_SIZE + 1} 批文件,共 {len(batch_files)} 个文件")
            self.process_files_concurrently(batch_files)

    def on_created(self, event):
        """Queue newly created data files."""
        if not event.is_directory:
            file_path = event.src_path
            if (self.is_supported_file(file_path)
                    and not os.path.basename(file_path).startswith('~$')
                    and file_path != self.mapping_file_path):
                time.sleep(0.5)  # give the writer a moment to finish the file
                self.add_file_to_queue(file_path)

    def on_modified(self, event):
        """Queue modified data files; hot-reload the mapping file."""
        if not event.is_directory:
            file_path = event.src_path
            if file_path == self.mapping_file_path:
                try:
                    self.mapping_last_modified = reload_mapping_if_updated(
                        file_path, self.mapping_last_modified)
                except Exception as e:
                    logging.error(f"处理映射文件更新失败: {str(e)}")
            elif (self.is_supported_file(file_path)
                    and not os.path.basename(file_path).startswith('~$')):
                time.sleep(0.5)
                self.add_file_to_queue(file_path)

    def add_file_to_queue(self, file_path):
        """Add a file to the processing queue unless it is already known."""
        with processing_lock:
            if (file_path not in self.processed_files
                    and file_path not in processing_files
                    and file_queue.qsize() < MAX_QUEUE_SIZE - 10):  # keep headroom
                try:
                    file_queue.put_nowait(file_path)
                    logging.debug(f"已添加文件到队列: {os.path.basename(file_path)}")
                except queue.Full:
                    logging.warning("处理队列已满,等待空间释放")

    def process_files_concurrently(self, file_paths):
        """Submit a batch of files to the pool and wait for completion."""
        futures = []
        for file_path in file_paths:
            if file_path not in self.processed_files:
                futures.append(executor.submit(self.process_file_wrapper, file_path))
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                logging.error(f"文件处理失败: {str(e)}")

    def process_file_wrapper(self, file_path):
        """Thread-pool entry point: dedupe, throttle reads, track state."""
        with processing_lock:
            if file_path in processing_files:
                return
            processing_files.add(file_path)
        try:
            with file_read_semaphore:  # bound concurrent reads
                self.process_file(file_path)
            with processing_lock:
                self.processed_files.add(file_path)
                processing_files.discard(file_path)  # discard: race-safe
        except Exception as e:
            with processing_lock:
                processing_files.discard(file_path)
            logging.error(f"处理文件 {os.path.basename(file_path)} 时出错: {str(e)}")

    def is_supported_file(self, file_path):
        """True for .csv/.xls/.xlsx files (case-insensitive)."""
        return any(file_path.lower().endswith(ext) for ext in ['.csv', '.xls', '.xlsx'])

    def detect_encoding(self, file_path):
        """Guess a file's text encoding from its first 10 KB; default utf-8."""
        try:
            with open(file_path, 'rb') as f:
                raw_data = f.read(10000)
            result = chardet.detect(raw_data)
            return result['encoding'] if result['confidence'] > 0.7 else 'utf-8'
        except Exception:
            return 'utf-8'

    def find_callsign_column(self, df):
        """Find the callsign column: exact name, then substring, then by
        sampling values against a letters-then-digits pattern."""
        possible_names = ['呼号', 'callsign', 'call sign', 'flight',
                          'flightnumber', 'flight number']
        for col in df.columns:
            col_lower = str(col).lower().strip()
            for name in possible_names:
                if name.lower() == col_lower:
                    return col
        for col in df.columns:
            col_lower = str(col).lower().strip()
            for name in possible_names:
                if name.lower() in col_lower:
                    return col
        # Heuristic fallback: >50% of sampled values look like "AB123".
        for col in df.columns:
            if df[col].dtype == 'object':
                sample_values = df[col].dropna().head(10)
                if len(sample_values) > 0:
                    call_sign_pattern = re.compile(r'^[A-Za-z]{2,}\d{2,}')
                    matches = sum(1 for val in sample_values
                                  if call_sign_pattern.match(str(val)))
                    if matches / len(sample_values) > 0.5:
                        return col
        return None

    def find_model_column(self, df):
        """Find the aircraft-model column by exact then substring name match."""
        possible_names = ['机型', '型号', 'model', 'aircraft', 'type', 'aircraft type']
        for col in df.columns:
            col_lower = str(col).lower().strip()
            for name in possible_names:
                if name.lower() == col_lower:
                    return col
        for col in df.columns:
            col_lower = str(col).lower().strip()
            for name in possible_names:
                if name.lower() in col_lower:
                    return col
        return None

    def is_unknown_value(self, value):
        """True when *value* is NaN/None or a textual "unknown" placeholder."""
        if pd.isna(value) or value is None:
            return True
        value_str = str(value).strip().lower()
        unknown_patterns = ['未知', 'unknown', 'none', 'null', 'nan', '']
        return any(pattern == value_str for pattern in unknown_patterns)

    def process_file(self, file_path):
        """Read one spreadsheet, fill unknown model cells from
        AIRCRAFT_MAPPING by callsign prefix, and save it back in place."""
        file_ext = os.path.splitext(file_path)[1].lower()
        try:
            if file_ext == '.csv':
                encodings_to_try = ['utf-8', 'gbk', 'gb2312', 'latin-1',
                                    'utf-8-sig', 'cp936']
                df = None
                for encoding in encodings_to_try:
                    try:
                        df = pd.read_csv(file_path, encoding=encoding)
                        break
                    except Exception:
                        continue
                if df is None:
                    raise Exception("无法用任何编码读取CSV文件")
            else:
                # Try the matching engine first, then the other as a fallback.
                try:
                    if file_ext == '.xls':
                        df = pd.read_excel(file_path, engine='xlrd')
                    else:
                        df = pd.read_excel(file_path, engine='openpyxl')
                except Exception:
                    try:
                        if file_ext == '.xls':
                            df = pd.read_excel(file_path, engine='openpyxl')
                        else:
                            df = pd.read_excel(file_path, engine='xlrd')
                    except Exception:
                        raise
        except Exception as e:
            logging.error(f"读取文件 {os.path.basename(file_path)} 失败: {str(e)}")
            return

        callsign_col = self.find_callsign_column(df)
        if callsign_col is None:
            logging.warning(f"文件 {os.path.basename(file_path)} 中没有找到呼号列")
            return

        model_col = self.find_model_column(df)
        if model_col is None:
            # Looser substring search before creating a new column.
            model_col_patterns = ['型号', 'model', 'aircraft', 'type']
            for col in df.columns:
                col_lower = str(col).lower().strip()
                for pattern in model_col_patterns:
                    if pattern.lower() in col_lower:
                        model_col = col
                        break
                if model_col:
                    break
        if model_col is None:
            model_col = '机型'
            df[model_col] = '未知'

        updated_count = 0
        unknown_count = 0
        skipped_count = 0
        for index, row in df.iterrows():
            callsign = str(row[callsign_col]) if pd.notna(row[callsign_col]) else ""
            if not callsign.strip():
                skipped_count += 1
                continue
            current_model = row[model_col] if pd.notna(row[model_col]) else ""
            # Only fill cells that are empty or marked "unknown".
            if not self.is_unknown_value(current_model):
                skipped_count += 1
                continue
            aircraft_model = None
            for prefix, model in AIRCRAFT_MAPPING.items():
                if callsign.startswith(prefix):
                    aircraft_model = model
                    break
            if aircraft_model:
                df.at[index, model_col] = aircraft_model
                updated_count += 1
            else:
                df.at[index, model_col] = "未知"
                unknown_count += 1

        try:
            if file_ext == '.csv':
                df.to_csv(file_path, index=False, encoding='utf-8-sig')
            elif file_ext == '.xls':
                # BUG FIX: the original passed engine='xlrd' here, but xlrd is
                # read-only and cannot write — every .xls save failed. Let
                # pandas pick an available writer instead.
                df.to_excel(file_path, index=False)
            else:
                df.to_excel(file_path, index=False, engine='openpyxl')
            logging.info(
                f"文件 {os.path.basename(file_path)} 处理完成: "
                f"更新{updated_count}, 未知{unknown_count}, 跳过{skipped_count}")
        except Exception as e:
            logging.error(f"保存文件 {os.path.basename(file_path)} 失败: {str(e)}")


def process_queue():
    """Queue-drain loop run by several daemon threads: pull up to 10 queued
    files at a time, fan them out to the pool, and wait for each batch."""
    while True:
        try:
            files_to_process = []
            for _ in range(min(10, file_queue.qsize())):
                try:
                    files_to_process.append(file_queue.get_nowait())
                except queue.Empty:
                    break
            if files_to_process:
                futures = [executor.submit(event_handler.process_file_wrapper, fp)
                           for fp in files_to_process]
                for future in as_completed(futures):
                    try:
                        future.result()
                    except Exception as e:
                        logging.error(f"队列文件处理失败: {str(e)}")
                    finally:
                        file_queue.task_done()  # one per dequeued file
            else:
                time.sleep(0.1)  # idle back-off to avoid busy-waiting
        except Exception as e:
            logging.error(f"处理队列时出错: {str(e)}")
            time.sleep(1)


def main():
    """Set up the mapping file, start the watchdog observer and the queue
    workers, then poll for mapping updates until interrupted."""
    directory_to_watch = r"C:\Users\user\Desktop\flight-analysis-system"
    mapping_file_path = r"C:\Users\user\Desktop\flight-analysis-system\aircraft_mapping.xlsx"

    # Create a default mapping file on first run.
    if not os.path.exists(mapping_file_path):
        try:
            default_mapping_df = pd.DataFrame({
                '飞机呼号': list(DEFAULT_AIRCRAFT_MAPPING.keys()),
                '飞机型号': list(DEFAULT_AIRCRAFT_MAPPING.values())
            })
            default_mapping_df.to_excel(mapping_file_path, index=False, engine='openpyxl')
            logging.info(f"已创建默认映射文件: {mapping_file_path}")
        except Exception as e:
            logging.error(f"创建默认映射文件失败: {str(e)}")

    load_mapping_from_excel(mapping_file_path)

    if not os.path.exists(directory_to_watch):
        logging.error(f"指定的目录不存在: {directory_to_watch}")
        return

    logging.info(f"开始监控目录: {directory_to_watch}")
    logging.info(f"映射文件: {mapping_file_path}")
    logging.info(f"线程池大小: {MAX_WORKERS}")
    logging.info(f"批量处理大小: {BATCH_PROCESS_SIZE}")

    global event_handler
    event_handler = FileHandler(directory_to_watch, mapping_file_path)
    observer = Observer()
    observer.schedule(event_handler, directory_to_watch, recursive=True)
    observer.start()

    # Several queue-drain threads share the work.
    queue_threads = []
    for i in range(3):
        thread = threading.Thread(target=process_queue, daemon=True,
                                  name=f"QueueProcessor-{i}")
        thread.start()
        queue_threads.append(thread)

    try:
        while True:
            event_handler.mapping_last_modified = reload_mapping_if_updated(
                mapping_file_path, event_handler.mapping_last_modified)
            if file_queue.qsize() > 0 or len(processing_files) > 0:
                logging.info(
                    f"队列状态: 待处理={file_queue.qsize()}, "
                    f"处理中={len(processing_files)}, "
                    f"已完成={len(event_handler.processed_files)}")
            time.sleep(5)
    except KeyboardInterrupt:
        observer.stop()
        logging.info("正在停止监控...")
    observer.join()

    # Drain outstanding work before shutting the pool down.
    while file_queue.qsize() > 0 or len(processing_files) > 0:
        logging.info(f"等待任务完成: 队列={file_queue.qsize()}, 处理中={len(processing_files)}")
        time.sleep(1)
    executor.shutdown(wait=True)
    logging.info("所有任务已完成,程序退出")


if __name__ == "__main__":
    main()

# TODO(review): follow-up requests attached to the original paste:
#   * bulk imports (hundreds of Excel/CSV files) and very large sheets are
#     too slow and need speeding up;
#   * leave the watched folder's originals untouched — instead create a
#     "data is here" folder on the desktop and copy every processed file
#     there (both files that were updated and files that needed no change);
#   * performance target: 300 files of ~150 KB each processed and copied
#     within 3 seconds.
最新发布
09-04
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值