import os
import time
import pandas as pd
import csv
import chardet
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import logging
from datetime import datetime
import re
import threading
import queue
from concurrent.futures import ThreadPoolExecutor, as_completed
# Logging: INFO level, mirrored to a UTF-8 log file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("flight_data_processor.log", encoding='utf-8'),
        logging.StreamHandler()
    ]
)
# Built-in callsign-prefix -> aircraft-model mapping, used as a fallback
# whenever the Excel mapping file is missing or unusable.
DEFAULT_AIRCRAFT_MAPPING = {
    "ABC": "F-35",
    "DEF": "Boeing 737",
    "GHI": "Airbus A320",
    "JKL": "Cessna 172",
    "MNO": "F-16",
}
# Active callsign -> model mapping; populated by load_mapping_from_excel().
AIRCRAFT_MAPPING = {}
# ==================== Key tuning parameters ====================
# Thread-pool size - raised to 50 workers.
MAX_WORKERS = 50
# Queue capacity - raised to 200 to avoid hitting a full queue.
MAX_QUEUE_SIZE = 200
# Batch size - up to 100 files are handled per batch.
BATCH_PROCESS_SIZE = 100
# Thread-safe queue of file paths awaiting processing.
file_queue = queue.Queue(maxsize=MAX_QUEUE_SIZE)
# Shared thread-pool executor, sized by MAX_WORKERS.
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
# Paths currently being processed (guarded by processing_lock).
processing_files = set()
# Lock protecting processing_files and queue-admission checks.
processing_lock = threading.Lock()
# Semaphore capping concurrent file reads so memory use stays bounded.
file_read_semaphore = threading.Semaphore(20)  # at most 20 simultaneous reads
# ========================================================
def load_mapping_from_excel(mapping_file_path):
    """Load the callsign -> aircraft-model mapping from an Excel file.

    Rebinds the module-level AIRCRAFT_MAPPING. Falls back to a copy of
    DEFAULT_AIRCRAFT_MAPPING when the file is missing, unreadable, lacks the
    expected columns, or yields no usable rows.
    """
    global AIRCRAFT_MAPPING
    if not os.path.exists(mapping_file_path):
        logging.warning(f"映射文件不存在: {mapping_file_path},使用默认映射")
        AIRCRAFT_MAPPING = DEFAULT_AIRCRAFT_MAPPING.copy()
        return
    try:
        # Legacy .xls needs the xlrd engine; everything else goes via openpyxl.
        engine = 'xlrd' if mapping_file_path.endswith('.xls') else 'openpyxl'
        df_mapping = pd.read_excel(mapping_file_path, engine=engine)

        # Locate the callsign and model columns by (case-insensitive) header.
        # Note: a header matching both patterns counts as the callsign column,
        # and a later matching header supersedes an earlier one.
        callsign_col = None
        model_col = None
        for col in df_mapping.columns:
            header = str(col).lower()
            if '呼号' in header or 'callsign' in header:
                callsign_col = col
            elif '型号' in header or 'model' in header:
                model_col = col
        if callsign_col is None or model_col is None:
            logging.error("在映射文件中找不到呼号列或型号列")
            AIRCRAFT_MAPPING = DEFAULT_AIRCRAFT_MAPPING.copy()
            return

        # Collect non-empty (callsign, model) pairs, stripping whitespace.
        new_mapping = {}
        for _, row in df_mapping.iterrows():
            raw_sign = row[callsign_col]
            raw_model = row[model_col]
            sign = str(raw_sign).strip() if pd.notna(raw_sign) else ""
            model = str(raw_model).strip() if pd.notna(raw_model) else ""
            if sign and model:
                new_mapping[sign] = model

        if new_mapping:
            AIRCRAFT_MAPPING = new_mapping
            logging.info(f"成功加载 {len(AIRCRAFT_MAPPING)} 条映射记录")
        else:
            AIRCRAFT_MAPPING = DEFAULT_AIRCRAFT_MAPPING.copy()
    except Exception as e:
        logging.error(f"加载映射文件失败: {str(e)}")
        AIRCRAFT_MAPPING = DEFAULT_AIRCRAFT_MAPPING.copy()
def reload_mapping_if_updated(mapping_file_path, last_modified_time):
    """Reload the mapping file when its mtime has advanced past last_modified_time.

    Returns the mtime the caller should remember for the next poll: the new
    mtime after a reload, otherwise last_modified_time unchanged (also on
    missing file or error).
    """
    try:
        if os.path.exists(mapping_file_path):
            mtime = os.path.getmtime(mapping_file_path)
            if mtime > last_modified_time:
                logging.info("检测到映射文件更新,重新加载...")
                load_mapping_from_excel(mapping_file_path)
                return mtime
    except Exception as e:
        logging.error(f"检查映射文件更新失败: {str(e)}")
    return last_modified_time
class FileHandler(FileSystemEventHandler):
    """Watchdog handler that completes missing aircraft-model data.

    For every supported file (.csv/.xls/.xlsx) in the watched directory the
    handler matches each row's callsign against the global AIRCRAFT_MAPPING
    (prefix match) and fills in the model column wherever it is empty or an
    "unknown" marker. Work fans out over the module-level thread pool;
    bookkeeping sets are guarded by processing_lock.
    """

    # Pattern for callsign-like values (letters then digits, e.g. "CCA1234"),
    # used to sniff a callsign column when no header name matches.
    # Hoisted to a class attribute so it is compiled once, not per column.
    _CALLSIGN_PATTERN = re.compile(r'^[A-Za-z]{2,}\d{2,}')

    def __init__(self, directory_to_watch, mapping_file_path):
        self.directory_to_watch = directory_to_watch
        self.mapping_file_path = mapping_file_path
        # mtime of the mapping file, used to detect edits (0 = not yet seen).
        self.mapping_last_modified = os.path.getmtime(mapping_file_path) if os.path.exists(mapping_file_path) else 0
        # Paths that finished processing successfully.
        self.processed_files = set()
        # Process whatever is already in the directory, in batches.
        self.process_existing_files_batch()

    def process_existing_files_batch(self):
        """Process files already present in the watched directory, batch by batch."""
        all_files = []
        for filename in os.listdir(self.directory_to_watch):
            file_path = os.path.join(self.directory_to_watch, filename)
            # Skip directories, unsupported types, Office lock files (~$...)
            # and the mapping file itself.
            if (os.path.isfile(file_path) and
                    self.is_supported_file(filename) and
                    not filename.startswith('~$') and
                    file_path != self.mapping_file_path):
                all_files.append(file_path)
        # Process in batches of BATCH_PROCESS_SIZE files.
        for i in range(0, len(all_files), BATCH_PROCESS_SIZE):
            batch_files = all_files[i:i + BATCH_PROCESS_SIZE]
            logging.info(f"开始处理第 {i//BATCH_PROCESS_SIZE + 1} 批文件,共 {len(batch_files)} 个文件")
            self.process_files_concurrently(batch_files)

    def on_created(self, event):
        """Watchdog callback: queue a newly created data file."""
        if not event.is_directory:
            file_path = event.src_path
            if (self.is_supported_file(file_path) and
                    not os.path.basename(file_path).startswith('~$') and
                    file_path != self.mapping_file_path):
                # Brief pause so the writer can finish flushing the new file.
                time.sleep(0.5)
                self.add_file_to_queue(file_path)

    def on_modified(self, event):
        """Watchdog callback: reload the mapping, or re-queue a changed data file."""
        if not event.is_directory:
            file_path = event.src_path
            if file_path == self.mapping_file_path:
                try:
                    self.mapping_last_modified = reload_mapping_if_updated(file_path, self.mapping_last_modified)
                except Exception as e:
                    logging.error(f"处理映射文件更新失败: {str(e)}")
            elif (self.is_supported_file(file_path) and
                    not os.path.basename(file_path).startswith('~$')):
                time.sleep(0.5)
                self.add_file_to_queue(file_path)

    def add_file_to_queue(self, file_path):
        """Enqueue a file for processing unless it is already done, in flight, or the queue is nearly full."""
        with processing_lock:
            # Keep a 10-slot buffer so concurrent producers rarely hit Full.
            if (file_path not in self.processed_files and
                    file_path not in processing_files and
                    file_queue.qsize() < MAX_QUEUE_SIZE - 10):
                try:
                    file_queue.put_nowait(file_path)
                    logging.debug(f"已添加文件到队列: {os.path.basename(file_path)}")
                except queue.Full:
                    logging.warning("处理队列已满,等待空间释放")

    def process_files_concurrently(self, file_paths):
        """Submit the given files to the shared thread pool and wait for the batch."""
        futures = []
        for file_path in file_paths:
            if file_path not in self.processed_files:
                futures.append(executor.submit(self.process_file_wrapper, file_path))
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                logging.error(f"文件处理失败: {str(e)}")

    def process_file_wrapper(self, file_path):
        """Thread-pool entry point: de-duplicates work and tracks in-flight files."""
        with processing_lock:
            if file_path in processing_files:
                return  # another worker already owns this file
            processing_files.add(file_path)
        try:
            with file_read_semaphore:  # cap concurrent reads to bound memory use
                self.process_file(file_path)
            with processing_lock:
                self.processed_files.add(file_path)
                processing_files.discard(file_path)
        except Exception as e:
            with processing_lock:
                processing_files.discard(file_path)
            logging.error(f"处理文件 {os.path.basename(file_path)} 时出错: {str(e)}")

    def is_supported_file(self, file_path):
        """Return True for the spreadsheet formats this handler processes."""
        return file_path.lower().endswith(('.csv', '.xls', '.xlsx'))

    def detect_encoding(self, file_path):
        """Best-effort encoding sniff of the first 10 KB; defaults to 'utf-8'."""
        try:
            with open(file_path, 'rb') as f:
                raw_data = f.read(10000)
            result = chardet.detect(raw_data)
            # Only trust confident detections; otherwise assume utf-8.
            return result['encoding'] if result['confidence'] > 0.7 else 'utf-8'
        # fix: was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
        except Exception:
            return 'utf-8'

    def find_callsign_column(self, df):
        """Find the callsign column: exact header, then substring, then content sniffing."""
        possible_names = ['呼号', 'callsign', 'call sign', 'flight', 'flightnumber', 'flight number']
        # 1) exact (case-insensitive) header match
        for col in df.columns:
            col_lower = str(col).lower().strip()
            if any(name.lower() == col_lower for name in possible_names):
                return col
        # 2) substring header match
        for col in df.columns:
            col_lower = str(col).lower().strip()
            if any(name.lower() in col_lower for name in possible_names):
                return col
        # 3) content sniffing: a text column where most sampled values look like callsigns
        for col in df.columns:
            if df[col].dtype == 'object':
                sample_values = df[col].dropna().head(10)
                if len(sample_values) > 0:
                    matches = sum(1 for val in sample_values if self._CALLSIGN_PATTERN.match(str(val)))
                    if matches / len(sample_values) > 0.5:
                        return col
        return None

    def find_model_column(self, df):
        """Find the aircraft-model column by exact, then substring, header match."""
        possible_names = ['机型', '型号', 'model', 'aircraft', 'type', 'aircraft type']
        for col in df.columns:
            col_lower = str(col).lower().strip()
            if any(name.lower() == col_lower for name in possible_names):
                return col
        for col in df.columns:
            col_lower = str(col).lower().strip()
            if any(name.lower() in col_lower for name in possible_names):
                return col
        return None

    def is_unknown_value(self, value):
        """Return True when a model value is missing or one of the known 'unknown' markers."""
        if pd.isna(value) or value is None:
            return True
        value_str = str(value).strip().lower()
        return value_str in ('未知', 'unknown', 'none', 'null', 'nan', '')

    def _read_dataframe(self, file_path):
        """Load a CSV/Excel file into a DataFrame; return None on failure (logged)."""
        file_ext = os.path.splitext(file_path)[1].lower()
        try:
            if file_ext == '.csv':
                df = None
                for encoding in ['utf-8', 'gbk', 'gb2312', 'latin-1', 'utf-8-sig', 'cp936']:
                    try:
                        df = pd.read_csv(file_path, encoding=encoding)
                        break
                    # fix: was a bare `except:`; narrow to Exception and keep trying
                    except Exception:
                        continue
                if df is None:
                    raise Exception("无法用任何编码读取CSV文件")
                return df
            # Excel: try the extension's native engine first, then the other one
            # (files are sometimes saved under the wrong extension).
            engines = ('xlrd', 'openpyxl') if file_ext == '.xls' else ('openpyxl', 'xlrd')
            try:
                return pd.read_excel(file_path, engine=engines[0])
            except Exception:
                return pd.read_excel(file_path, engine=engines[1])
        except Exception as e:
            logging.error(f"读取文件 {os.path.basename(file_path)} 失败: {str(e)}")
            return None

    def process_file(self, file_path):
        """Fill missing aircraft models in one file and write it back in place.

        Rows whose model cell already holds a real value are skipped; rows with
        an empty callsign are skipped; everything else gets either the mapped
        model (by callsign prefix) or the literal "未知".
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        df = self._read_dataframe(file_path)
        if df is None:
            return
        callsign_col = self.find_callsign_column(df)
        if callsign_col is None:
            logging.warning(f"文件 {os.path.basename(file_path)} 中没有找到呼号列")
            return
        model_col = self.find_model_column(df)
        if model_col is None:
            # No model column anywhere: create one, initialised to "未知".
            # (The previous code re-ran a header search here, but its patterns
            # were a subset of find_model_column's, so it could never match.)
            model_col = '机型'
            df[model_col] = '未知'
        updated_count = 0
        unknown_count = 0
        skipped_count = 0
        # Snapshot the mapping once so a concurrent reload can't change it mid-file.
        mapping_items = list(AIRCRAFT_MAPPING.items())
        for index, row in df.iterrows():
            callsign = str(row[callsign_col]) if pd.notna(row[callsign_col]) else ""
            if not callsign.strip():
                skipped_count += 1
                continue
            current_model = row[model_col] if pd.notna(row[model_col]) else ""
            if not self.is_unknown_value(current_model):
                skipped_count += 1  # already has a real model; leave it alone
                continue
            # Prefix match: first mapping entry whose prefix starts the callsign wins.
            aircraft_model = next((model for prefix, model in mapping_items if callsign.startswith(prefix)), None)
            if aircraft_model:
                df.at[index, model_col] = aircraft_model
                updated_count += 1
            else:
                df.at[index, model_col] = "未知"
                unknown_count += 1
        self._save_dataframe(df, file_path, file_ext, updated_count, unknown_count, skipped_count)

    def _save_dataframe(self, df, file_path, file_ext, updated_count, unknown_count, skipped_count):
        """Write the processed DataFrame back to its original path and log a summary."""
        try:
            if file_ext == '.csv':
                df.to_csv(file_path, index=False, encoding='utf-8-sig')
            else:
                # fix: the old code passed engine='xlrd' for .xls, but xlrd is a
                # read-only engine, so every .xls save raised and was logged as a
                # failure. openpyxl writes xlsx-format content even under a .xls
                # name; this program's reader falls back to openpyxl for .xls,
                # so files round-trip within this tool.
                df.to_excel(file_path, index=False, engine='openpyxl')
            logging.info(f"文件 {os.path.basename(file_path)} 处理完成: 更新{updated_count}, 未知{unknown_count}, 跳过{skipped_count}")
        except Exception as e:
            logging.error(f"保存文件 {os.path.basename(file_path)} 失败: {str(e)}")
def process_queue():
    """Worker loop that drains the shared file queue and dispatches work.

    Pulls up to 10 queued paths at a time, submits each to the shared thread
    pool via the global event_handler, waits for that batch to finish, then
    repeats. Runs forever; intended to be run on daemon threads.
    """
    while True:
        try:
            # Grab a small batch in one go to cut per-item queue overhead.
            batch = []
            limit = min(10, file_queue.qsize())
            while len(batch) < limit:
                try:
                    batch.append(file_queue.get_nowait())
                except queue.Empty:
                    break
            if not batch:
                time.sleep(0.1)  # idle: avoid busy-waiting on an empty queue
                continue
            # Fan the batch out to the pool, then wait for it to drain.
            pending = [executor.submit(event_handler.process_file_wrapper, path)
                       for path in batch]
            for fut in as_completed(pending):
                try:
                    fut.result()
                except Exception as e:
                    logging.error(f"队列文件处理失败: {str(e)}")
                finally:
                    file_queue.task_done()
        except Exception as e:
            logging.error(f"处理队列时出错: {str(e)}")
            time.sleep(1)
def main():
    """Entry point: watch the flight-data directory and keep model columns filled in."""
    directory_to_watch = r"C:\Users\user\Desktop\flight-analysis-system"
    mapping_file_path = r"C:\Users\user\Desktop\flight-analysis-system\aircraft_mapping.xlsx"

    # Seed a default mapping file on first run so users have a template to edit.
    if not os.path.exists(mapping_file_path):
        try:
            pd.DataFrame({
                '飞机呼号': list(DEFAULT_AIRCRAFT_MAPPING.keys()),
                '飞机型号': list(DEFAULT_AIRCRAFT_MAPPING.values())
            }).to_excel(mapping_file_path, index=False, engine='openpyxl')
            logging.info(f"已创建默认映射文件: {mapping_file_path}")
        except Exception as e:
            logging.error(f"创建默认映射文件失败: {str(e)}")

    # Load the callsign -> model mapping before any files are processed.
    load_mapping_from_excel(mapping_file_path)

    if not os.path.exists(directory_to_watch):
        logging.error(f"指定的目录不存在: {directory_to_watch}")
        return

    logging.info(f"开始监控目录: {directory_to_watch}")
    logging.info(f"映射文件: {mapping_file_path}")
    logging.info(f"线程池大小: {MAX_WORKERS}")
    logging.info(f"批量处理大小: {BATCH_PROCESS_SIZE}")

    # process_queue workers reach the handler through this module-level name.
    global event_handler
    event_handler = FileHandler(directory_to_watch, mapping_file_path)

    observer = Observer()
    observer.schedule(event_handler, directory_to_watch, recursive=True)
    observer.start()

    # Three daemon workers drain the shared file queue concurrently.
    queue_threads = []
    for worker_id in range(3):
        worker = threading.Thread(target=process_queue, daemon=True,
                                  name=f"QueueProcessor-{worker_id}")
        worker.start()
        queue_threads.append(worker)

    try:
        while True:
            # Pick up edits to the mapping file without restarting.
            event_handler.mapping_last_modified = reload_mapping_if_updated(
                mapping_file_path, event_handler.mapping_last_modified
            )
            # Periodic progress report while work is outstanding.
            if file_queue.qsize() > 0 or len(processing_files) > 0:
                logging.info(f"队列状态: 待处理={file_queue.qsize()}, 处理中={len(processing_files)}, 已完成={len(event_handler.processed_files)}")
            time.sleep(5)
    except KeyboardInterrupt:
        observer.stop()
        logging.info("正在停止监控...")
    observer.join()

    # Drain remaining work before shutting the pool down.
    while file_queue.qsize() > 0 or len(processing_files) > 0:
        logging.info(f"等待任务完成: 队列={file_queue.qsize()}, 处理中={len(processing_files)}")
        time.sleep(1)
    executor.shutdown(wait=True)
    logging.info("所有任务已完成,程序退出")
# fix: the original file had pasted prose fused onto this line, which made the
# whole module a syntax error; restore the standard script entry guard.
if __name__ == "__main__":
    main()