Please help review the code below and suggest feasible optimizations.
import os
import pandas as pd
import numpy as np
import re
import chardet
import csv
import time
import gc
import psutil
from sqlalchemy import create_engine, text
from datetime import datetime
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from tenacity import retry, stop_after_attempt, wait_fixed
base_path = r'D:\OneDrive\ERIC\维护\综资数据\20250601'
database_url = 'mysql+pymysql://root:root@127.0.0.1:3306/test'
def custom_field_handler(field):
"""自定义字段处理函数,使用正则表达式"""
if isinstance(field, str):
field = field.strip()
if field in ('', '\\N'):
return None
return re.sub(r'["\n\r]', '', field)
return field
def detect_encoding_and_separator(filepath):
"""自动检测CSV文件的编码和分隔符"""
encodings = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312', 'iso-8859-1']
best_sep = None
max_columns = 0
detected_encoding = None
for encoding in encodings:
try:
with open(filepath, 'rb') as f:
raw_data = f.read(10000)
detected = chardet.detect(raw_data)
            detected_encoding = detected['encoding'] or encoding  # fall back to the candidate encoding when chardet is unsure
with open(filepath, 'r', encoding=detected_encoding, errors='ignore') as f:
first_line = f.readline()
separators = [',', '\t', ';', '|', '^']
for sep in separators:
columns = len(first_line.split(sep))
if columns > max_columns:
max_columns = columns
best_sep = sep
if best_sep is not None:
break
except Exception as ex:
print(f"尝试使用编码 {encoding} 时遇到错误: {ex}")
if best_sep is None:
raise ValueError(f"无法检测到分隔符,文件可能为空或格式不正确: {filepath}")
return detected_encoding, best_sep
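# --- Optional sketch: delimiter detection via csv.Sniffer ---------------------
# A lighter alternative to the manual "count columns per candidate separator"
# loop above, assuming the first ~10 KB of the file are representative.
# sniff_separator is a hypothetical helper and is not called by the main flow.
def sniff_separator(filepath, encoding, candidates=',\t;|^'):
    """Best-effort delimiter guess using the standard library's csv.Sniffer."""
    with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
        sample = f.read(10000)
    try:
        return csv.Sniffer().sniff(sample, delimiters=candidates).delimiter
    except csv.Error:
        return None  # caller can fall back to the manual detection above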
def read_csv_with_encodings(filepath):
"""根据自动检测的编码读取CSV文件"""
encoding, sep = detect_encoding_and_separator(filepath)
try:
df = pd.read_csv(filepath, encoding=encoding, sep=sep,
quoting=3, low_memory=False, on_bad_lines='warn')
print(f"成功读取文件: {filepath}")
return df
except Exception as ex:
print(f"尝试读取文件时遇到错误: {ex}")
raise ValueError(f"无法读取文件: {filepath},请检查文件格式或编码。")
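# --- Optional sketch: stream very large CSV files in chunks -------------------
# read_csv_with_encodings loads each file into memory in one go; for very large
# exports, pandas' chunksize iterator keeps peak memory roughly flat and lets
# unneeded columns be dropped early. read_csv_in_chunks is a hypothetical
# helper (same detection logic, not wired into the main flow).
def read_csv_in_chunks(filepath, wanted_columns=None, chunk_rows=100_000):
    encoding, sep = detect_encoding_and_separator(filepath)
    pieces = []
    for piece in pd.read_csv(filepath, encoding=encoding, sep=sep, quoting=3,
                             dtype=str, on_bad_lines='warn', chunksize=chunk_rows):
        if wanted_columns is not None:
            # keep only the requested columns that actually exist in this file
            piece = piece[[c for c in wanted_columns if c in piece.columns]]
        pieces.append(piece)
    return pd.concat(pieces, ignore_index=True) if pieces else pd.DataFrame()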
def process_csv_file(filepath, column_map, custom_handler, folder_type):
"""处理单个CSV文件,并返回处理后的DataFrame"""
try:
df_temp = read_csv_with_encodings(filepath)
df_temp.columns = [custom_handler(col) for col in df_temp.columns]
selected_columns = list(column_map.keys())
temp_df = df_temp[selected_columns].copy()
temp_df.rename(columns=column_map, inplace=True)
type_mapping = {
"机房": "机房",
"站点": "基站",
"铁塔": "铁塔",
"直放站": "室分_直放站",
"分布系统": "室分",
"BTS": "BBU_2G",
"BBU": "BBU_4G",
"DU": "BBU_5G",
"CELL": "CELL_2G",
"E-UTRANCELL": "CELL_4G",
"NR-CELL": "CELL_5G",
"E-NODEB": "SITE_4G",
"GNODEB": "SITE_5G"
}
temp_df['类型'] = type_mapping.get(folder_type, "未知类型")
print(f"{os.path.basename(filepath)}已成功加载,本次包含行数:{len(temp_df)}")
return temp_df
except (pd.errors.ParserError, KeyError) as ex:
print(f"在处理文件 {os.path.basename(filepath)} 时遇到错误: {ex}")
return None
def read_csv_with_encodings_station(filepath):
"""根据自动检测的编码读取CSV文件"""
encoding, sep = detect_encoding_and_separator(filepath)
try:
with open(filepath, 'r', newline='', encoding=encoding, errors='replace') as csvfile:
            reader = csv.DictReader(csvfile, delimiter=sep, quotechar='@')  # '@' is unlikely to appear, effectively disabling quote handling
data = list(reader)
df = pd.DataFrame(data)
print(f"成功读取文件: {filepath}")
return df
except Exception as ex:
print(f"尝试读取文件时遇到错误: {ex}")
raise ValueError(f"无法读取文件: {filepath},请检查文件格式或编码。")
def process_csv_files_station(filepath, column_map, custom_handler, folder_type):
"""处理单个CSV文件,并返回处理后的DataFrame"""
try:
df_temp = read_csv_with_encodings_station(filepath)
df_temp.columns = [custom_handler(col) for col in df_temp.columns]
selected_columns = list(column_map.keys())
temp_df = df_temp[selected_columns].copy()
temp_df.rename(columns=column_map, inplace=True)
type_mapping = {
"机房": "机房",
"站点": "基站",
"铁塔": "铁塔",
"直放站": "室分_直放站",
"分布系统": "室分",
"BTS": "BBU_2G",
"BBU": "BBU_4G",
"DU": "BBU_5G",
"CELL": "CELL_2G",
"E-UTRANCELL": "CELL_4G",
"NR-CELL": "CELL_5G",
"E-NODEB": "SITE_4G",
"GNODEB": "SITE_5G"
}
temp_df['类型'] = type_mapping.get(folder_type, "未知类型")
print(f"{os.path.basename(filepath)}已成功加载,本次包含行数:{len(temp_df)}")
return temp_df
except (pd.errors.ParserError, KeyError) as ex:
print(f"在处理文件 {os.path.basename(filepath)} 时遇到错误: {ex}")
return None
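# --- Optional sketch: merge the two near-identical processors -----------------
# process_csv_file and process_csv_files_station only differ in the reader they
# call; a single parametrised helper removes the duplicated column/type-mapping
# logic. process_csv_generic and its reader/type_mapping parameters are
# hypothetical names introduced for this sketch.
def process_csv_generic(filepath, column_map, custom_handler, folder_type,
                        type_mapping, reader=read_csv_with_encodings):
    try:
        df_temp = reader(filepath)
        df_temp.columns = [custom_handler(col) for col in df_temp.columns]
        temp_df = df_temp[list(column_map.keys())].copy()
        temp_df.rename(columns=column_map, inplace=True)
        temp_df['类型'] = type_mapping.get(folder_type, "未知类型")
        return temp_df
    except (pd.errors.ParserError, KeyError) as ex:
        print(f"failed to process {os.path.basename(filepath)}: {ex}")
        return None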
def validate_dataframe(df, required_columns):
"""验证DataFrame是否包含必要的列"""
for col in required_columns:
if col not in df.columns:
print(f"缺少必要的列: {col}")
return False
return True
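# --- Optional sketch: report every missing column at once ---------------------
# validate_dataframe stops at the first missing column; listing all of them in
# one message makes debugging a changed export layout quicker. missing_columns
# is a hypothetical helper, not used by the main flow.
def missing_columns(df, required_columns):
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        print(f"missing required columns: {missing}")
    return missing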
def read_and_process_folder(folder_path):
"""读取并处理文件夹中的所有CSV文件"""
combined_dataframes_local = {}
df_list_station = []
df_list_room = []
combined_other_dfs = []
key_columns_map = {
"NR-CELL": ['NAME$03008', 'ID$03008', 'ROOM_NAME$03008', 'ROOM_ID$03008'],
"GNODEB": ['NAME$03007', 'ID$03007', 'ROOM_NAME$03007', 'ROOM_ID$03007'],
"E-UTRANCELL": ['NAME$03006', 'ID$03006', 'ROOM_NAME$03006', 'ROOM_ID$03006'],
"E-NODEB": ['NAME$03005', 'ID$03005', 'ROOM_NAME$03005', 'ROOM_ID$03005'],
"CELL": ['NAME$03002', 'ID$03002', 'ROOM_NAME$03002', 'ROOM_ID$03002'],
"BTS": ['NAME$03001', 'ID$03001', 'ROOM_NAME$03001', 'ROOM_ID$03001'],
"BBU": ['NAME$03010', 'ID$03010', 'ROOM_NAME$03010', 'ROOM_ID$03010'],
"DU": ['NAME$03011', 'ID$03011', 'ROOM_NAME$03011', 'ROOM_ID$03011'],
"铁塔": ['NAME$03020', 'ID$03020', 'ROOM_NAME$03020', 'ROOM_ID$03020'],
"直放站": ['NAME$03019', 'ID$03019', 'ROOM_NAME$03019', 'ROOM_ID$03019'],
"分布系统": ['NAME$03018', 'ID$03018', 'ROOM_NAME$03018', 'ROOM_ID$03018']
}
new_column_names = {k: ['NAME', 'ID', 'ROOM_NAME', 'ROOM_ID'] for k in key_columns_map.keys()}
for root, dirs, files in os.walk(folder_path):
current_folder_name = os.path.basename(root)
if current_folder_name == "站点":
column_map = {'MNT_LEVEL$01002': 'STATION_LEVEL', 'NAME$01002': 'STATION_NAME', 'ID$01002': 'STATION_ID',
'LATITUDE$01002': 'LATITUDE', 'LONGITUDE$01002': 'LONGITUDE', "MNT_TYPE$01002": "维护类型",
"LIFE_STATE$01002": "生命周期状态", "EXT_CODE$01002": "铁塔编码",
"IS_MATCHING$01002": "是否含配套", "ORGANIZATION_NAME$01002": "维护小组",
"BELONG_SPECIALITY$01002": "关联专业", "MNT_DIFFICULTY$01002": "维护难度",
"CITY_NAME$01002": "地市"}
for file in files:
if file.lower().endswith('.csv'):
df_station = process_csv_files_station(os.path.join(root, file), column_map,
custom_field_handler, current_folder_name)
if df_station is not None and validate_dataframe(df_station, ['STATION_NAME', 'STATION_ID']):
df_list_station.append(df_station)
if df_list_station:
combined_dataframes_local['站点'] = pd.concat(df_list_station, ignore_index=True)
elif current_folder_name == "机房":
column_map = {'NAME$01003': 'ROOM_NAME', 'ID$01003': 'ROOM_ID', 'SITE_NAME$01003': 'STATION_NAME',
'SITE_ID$01003': 'STATION_ID'}
for file in files:
if file.lower().endswith('.csv'):
df_room = process_csv_files_station(os.path.join(root, file), column_map, custom_field_handler,
current_folder_name)
if df_room is not None and validate_dataframe(df_room, ['ROOM_NAME', 'ROOM_ID']):
df_list_room.append(df_room)
if df_list_room:
combined_dataframes_local['机房'] = pd.concat(df_list_room, ignore_index=True)
elif current_folder_name in key_columns_map:
current_columns_key = key_columns_map[current_folder_name]
current_column_names = new_column_names[current_folder_name]
column_map = dict(zip(current_columns_key, current_column_names))
for file in files:
if file.lower().endswith('.csv'):
data = process_csv_file(os.path.join(root, file), column_map, custom_field_handler,
current_folder_name)
if data is not None:
combined_other_dfs.append(data)
if combined_other_dfs:
combined_dataframes_local['合并表'] = pd.concat(combined_other_dfs, ignore_index=True)
# 合并机房与站点
if '机房' in combined_dataframes_local and '站点' in combined_dataframes_local:
station_columns = ['STATION_ID', 'STATION_LEVEL', 'LONGITUDE', 'LATITUDE', '维护类型']
df_station_subset = combined_dataframes_local['站点'][station_columns]
combined_dataframes_local['机房'] = combined_dataframes_local['机房'].merge(
df_station_subset, on='STATION_ID', how='left'
)
# 合并合并表与机房
if '合并表' in combined_dataframes_local and '机房' in combined_dataframes_local:
room_columns = ['ROOM_ID', 'STATION_NAME', 'STATION_ID', 'STATION_LEVEL', 'LONGITUDE', 'LATITUDE', '维护类型']
df_room_subset = combined_dataframes_local['机房'][room_columns]
combined_dataframes_local['合并表'] = combined_dataframes_local['合并表'].merge(
df_room_subset, on='ROOM_ID', how='left'
)
return combined_dataframes_local
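# --- Optional sketch: read one folder's CSV files in parallel -----------------
# read_and_process_folder handles files strictly sequentially; reading is mostly
# disk/parse bound, so a small thread pool can overlap that work. The helper
# load_folder_parallel and its worker argument are hypothetical and are not
# wired into read_and_process_folder above.
def load_folder_parallel(root, files, column_map, folder_type,
                         worker=process_csv_file, max_workers=4):
    csv_files = [f for f in files if f.lower().endswith('.csv')]
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(worker, os.path.join(root, f), column_map,
                               custom_field_handler, folder_type)
                   for f in csv_files]
        for future in as_completed(futures):
            df = future.result()
            if df is not None:
                results.append(df)
    return pd.concat(results, ignore_index=True) if results else None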
@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def safe_insert_chunk(chunk, table_name, engine_url, chunk_id, total_chunks):
"""安全插入单个数据块,包含重试机制"""
chunk_start = time.time()
start_mem = psutil.virtual_memory().used
thread_engine = create_engine(engine_url, pool_pre_ping=True, pool_recycle=3600)
inner_connection = thread_engine.connect()
transaction = inner_connection.begin() # 显式开始事务
try:
# 使用显式连接执行插入
chunk.to_sql(
name=table_name,
con=inner_connection,
if_exists='append',
index=False,
method='multi',
chunksize=1000 # 内部进一步分块
)
transaction.commit() # 显式提交事务
chunk_elapsed = time.time() - chunk_start
print(f"块 {chunk_id}/{total_chunks} 导入完成,行数: {len(chunk)}, 耗时: {chunk_elapsed:.2f}秒")
return len(chunk), chunk_elapsed
except Exception as insert_error:
transaction.rollback() # 出错时显式回滚
print(
f"块 {chunk_id}/{total_chunks} 导入失败 (尝试 {safe_insert_chunk.retry.statistics.get('attempt_number', 1)}/3): {str(insert_error)}")
raise
finally:
        inner_connection.close()
thread_engine.dispose()
del chunk
gc.collect()
end_mem = psutil.virtual_memory().used
print(f"内存使用: {(end_mem - start_mem) / 1024 / 1024:.2f}MB")
def parallel_insert_to_sql(df, table_name, engine_url, chunk_size=10000, max_workers=10):
"""并行将DataFrame分块导入数据库,修复计数问题"""
if df.empty:
print("没有数据需要导入")
return
total_chunks = (len(df) + chunk_size - 1) // chunk_size
print(f"数据将分为 {total_chunks} 块进行并行导入,每块 {chunk_size} 行,使用 {max_workers} 个工作线程")
# 性能监控
successful_chunks = 0
failed_chunks = 0
total_rows_inserted = 0
insert_start_time = time.time()
# 创建进度条
with tqdm(total=total_chunks, desc="并行导入") as pbar:
# 使用线程池处理
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {}
# 提交所有任务
for i in range(0, len(df), chunk_size):
chunk = df.iloc[i:i + chunk_size].copy()
chunk_id = i // chunk_size + 1
# 提交任务到线程池
future = executor.submit(
safe_insert_chunk,
chunk=chunk,
table_name=table_name,
engine_url=engine_url,
chunk_id=chunk_id,
total_chunks=total_chunks
)
futures[future] = chunk_id
# 等待所有任务完成并收集结果
for future in as_completed(futures):
chunk_id = futures[future]
try:
rows_inserted, elapsed = future.result()
successful_chunks += 1
total_rows_inserted += rows_inserted
pbar.update(1)
# 定期打印统计信息
if successful_chunks % 10 == 0:
current_elapsed = time.time() - insert_start_time
rows_per_second = total_rows_inserted / current_elapsed if current_elapsed > 0 else 0
pbar.set_postfix({
"已插入": f"{total_rows_inserted}行",
"速度": f"{rows_per_second:.0f}行/秒"
})
except Exception as ex:
failed_chunks += 1
print(f"块 {chunk_id}/{total_chunks} 处理失败: {str(ex)}")
pbar.update(1)
# 打印总体统计
total_time = time.time() - insert_start_time
if successful_chunks > 0:
avg_rows_per_second = total_rows_inserted / total_time
print(f"导入完成! 总块数: {total_chunks}, 成功块数: {successful_chunks}, 失败块数: {failed_chunks}")
print(f"总行数: {total_rows_inserted}, 总耗时: {total_time:.2f}秒, 平均速度: {avg_rows_per_second:.0f}行/秒")
else:
print("所有块导入均失败")
return total_rows_inserted
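# --- Optional sketch: bulk load through LOAD DATA LOCAL INFILE ----------------
# For multi-million-row imports, MySQL's LOAD DATA is typically much faster than
# row-by-row INSERTs. This assumes local_infile is enabled on both server and
# client, and that the CSV column order matches the target table.
# bulk_load_via_csv and the temporary file path are hypothetical; note that
# pandas writes NaN as an empty string, not as SQL NULL.
def bulk_load_via_csv(df, table_name, engine_url, tmp_path='zzwy_bulk.csv'):
    df.to_csv(tmp_path, index=False, encoding='utf-8')
    engine = create_engine(engine_url, connect_args={"local_infile": 1})
    # MySQL expects forward slashes in the path, even on Windows
    csv_path = os.path.abspath(tmp_path).replace('\\', '/')
    load_sql = (
        f"LOAD DATA LOCAL INFILE '{csv_path}' "
        f"INTO TABLE {table_name} "
        "CHARACTER SET utf8mb4 "
        "FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' "
        # if the CSV ends up with \r\n line endings, use '\r\n' here instead
        "LINES TERMINATED BY '\\n' "
        "IGNORE 1 LINES"
    )
    try:
        with engine.begin() as conn:
            conn.execute(text(load_sql))
    finally:
        engine.dispose()
        os.remove(tmp_path)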
# 主程序开始
start_time = time.time()
print(f"程序开始执行,时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
# 读取并处理数据
combined_dataframes = read_and_process_folder(base_path)
# 合并结果
final_combined = pd.concat([
combined_dataframes.get('站点'),
combined_dataframes.get('机房'),
combined_dataframes.get('合并表')
], ignore_index=True)
# 数据处理和清洗
final_combined = final_combined[
['地市', 'NAME', 'ID', 'ROOM_NAME', 'ROOM_ID', 'STATION_NAME', 'STATION_ID', '生命周期状态', '铁塔编码',
'是否含配套', 'STATION_LEVEL', 'LATITUDE', 'LONGITUDE', '维护小组', '关联专业', '维护难度', '类型', '维护类型']
]
final_combined['NAME'] = np.where(
final_combined['ROOM_NAME'].isna(),
final_combined['NAME'].fillna(final_combined['STATION_NAME']),
final_combined['NAME'].fillna(final_combined['ROOM_NAME'])
)
final_combined['ID'] = np.where(
final_combined['ROOM_ID'].isna(),
final_combined['ID'].fillna(final_combined['STATION_ID']),
final_combined['ID'].fillna(final_combined['ROOM_ID'])
)
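# Note: the two np.where blocks above are equivalent to a chained fillna, e.g.
#   final_combined['NAME'] = final_combined['NAME'].fillna(
#       final_combined['ROOM_NAME']).fillna(final_combined['STATION_NAME'])
# (and the same pattern for 'ID'), which is a little easier to read.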
# 提取并格式化日期
folder_name = os.path.basename(base_path) # 先在此处获取文件夹名称
try:
date_obj = datetime.strptime(folder_name, "%Y%m%d")
final_combined['DATE'] = date_obj.strftime("%Y-%m-%d")
except ValueError:
final_combined['DATE'] = None
print(f"无效的日期格式: {folder_name}")
# 清理数据
final_combined = final_combined.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
final_combined = final_combined.rename(columns={'ID': 'SOURCE_ID'})
# 调整列顺序
final_combined = final_combined[
['地市', 'NAME', 'SOURCE_ID', 'ROOM_NAME', 'ROOM_ID', 'STATION_NAME', 'STATION_ID', '生命周期状态', '铁塔编码',
'是否含配套', 'STATION_LEVEL', 'LATITUDE', 'LONGITUDE', '维护小组', '关联专业', '维护难度', '类型', '维护类型',
'DATE']
]
print(f"数据处理完成,总行数: {len(final_combined)}")
# 数据库操作
print("开始数据库操作...")
# 获取当前处理的日期
folder_name = os.path.basename(base_path)
try:
date_obj = datetime.strptime(folder_name, "%Y%m%d")
current_date = date_obj.strftime("%Y-%m-%d")
except ValueError:
current_date = None
print(f"无效的日期格式: {folder_name}")
# 删除当天已有数据(如果存在)
if current_date:
try:
with create_engine(database_url).connect() as connection:
# 关闭索引和约束以提高删除性能
connection.execute(text("SET foreign_key_checks = 0"))
connection.execute(text("SET unique_checks = 0"))
delete_query = text("DELETE FROM zzwy WHERE DATE = :date")
result = connection.execute(delete_query, {'date': current_date})
connection.commit()
print(f"已删除日期为 {current_date} 的 {result.rowcount} 条记录")
# 重新启用索引和约束
connection.execute(text("SET foreign_key_checks = 1"))
connection.execute(text("SET unique_checks = 1"))
except Exception as e:
print(f"删除当天数据时出错: {str(e)}")
# 继续执行,尝试插入新数据
else:
print("警告: 当前日期为空,跳过删除步骤")
# Disable foreign-key/unique checks for speed (note: these SETs are session-scoped;
# they do not affect the separate connections created inside safe_insert_chunk)
with create_engine(database_url).connect() as connection:
connection.execute(text("SET foreign_key_checks = 0"))
connection.execute(text("SET unique_checks = 0"))
try:
# 自动计算最佳chunksize
available_memory = psutil.virtual_memory().available
estimated_row_size = 1024 # 假设每行1KB内存
chunksize = min(int(available_memory * 0.3 / estimated_row_size), 500000)
print(f"自动计算的chunksize: {chunksize}")
# 创建表(如果不存在)
with create_engine(database_url).connect() as connection:
connection.execute(text("""
CREATE TABLE IF NOT EXISTS zzwy (
地市 VARCHAR(255),
NAME VARCHAR(255),
SOURCE_ID VARCHAR(255),
ROOM_NAME VARCHAR(255),
ROOM_ID VARCHAR(255),
STATION_NAME VARCHAR(255),
STATION_ID VARCHAR(255),
生命周期状态 VARCHAR(255),
铁塔编码 VARCHAR(255),
是否含配套 VARCHAR(255),
STATION_LEVEL VARCHAR(255),
LATITUDE VARCHAR(255),
LONGITUDE VARCHAR(255),
维护小组 VARCHAR(255),
关联专业 VARCHAR(255),
维护难度 VARCHAR(255),
类型 VARCHAR(255),
维护类型 VARCHAR(255),
DATE VARCHAR(255)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
"""))
# 并行导入数据
actual_rows_inserted = parallel_insert_to_sql(
df=final_combined,
table_name='zzwy',
engine_url=database_url,
chunk_size=20000,
max_workers=10
)
# 验证数据完整性
if current_date:
try:
with create_engine(database_url).connect() as connection:
result = connection.execute(
text("SELECT COUNT(*) FROM zzwy WHERE DATE = :date"),
{'date': current_date}
)
db_count = result.scalar()
print(
f"源数据行数: {len(final_combined)}, 实际插入行数: {actual_rows_inserted}, 数据库记录数(当天): {db_count}")
if db_count == len(final_combined):
print("数据导入完整! 数据库记录数与源数据一致")
else:
print(f"警告: 数据可能不完整! 差异: {len(final_combined) - db_count}")
except Exception as e:
print(f"验证数据完整性时出错: {str(e)}")
else:
print("无法验证数据完整性: 当前日期为空")
except Exception as main_error:
print(f"导入数据到数据库时发生错误: {str(main_error)}")
raise
finally:
# 重新启用索引和约束
with create_engine(database_url).connect() as connection:
connection.execute(text("SET foreign_key_checks = 1"))
connection.execute(text("SET unique_checks = 1"))
# 结束计时并输出耗时
end_time = time.time()
elapsed_time = end_time - start_time
print(f"程序执行完成,时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"总耗时: {elapsed_time:.2f}秒 ({elapsed_time / 3600:.2f}小时)")