import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import joblib
# -------------------------- 1. 路径定义与基础工具函数 --------------------------
# 固定数据路径
data_dir = r"C:\Users\Desktop\研赛D题"
file_paths = {
"a_raw": os.path.join(data_dir, "a站点_风廓线原始数据.csv"),
"a_mwr": os.path.join(data_dir, "a站点_微波辐射计数据.csv"),
"b_raw": os.path.join(data_dir, "b站点_风廓线原始数据.csv"),
"b_mwr": os.path.join(data_dir, "b站点_微波辐射计数据.csv")
}
# 检查文件存在性
for name, path in file_paths.items():
if not os.path.exists(path):
raise FileNotFoundError(f"关键数据文件缺失: {path}")
# 自动检测文件分隔符
def detect_delimiter(file_path, sample_lines=5):
import csv
with open(file_path, "r", encoding="utf-8-sig") as f:
sample = "".join([f.readline() for _ in range(sample_lines) if f.readline()])
delimiters = [",", ";", "\t", "|"]
counts = {d: sample.count(d) for d in delimiters}
return max(counts, key=counts.get)
# 缺测值处理函数
def handle_missing(data, fill_method="linear"):
# 风廓线缺测:空值/9999.0;微波辐射计缺测:无(题目数据无异常码)
data = data.replace(9999.0, np.nan)
if fill_method == "linear":
return data.interpolate(method="linear", axis=0).fillna(method="bfill").fillna(method="ffill")
return data.dropna()
# -------------------------- 2. 数据预处理模块 --------------------------
class DataPreprocessor:
def __init__(self):
# 风廓线5波束参数
self.beam_params = {
1: {"azimuth": 90.0, "elevation": 15.0}, # 东波束
2: {"azimuth": 180.0, "elevation": 15.0}, # 南波束
3: {"azimuth": 270.0, "elevation": 15.0}, # 西波束
4: {"azimuth": 0.0, "elevation": 15.0}, # 北波束
5: {"azimuth": 0.0, "elevation": 90.0} # 天顶波束
}
def process_mwr(self, mwr_path):
"""处理微波辐射计数据,提取温度廓线用于位温计算"""
delimiter = detect_delimiter(mwr_path)
mwr_df = pd.read_csv(mwr_path, delimiter=delimiter)
# 筛选温度廓线,保留时间和高度列
temp_profile = mwr_df.copy()
if "廓线类型" in mwr_df.columns:
temp_profile = mwr_df[mwr_df["廓线类型"].str.contains("温度", na=False)].copy()
# 提取高度列(列名含"km"且是数字开头)并转换为米
height_cols = []
for col in temp_profile.columns:
# 只保留以数字开头且包含km的列(排除CloudBase等非高度列)
if "km" in col and col[0].isdigit():
height_cols.append(col)
# 处理高度列
for col in height_cols:
try:
# 提取数字部分并转换为米
height_str = col.replace("(km)", "").strip()
height_m = float(height_str) * 1000 # km→m
temp_profile.rename(columns={col: f"Temp_{int(height_m)}m"}, inplace=True)
except ValueError:
print(f"跳过无法转换的列: {col}")
continue
# 时间格式标准化(保留到分钟,与风廓线同步)
if "DateTime" in temp_profile.columns:
temp_profile["DateTime"] = pd.to_datetime(temp_profile["DateTime"], errors='coerce').dt.floor("min")
elif "时间" in temp_profile.columns:
temp_profile["DateTime"] = pd.to_datetime(temp_profile["时间"], errors='coerce').dt.floor("min")
else:
# 如果没有时间列,创建一个默认时间列
temp_profile["DateTime"] = pd.date_range(start="2025-09-21 00:00:00",
periods=len(temp_profile),
freq="6min")
# 移除时间为空的行
temp_profile = temp_profile.dropna(subset=["DateTime"])
# 选择需要保留的列
keep_cols = ["DateTime"]
if "SurPre(hPa)" in temp_profile.columns:
keep_cols.append("SurPre(hPa)")
elif "地面气压(hPa)" in temp_profile.columns:
keep_cols.append("地面气压(hPa)")
keep_cols.extend([col for col in temp_profile.columns if "Temp_" in col])
return temp_profile[keep_cols]
def process_wind_profile(self, wind_path):
"""处理风廓线原始数据,反演三维风场"""
delimiter = detect_delimiter(wind_path)
wind_df = pd.read_csv(wind_path, delimiter=delimiter)
# 筛选低模式数据(低空湍流核心关注),剔除无效信噪比
if "模式" in wind_df.columns:
wind_df = wind_df[wind_df["模式"] == "低模式"].copy()
wind_df = handle_missing(wind_df, fill_method="linear")
# 数据类型转换
height_col = "采样高度(m)" if "采样高度(m)" in wind_df.columns else "高度(m)"
wind_df[height_col] = pd.to_numeric(wind_df[height_col], errors="coerce")
speed_col = "径向速度(m/s)" if "径向速度(m/s)" in wind_df.columns else "速度(m/s)"
wind_df[speed_col] = pd.to_numeric(wind_df[speed_col], errors="coerce")
width_col = "速度谱宽(m/s)" if "速度谱宽(m/s)" in wind_df.columns else "谱宽(m/s)"
wind_df[width_col] = pd.to_numeric(wind_df[width_col], errors="coerce")
# 反演三维风场(u:东西向,v:南北向,w:垂直向)
wind_df = self._invert_3d_wind(wind_df, height_col, speed_col, width_col)
# 处理时间列
if "DateTime" in wind_df.columns:
wind_df["DateTime"] = pd.to_datetime(wind_df["DateTime"], errors='coerce').dt.floor("min")
elif "时间" in wind_df.columns:
wind_df["DateTime"] = pd.to_datetime(wind_df["时间"], errors='coerce').dt.floor("min")
else:
# 如果没有时间列,创建一个默认时间列
wind_df["DateTime"] = pd.date_range(start="2025-09-21 00:00:00",
periods=len(wind_df),
freq="6min")
# 移除时间为空的行
wind_df = wind_df.dropna(subset=["DateTime"])
# 重命名列名以保持一致性
wind_df = wind_df.rename(columns={
height_col: "采样高度(m)",
speed_col: "径向速度(m/s)",
width_col: "速度谱宽(m/s)"
})
snr_col = "信噪比(dB)" if "信噪比(dB)" in wind_df.columns else "SNR(dB)"
if snr_col not in wind_df.columns:
# 如果没有信噪比列,创建一个默认值
wind_df[snr_col] = 30.0 # 假设默认信噪比为30dB
return wind_df[["DateTime", "采样高度(m)", "u", "v", "w", "速度谱宽(m/s)", snr_col]]
def _invert_3d_wind(self, wind_df, height_col, speed_col, width_col):
"""基于多波束径向速度反演三维风场"""
# 按高度分组,每个高度下5波束数据联立方程
wind_list = []
for height, group in wind_df.groupby(height_col):
if len(group) < 5:
continue # 波束数据不全,跳过
# 构建系数矩阵A和观测向量b
A = []
b = []
for _, row in group.iterrows():
try:
# 提取波束编号
if "波束" in row:
beam_str = str(row["波束"])
beam_id = int(''.join(filter(str.isdigit, beam_str)))
if beam_id < 1 or beam_id > 5:
continue
else:
# 如果没有波束列,假设按顺序是1-5波束
beam_id = len(A) + 1
az = np.radians(self.beam_params[beam_id]["azimuth"])
el = np.radians(self.beam_params[beam_id]["elevation"])
A.append([np.sin(el)*np.cos(az), np.sin(el)*np.sin(az), np.cos(el)])
b.append(row[speed_col])
except (ValueError, KeyError):
continue
# 最小二乘解u, v, w
if len(A) >= 3: # 至少需要3个波束才能求解
A = np.array(A)
b = np.array(b)
try:
x, _, _, _ = np.linalg.lstsq(A, b, rcond=None)
row = group.iloc[0].copy()
row["u"] = x[0]
row["v"] = x[1]
row["w"] = x[2]
wind_list.append(row)
except np.linalg.LinAlgError:
continue
return pd.DataFrame(wind_list)
def sync_data(self, wind_df, mwr_df):
"""时间-高度双维度数据匹配"""
# 1. 时间同步:微波辐射计重采样到风廓线时间点
mwr_sync = mwr_df.set_index("DateTime").resample("6min").mean().reset_index()
# 确保时间列有效
if mwr_sync["DateTime"].isna().any():
mwr_sync = mwr_sync.dropna(subset=["DateTime"])
if mwr_sync.empty:
raise ValueError("微波辐射计数据处理后为空,请检查数据格式")
# 2. 高度匹配:提取风廓线高度对应的温度
merged_data = []
# 转换微波时间为numpy数组(兼容旧版pandas)
mwr_times = mwr_sync["DateTime"].values
for _, wind_row in wind_df.iterrows():
# 获取风廓线数据时间
wind_time = wind_row["DateTime"]
if pd.isna(wind_time):
continue
# 匹配同时间的微波数据
time_mask = mwr_sync["DateTime"] == wind_time
mwr_matched = mwr_sync[time_mask]
if mwr_matched.empty:
# 如果没有完全匹配的时间,找最近的
# 计算时间差(转换为数值进行比较)
# 将时间转换为Unix时间戳(秒)
mwr_ts = mwr_sync["DateTime"].apply(lambda x: x.timestamp()).values
wind_ts = wind_time.timestamp()
time_diff = np.abs(mwr_ts - wind_ts)
closest_idx = np.argmin(time_diff)
mwr_matched = mwr_sync.iloc[[closest_idx]]
# 匹配最接近的高度温度(风廓线高度→微波辐射计高度)
wind_h = wind_row["采样高度(m)"]
mwr_temp_cols = [col for col in mwr_matched.columns if "Temp_" in col]
if not mwr_temp_cols: # 如果没有温度列,跳过
continue
mwr_heights = [int(col.replace("Temp_", "").replace("m", "")) for col in mwr_temp_cols]
closest_h_idx = np.argmin(np.abs(np.array(mwr_heights) - wind_h))
closest_temp_col = mwr_temp_cols[closest_h_idx]
# 获取地面气压
sur_pre = None
if "SurPre(hPa)" in mwr_matched.columns:
sur_pre = mwr_matched["SurPre(hPa)"].values[0]
elif "地面气压(hPa)" in mwr_matched.columns:
sur_pre = mwr_matched["地面气压(hPa)"].values[0]
if sur_pre is None or np.isnan(sur_pre):
continue
# 获取温度值
temp_value = mwr_matched[closest_temp_col].values[0]
if np.isnan(temp_value):
continue
# 合并数据
merged_row = {
"DateTime": wind_row["DateTime"],
"Height(m)": wind_h,
"u": wind_row["u"],
"v": wind_row["v"],
"w": wind_row["w"],
"Sigma": wind_row["速度谱宽(m/s)"],
"SNR": wind_row[[col for col in wind_row.index if "信噪比" in col or "SNR" in col][0]],
"Temp(℃)": temp_value,
"SurPre(hPa)": sur_pre
}
merged_data.append(merged_row)
result_df = pd.DataFrame(merged_data)
if result_df.empty:
print("警告:数据同步后为空,可能是因为时间或高度匹配失败")
return result_df
# -------------------------- 3. 特征工程与模型构建 --------------------------
class TurbulenceModel:
def __init__(self):
self.g = 9.8 # 重力加速度
self.model_a = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
self.model_b = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
self.scaler_a = StandardScaler() # 模型a专用标准化器
self.scaler_b = StandardScaler() # 模型b专用标准化器
def calculate_merged_features(self, data):
"""计算融合数据的特征(包含温度相关特征)"""
if data.empty:
raise ValueError("输入数据为空,无法计算特征")
data = data.copy()
# 1. 位温θ(K):θ = T*(1000/P)^0.286(T转换为开尔文)
data["Temp(K)"] = data["Temp(℃)"] + 273.15
data["Theta"] = data["Temp(K)"] * (1000 / data["SurPre(hPa)"]) ** 0.286
# 2. 垂直梯度(相邻高度差)
data = data.sort_values(["DateTime", "Height(m)"])
for col in ["u", "v", "Theta"]:
data[f"d{col}_dz"] = data.groupby("DateTime")[col].diff() / data.groupby("DateTime")["Height(m)"].diff()
# 3. 理查逊数Ri
data["Ri"] = (self.g / data["Theta"] * data["dTheta_dz"]) / (data["du_dz"]**2 + data["dv_dz"]**2)
data["Ri"] = data["Ri"].replace([np.inf, -np.inf], np.nan).fillna(0)
# 4. 湍流动能TKE(目标变量:σ→TKE=3σ²/2)
data["TKE"] = 1.5 * (data["Sigma"] ** 2)
return handle_missing(data)
def calculate_wind_only_features(self, data):
"""计算仅风廓线数据的特征(不包含温度相关特征)"""
if data.empty:
raise ValueError("输入数据为空,无法计算特征")
data = data.copy()
# 1. 垂直梯度(仅基于风场)
data = data.sort_values(["DateTime", "Height(m)"])
for col in ["u", "v", "w"]:
data[f"d{col}_dz"] = data.groupby("DateTime")[col].diff() / data.groupby("DateTime")["Height(m)"].diff()
# 2. 湍流动能TKE(目标变量:σ→TKE=3σ²/2)
data["TKE"] = 1.5 * (data["Sigma"] ** 2)
return handle_missing(data)
def train_model_a(self, merged_data):
"""模型a:融合风廓线+微波辐射计数据"""
# 计算融合特征
feature_data = self.calculate_merged_features(merged_data)
# 特征与目标变量
features_a = ["u", "v", "w", "Sigma", "SNR", "Ri", "dTheta_dz", "du_dz", "dv_dz"]
# 确保所有特征都存在
missing_features = [f for f in features_a if f not in feature_data.columns]
if missing_features:
print(f"警告:模型a缺少特征 {missing_features},将使用可用特征")
features_a = [f for f in features_a if f in feature_data.columns]
X = feature_data[features_a]
y = feature_data["TKE"]
# 标准化与训练
X_scaled = self.scaler_a.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
self.model_a.fit(X_train, y_train)
# 预测与评估
y_pred = self.model_a.predict(X_test)
metrics_a = {
"RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
"MAE": mean_absolute_error(y_test, y_pred),
"R2": r2_score(y_test, y_pred)
}
print("模型a(融合数据)评估指标:", metrics_a)
# 返回带预测结果的完整数据
feature_data["TKE_a"] = self.model_a.predict(self.scaler_a.transform(X))
return feature_data, metrics_a
def train_model_b(self, wind_data, model_a_result):
"""模型b:仅风廓线数据,以模型a结果为标准"""
# 计算仅风数据的特征
wind_feature_data = self.calculate_wind_only_features(wind_data)
# 特征(无微波相关特征)
features_b = ["u", "v", "w", "Sigma", "SNR", "du_dz", "dv_dz", "dw_dz"]
# 确保所有特征都存在
missing_features = [f for f in features_b if f not in wind_feature_data.columns]
if missing_features:
print(f"警告:模型b缺少特征 {missing_features},将使用可用特征")
features_b = [f for f in features_b if f in wind_feature_data.columns]
X_b = wind_feature_data[features_b]
# 目标变量:模型a的TKE结果(按高度和时间匹配)
# 创建匹配键:时间+高度
model_a_result["match_key"] = model_a_result["DateTime"].astype(str) + "_" + model_a_result["Height(m)"].astype(str)
wind_feature_data["match_key"] = wind_feature_data["DateTime"].astype(str) + "_" + wind_feature_data["Height(m)"].astype(str)
# 合并获取目标值
merged_for_target = pd.merge(
wind_feature_data[["match_key"]],
model_a_result[["match_key", "TKE_a"]],
on="match_key",
how="left"
)
y_b = merged_for_target["TKE_a"].values
# 处理可能的NaN值
valid_mask = ~np.isnan(y_b)
if np.sum(valid_mask) < 10: # 确保有足够的有效数据
raise ValueError("模型b训练数据不足,请检查数据匹配情况")
X_b = X_b[valid_mask]
y_b = y_b[valid_mask]
# 标准化与训练
X_b_scaled = self.scaler_b.fit_transform(X_b)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b_scaled, y_b, test_size=0.2, random_state=42)
self.model_b.fit(X_train_b, y_train_b)
# 预测与优化
y_pred_b = self.model_b.predict(X_test_b)
metrics_b = {
"RMSE": np.sqrt(mean_squared_error(y_test_b, y_pred_b)),
"MAE": mean_absolute_error(y_test_b, y_pred_b),
"R2": r2_score(y_test_b, y_pred_b)
}
print("模型b(仅风廓线)初始评估指标:", metrics_b)
# 结果合并
wind_feature_data["TKE_b"] = np.nan
wind_feature_data.loc[valid_mask, "TKE_b"] = self.model_b.predict(self.scaler_b.transform(X_b))
return wind_feature_data, metrics_b
# -------------------------- 4. 主执行流程 --------------------------
if __name__ == "__main__":
try:
# 1. 数据预处理
preprocessor = DataPreprocessor()
print("正在处理微波辐射计数据...")
mwr_data = preprocessor.process_mwr(file_paths["a_mwr"])
print("正在处理风廓线数据...")
wind_data = preprocessor.process_wind_profile(file_paths["a_raw"])
# 重命名风数据列以保持一致性
wind_data_renamed = wind_data.rename(columns={
"采样高度(m)": "Height(m)",
"速度谱宽(m/s)": "Sigma",
[col for col in wind_data.columns if "信噪比" in col or "SNR" in col][0]: "SNR"
})
print("正在同步数据...")
merged_data = preprocessor.sync_data(wind_data, mwr_data)
if merged_data.empty:
raise ValueError("数据同步后为空,请检查输入数据格式")
# 2. 特征计算与模型训练
turbulence_model = TurbulenceModel()
# 模型a训练(使用融合数据)
print("正在训练模型a...")
model_a_result, metrics_a = turbulence_model.train_model_a(merged_data)
# 模型b训练与优化(仅使用风数据)
print("正在训练模型b...")
model_b_result, metrics_b = turbulence_model.train_model_b(
wind_data=wind_data_renamed,
model_a_result=model_a_result
)
# 3. 结果保存
# 3.1 模型评估指标
metrics_df = pd.DataFrame({
"模型": ["模型a(融合数据)", "模型b(仅风廓线)"],
"RMSE": [metrics_a["RMSE"], metrics_b["RMSE"]],
"MAE": [metrics_a["MAE"], metrics_b["MAE"]],
"R²": [metrics_a["R2"], metrics_b["R2"]]
})
metrics_df.to_csv(os.path.join(data_dir, "模型评估指标.csv"), index=False, encoding="utf-8-sig")
# 3.2 湍流强度廓线结果(按高度排序)
# 确保两个结果有相同的匹配键
model_a_result = model_a_result[["match_key", "Height(m)", "TKE_a"]].drop_duplicates()
model_b_result = model_b_result[["match_key", "Height(m)", "TKE_b"]].drop_duplicates()
profile_result = pd.merge(
model_a_result[["Height(m)", "TKE_a"]].rename(columns={"TKE_a": "湍流强度_模型a"}),
model_b_result[["Height(m)", "TKE_b"]].rename(columns={"TKE_b": "湍流强度_模型b"}),
on="Height(m)",
how="inner"
).sort_values("Height(m)")
profile_result.to_csv(os.path.join(data_dir, "湍流强度廓线结果.csv"), index=False, encoding="utf-8-sig")
# 3.3 模型参数保存
joblib.dump(turbulence_model.model_b, os.path.join(data_dir, "优化后模型b.pkl"))
print("="*50)
print("问题一计算完成!输出文件:")
print(f"1. 模型评估指标.csv -> {data_dir}")
print(f"2. 湍流强度廓线结果.csv -> {data_dir}")
print(f"3. 优化后模型b.pkl -> {data_dir}")
print("="*50)
except Exception as e:
print(f"执行过程中出错: {str(e)}")
# 打印详细错误信息
import traceback
traceback.print_exc()
最新发布