import xarray as xr
import pandas as pd
import numpy as np
import os
from glob import glob
import re
# ================== 用户参数设置 ==================
data_dir = r"D:\CHINAdate" # 根目录(含子目录)
output_dir = r"D:\CHINAdate\Excel_By_Year" # 输出路径
os.makedirs(output_dir, exist_ok=True)
# 目标站点
target_x = 107.67 # 经度
target_y = 35.2333 # 纬度
# 支持的变量列表及其单位(用于列名和元数据)
variables = {
'hurs': '%', # 相对湿度
'tas': '°C', # 平均气温 (需转K→℃)
'tasmax': '°C', # 最高气温
'tasmin': '°C', # 最低气温
'pr': 'mm/day', # 降水(假设是日降水量 kg/m²/s → mm/day)
'ps': 'hPa', # 地面气压
'rsds': 'W/m²', # 向下短波辐射
'rlds': 'W/m²', # 向下长波辐射
'sfcWind': 'm/s', # 风速(注意大小写可能不同)
'pet': 'mm/day' # 蒸散发(如 FAO-PET 或其他模型输出)
}
print(f"📍 目标站点: 长武县, 陕西")
print(f" x={target_x:.4f}, y={target_y:.4f}")
print(f"📊 变量列表: {', '.join(variables.keys())}\n")
# ================== 递归查找所有 .nc 文件 ==================
nc_files = sorted(glob(os.path.join(data_dir, "**", "*.nc"), recursive=True))
if not nc_files:
raise FileNotFoundError(f"❌ 未找到任何 .nc 文件:\n {data_dir}")
print(f"📦 发现 {len(nc_files)} 个 NetCDF 文件(含子目录)")
# ================== 按变量分类文件 ==================
file_dict = {var: [] for var in variables}
for file_path in nc_files:
fname = os.path.basename(file_path).lower()
for var in variables:
if var.lower() in fname or f"_{var}_" in fname or fname.startswith(var):
file_dict[var].append(file_path)
break
# 检查每个变量是否有文件
missing_vars = [var for var, files in file_dict.items() if not files]
if missing_vars:
print(f"⚠️ 以下变量未找到对应文件: {missing_vars}")
else:
print("✅ 所有变量均找到至少一个文件")
# ================== 存储结果:按年分组的字典 {year: {var: [series_list]}} ==================
yearly_data = {} # 结构: {2025: {'hurs': [s1,s2], 'tas': [s1]}, ...}
# ================== 循环处理每个变量的每个文件 ==================
for var_name in variables:
for file_path in file_dict[var_name]:
rel_path = os.path.relpath(file_path, data_dir)
print(f"\n🔄 正在处理 [{var_name}]: {rel_path}")
try:
ds = xr.open_dataset(file_path, engine='netcdf4')
if var_name not in ds:
print(f" ❌ 跳过: 文件中不包含变量 '{var_name}'")
ds.close()
continue
data = ds[var_name]
# 确保坐标是 x/y
if 'x' not in data.coords or 'y' not in data.coords:
print(f" ❌ 跳过: 缺少 'x' 或 'y' 坐标")
ds.close()
continue
x_vals = data.x.values
y_vals = data.y.values
if not (x_vals.min() <= target_x <= x_vals.max()):
print(f" ❌ 跳过: x 超出范围")
ds.close()
continue
if not (y_vals.min() <= target_y <= y_vals.max()):
print(f" ❌ 跳过: y 超出范围")
ds.close()
continue
# 插值到目标点
point_data = data.interp(x=target_x, y=target_y, method='linear')
series = point_data.to_series()
if series.empty:
print(f" ❌ 跳过: 插值为空")
ds.close()
continue
# 时间索引标准化为 datetime
try:
series.index = pd.to_datetime(series.index)
except Exception as e:
print(f" ❌ 时间解析失败: {e}")
ds.close()
continue
# 单位转换(特别重要)
values = series.values
if var_name in ['tas', 'tasmax', 'tasmin']: # K → °C
values = values - 273.15
elif var_name == 'pr': # kg/m²/s → mm/day
values = values * 86400
elif var_name == 'ps': # Pa → hPa
values = values / 100.0
# 构建新 Series
cleaned_series = pd.Series(values, index=series.index, name=var_name)
cleaned_series.dropna(inplace=True)
if cleaned_series.empty:
print(f" ❌ 跳过: 数据全为 NaN")
ds.close()
continue
# 提取年份(用第一个时间点)
year = cleaned_series.index[0].year
# 初始化 yearly_data 结构
if year not in yearly_data:
yearly_data[year] = {v: [] for v in variables}
yearly_data[year][var_name].append(cleaned_series)
print(f" ✅ 成功提取 {len(cleaned_series)} 天数据 → 归属年份: {year}")
ds.close()
except Exception as e:
print(f" ❌ 处理失败: {type(e).__name__}: {e}")
continue
# ================== 按年生成综合 Excel 文件 ==================
for year in sorted(yearly_data.keys()):
print(f"\n📌 开始生成 {year} 年的多变量数据...")
# 构建 DataFrame 列表
dfs_by_var = []
source_info = []
for var_name in variables:
series_list = yearly_data[year][var_name]
if not series_list:
print(f" ⚠️ {var_name} 在 {year} 年无数据")
continue
# 合并同一变量的多个来源(按时间对齐后平均或保留?这里我们拼接)
combined_series = pd.concat(series_list).sort_index()
# 去重:相同时间点取均值
daily = combined_series.groupby(combined_series.index).mean() # 日平均(防重复)
df_var = daily.to_frame(name=var_name)
dfs_by_var.append(df_var)
# 记录来源
sources = [os.path.basename(f).replace('.nc', '') for f in file_dict[var_name]]
source_info.append(f"{var_name}: " + ", ".join(sources[:3]) + ("..." if len(sources) > 3 else ""))
# 所有变量合并成一张表(时间对齐)
if not dfs_by_var:
print(f"❌ {year} 年没有任何变量数据,跳过")
continue
final_df = pd.concat(dfs_by_var, axis=1)
final_df.index.name = 'Date'
final_df.reset_index(inplace=True)
final_df.sort_values('Date', inplace=True)
# 保存路径
output_file = os.path.join(output_dir, f"Weather_Changwu_{year}.xlsx")
try:
with pd.ExcelWriter(output_file) as writer:
# 主表:所有变量
final_df.to_excel(writer, sheet_name="Multi_Variable_Data", index=False, float_format="%.3f")
# 汇总统计表
summary = pd.DataFrame([
{
"Variable": var,
"Unit": variables[var],
"Count": final_df[var].count(),
"Mean": final_df[var].mean(),
"Min": final_df[var].min(),
"Max": final_df[var].max()
}
for var in variables if var in final_df.columns
]).round(3)
summary.to_excel(writer, sheet_name="Summary", index=False)
# 元数据
metadata = pd.DataFrame({
"Parameter": [
"Location", "Latitude (y)", "Longitude (x)",
"Variables", "Source_Files_Summary",
"Created_On"
],
"Value": [
"Changwu County, Shaanxi",
f"{target_y:.4f}",
f"{target_x:.4f}",
", ".join(final_df.columns[1:]),
"\n".join(source_info),
pd.Timestamp.now().strftime("%Y-%m-%d %H:%M")
]
})
metadata.to_excel(writer, sheet_name="Metadata", index=False)
print(f"📁 已生成多变量文件: Weather_Changwu_{year}.xlsx ({final_df.shape[0]} 行)")
except Exception as e:
print(f"❌ 保存 {year} 年文件失败: {e}")
print(f"\n🎉 所有年份的多变量 Excel 文件已生成!")
print(f"💾 路径: {output_dir}")
在这个基础之上,在表格每一行变量名称后面加单位