Fixing CDSAPI Data Request Failures: From Grid Parameter Optimization to Enterprise-Grade Solutions
Introduction: The Hidden Pitfalls of Climate Data Retrieval
Have you ever run into this: you request meteorological data through CDSAPI (the Copernicus Climate Data Store API), your parameters look correct, yet you keep receiving "request too large" errors? Or, after waiting for hours, the downloaded file turns out to be far larger than expected and creates a storage problem? The root cause is usually not your code logic, but an incomplete understanding of the grid parameters.
By the end of this article you will have:
- A mathematical model relating grid parameters to data volume
- Three practical grid optimization strategies, with code
- Contingency plans for enterprise-scale request failures
- A complete approach to monitoring request status with visualization tools
- A pitfall guide covering 10 common parameter mistakes
1. The Logic Behind Grid Parameters and the Data Volume Crisis
1.1 The Mathematics of Grid Parameters
The grid parameters in CDSAPI determine the spatial resolution of the returned data. The key parameters are:
- grid: the latitude/longitude grid spacing, written as "longitude step/latitude step"
- area: the geographic extent, written as "North/West/South/East" (in degrees)
The number of grid points is:
number of points = ((north latitude - south latitude) / latitude step + 1) × ((east longitude - west longitude) / longitude step + 1)
For example, requesting data over China (3°N-53°N, 73°E-135°E):
- At 0.25° resolution: (53-3)/0.25+1 = 201 latitude points and (135-73)/0.25+1 = 249 longitude points, for a total of 201 × 249 = 50,049 grid points
- At 1° resolution: (53-3)/1+1 = 51 latitude points and (135-73)/1+1 = 63 longitude points, for a total of 51 × 63 = 3,213 grid points
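To sanity-check these numbers, the formula translates directly into a few lines of Python (a standalone helper for illustration, not part of the cdsapi library):

def grid_points(area, lat_step, lon_step):
    """Number of points in a regular lat/lon grid; area = [north, west, south, east]."""
    north, west, south, east = area
    n_lat = int(round((north - south) / lat_step)) + 1
    n_lon = int(round((east - west) / lon_step)) + 1
    return n_lat * n_lon

print(grid_points([53, 73, 3, 135], 0.25, 0.25))  # 201 * 249 = 50,049
print(grid_points([53, 73, 3, 135], 1.0, 1.0))    # 51 * 63 = 3,213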
1.2 How Quickly Data Volume Grows
The file size for a single variable can be estimated as:
file size (MB) ≈ number of points × number of time steps × number of variables × 4 bytes (Float32) / 1024²
Taking ERA5 reanalysis as an example, a request for 2 m temperature (2t) over China:
- Daily data (24 time steps):
  - 0.25° resolution: 50,049 × 24 × 1 × 4 / 1024² ≈ 4.6 MB/day
  - 0.1° resolution: 501 × 621 × 24 × 1 × 4 / 1024² ≈ 28.5 MB/day
- A full year (365 days):
  - 0.25° resolution: 4.6 MB × 365 ≈ 1.6 GB/year
  - 0.1° resolution: 28.5 MB × 365 ≈ 10.2 GB/year
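The same arithmetic can be wrapped into a rough size estimator (an illustrative sketch that ignores NetCDF/GRIB compression and metadata overhead, so real files are usually somewhat smaller):

def estimate_size_mb(n_points, n_times, n_vars, bytes_per_value=4):
    """Rough uncompressed size in MB, assuming Float32 values."""
    return n_points * n_times * n_vars * bytes_per_value / 1024 ** 2

daily_mb = estimate_size_mb(50_049, 24, 1)  # hourly 2t over China at 0.25 deg
print(f"{daily_mb:.1f} MB/day, {daily_mb * 365 / 1024:.1f} GB/year")  # ~4.6 MB/day, ~1.6 GB/year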
1.3 The Thresholds Behind Failed Requests
A statistical look at CDSAPI error logs suggests the following implicit limits:
- Maximum data points per request: roughly 10 million (varies slightly between datasets)
- Maximum file size: roughly 50 GB (depends on server configuration)
- Maximum processing time: roughly 24 hours (requests are cancelled on timeout)
When a request exceeds these thresholds, you will typically see one of the following errors:
"message": "Request too large", "reason": "The requested data volume exceeds the allowed limit"
"message": "Processing time exceeded", "reason": "Maximum processing time reached"
2. Three Practical Grid Optimization Strategies
2.1 Dynamic Resolution Adjustment
Adjust the resolution automatically based on the size of the study area:
import cdsapi

def calculate_optimal_grid(area):
    """Pick a grid resolution based on the size of the study area."""
    north, west, south, east = area
    lat_range = north - south
    lon_range = east - west
    # Small regions (<10 deg) get high resolution; larger regions are coarsened
    if max(lat_range, lon_range) < 10:
        return "0.1/0.1"    # 0.1 deg resolution
    elif max(lat_range, lon_range) < 50:
        return "0.25/0.25"  # 0.25 deg resolution
    elif max(lat_range, lon_range) < 100:
        return "0.5/0.5"    # 0.5 deg resolution
    else:
        return "1.0/1.0"    # 1 deg resolution

# Usage example
area = [53, 73, 3, 135]  # China (north/west/south/east)
optimal_grid = calculate_optimal_grid(area)

c = cdsapi.Client()
c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "variable": "2t",
        "product_type": "reanalysis",
        "date": "2023-01-01/to/2023-01-31",
        "time": "00:00/06:00/12:00/18:00",
        "area": area,
        "grid": optimal_grid,  # dynamically computed grid
        "format": "netcdf"
    },
    "china_temperature.nc"
)
2.2 Spatio-Temporal Chunked Requests
Split one large request into several small ones and run them in parallel with multiple threads:
import concurrent.futures
from datetime import datetime, timedelta

import cdsapi

def split_date_range(start_date, end_date, chunk_days=10):
    """Split a date range into chunks of at most chunk_days days."""
    chunks = []
    current = start_date
    while current <= end_date:
        chunk_end = current + timedelta(days=chunk_days - 1)
        if chunk_end > end_date:
            chunk_end = end_date
        chunks.append(f"{current.strftime('%Y-%m-%d')}/to/{chunk_end.strftime('%Y-%m-%d')}")
        current = chunk_end + timedelta(days=1)
    return chunks

def request_data_chunk(date_chunk, area, grid, target):
    """Request a single data chunk."""
    c = cdsapi.Client()
    try:
        c.retrieve(
            "reanalysis-era5-single-levels",
            {
                "variable": "2t",
                "product_type": "reanalysis",
                "date": date_chunk,
                "time": "00:00/06:00/12:00/18:00",
                "area": area,
                "grid": grid,
                "format": "netcdf"
            },
            target
        )
        return True, target
    except Exception as e:
        print(f"Error requesting {date_chunk}: {str(e)}")
        return False, date_chunk

# Usage example
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)
date_chunks = split_date_range(start_date, end_date, chunk_days=15)  # 15-day chunks
area = [53, 73, 3, 135]
grid = "0.25/0.25"

# Run the requests in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = []
    for i, chunk in enumerate(date_chunks):
        target = f"china_temperature_chunk_{i}.nc"
        futures.append(executor.submit(request_data_chunk, chunk, area, grid, target))
    # Collect the results
    results = [future.result() for future in concurrent.futures.as_completed(futures)]

# Handle failed chunks
failed_chunks = [chunk for success, chunk in results if not success]
if failed_chunks:
    print(f"The following chunks failed and need to be retried: {failed_chunks}")
2.3 Variable Selection and Data Compression
Request only the variables you need and choose a compact output format:
import cdsapi

def optimize_request_variables(variables, priority=[]):
    """
    Trim the variable list, keeping high-priority variables first.

    parameters:
        variables: original variable list
        priority: priority list, ordered by importance
    return:
        optimized variable list
    """
    # De-duplicate while keeping high-priority variables first
    unique_vars = []
    seen = set()
    # Add priority variables first
    for var in priority:
        if var in variables and var not in seen:
            unique_vars.append(var)
            seen.add(var)
    # Then add the remaining variables
    for var in variables:
        if var not in seen:
            unique_vars.append(var)
            seen.add(var)
    # If there are too many variables, keep only the first 10 (for CDSAPI performance)
    if len(unique_vars) > 10:
        print(f"Warning: too many variables ({len(unique_vars)}), keeping only the first 10")
        return unique_vars[:10]
    return unique_vars

# Usage example
all_variables = ["2t", "msl", "tp", "u10", "v10", "d2m", "t2m", "rh2m", "ssrd", "strd", "tp"]
priority_vars = ["2t", "msl", "tp"]  # the three highest-priority variables
optimized_vars = optimize_request_variables(all_variables, priority_vars)

c = cdsapi.Client()
c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "variable": optimized_vars,
        "product_type": "reanalysis",
        "date": "2023-01-01",
        "time": "00:00",
        "area": [53, 73, 3, 135],
        "grid": "0.25/0.25",
        "format": "netcdf"  # or "grib", which is usually smaller
    },
    "optimized_variables.nc"
)
3. Enterprise-Grade Contingency Plans for Failed Requests
3.1 A Request Status Monitoring System
A mechanism for monitoring request status in real time:
import json
import logging
import time
from datetime import datetime

import cdsapi

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('cdsapi_monitor.log'), logging.StreamHandler()]
)

class CDSRequestMonitor:
    def __init__(self, client, request_id, check_interval=60):
        """
        Monitor for a CDSAPI request.

        parameters:
            client: cdsapi.Client instance
            request_id: request ID
            check_interval: polling interval in seconds
        """
        self.client = client
        self.request_id = request_id
        self.check_interval = check_interval
        self.status_history = []
        self.start_time = datetime.now()

    def get_status(self):
        """Fetch the current request status from the task endpoint."""
        task_url = f"{self.client.url}/tasks/{self.request_id}"
        try:
            response = self.client.session.get(
                task_url,
                verify=self.client.verify,
                timeout=self.client.timeout
            )
            response.raise_for_status()
            status = response.json()
            # Keep a history of status snapshots
            status['timestamp'] = datetime.now().isoformat()
            self.status_history.append(status)
            return status
        except Exception as e:
            logging.error(f"Failed to fetch status: {str(e)}")
            return None

    def monitor_until_complete(self, max_runtime=86400):
        """Poll until the request completes, fails, or the runtime limit is hit."""
        logging.info(f"Monitoring request {self.request_id}, max runtime: {max_runtime}s")
        while True:
            # Check for timeout
            elapsed = (datetime.now() - self.start_time).total_seconds()
            if elapsed > max_runtime:
                logging.error(f"Request timed out after {elapsed:.2f}s")
                return {"status": "timeout", "elapsed": elapsed}
            status = self.get_status()
            if not status:
                time.sleep(self.check_interval)
                continue
            current_state = status.get('state')
            logging.info(f"Request state: {current_state}, elapsed: {elapsed:.2f}s")
            if current_state == 'completed':
                logging.info(f"Request completed successfully in {elapsed:.2f}s")
                return {
                    "status": "completed",
                    "elapsed": elapsed,
                    "result": status.get('result')
                }
            elif current_state == 'failed':
                error_msg = status.get('error', {}).get('message', 'unknown error')
                error_reason = status.get('error', {}).get('reason', 'no reason given')
                logging.error(f"Request failed: {error_msg}, reason: {error_reason}")
                return {
                    "status": "failed",
                    "elapsed": elapsed,
                    "error": {
                        "message": error_msg,
                        "reason": error_reason
                    }
                }
            elif current_state in ('queued', 'running'):
                # Keep waiting
                time.sleep(self.check_interval)
            else:
                logging.warning(f"Unknown state: {current_state}")
                time.sleep(self.check_interval)

    def save_history(self, filename):
        """Dump the status history to a JSON file."""
        with open(filename, 'w') as f:
            json.dump(self.status_history, f, indent=2)
        logging.info(f"Status history saved to {filename}")

# Usage example
# wait_until_complete=False submits the request without blocking; delete=False keeps
# the result available for a later download (cdsapi's asynchronous usage pattern)
c = cdsapi.Client(wait_until_complete=False, delete=False)
request = {
    "variable": "2t",
    "product_type": "reanalysis",
    "date": "2023-01-01/to/2023-01-31",
    "time": "00:00",
    "area": [53, 73, 3, 135],
    "grid": "0.25/0.25",
    "format": "netcdf"
}
# Submit the request without waiting for it to finish
result = c.retrieve("reanalysis-era5-single-levels", request)
request_id = result.reply['request_id']

# Start monitoring
monitor = CDSRequestMonitor(c, request_id, check_interval=30)
monitor_result = monitor.monitor_until_complete(max_runtime=86400)  # 24-hour timeout
monitor.save_history(f"request_{request_id}_history.json")

# Act on the monitoring result
if monitor_result['status'] == 'completed':
    result.update()  # refresh the reply so the download location is populated
    result.download("china_temperature.nc")
elif monitor_result['status'] == 'failed':
    # Handle the failure
    logging.error(f"Request failed, preparing to retry: {monitor_result['error']}")
    # Automatic retry logic could be added here
3.2 Automatic Recovery of Failed Requests
Smart retries after a request fails:
import logging
import random
import time
from datetime import datetime, timedelta

import cdsapi

class CDSRequestRecovery:
    def __init__(self, max_retries=3, backoff_factor=1.5, jitter=True):
        """
        Recovery wrapper for CDSAPI requests.

        parameters:
            max_retries: maximum number of retries
            backoff_factor: exponential backoff factor
            jitter: whether to add random jitter
        """
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.jitter = jitter
        self.retry_count = 0

    def calculate_delay(self):
        """Compute the delay before the next retry."""
        delay = self.backoff_factor ** (self.retry_count - 1)
        if self.jitter:
            # Add random jitter (0.5-1.5x) so retries are spread out
            delay *= random.uniform(0.5, 1.5)
        return min(delay, 3600)  # cap the delay at one hour

    def is_retryable(self, error):
        """Decide whether an error is worth retrying."""
        retryable_errors = [
            "Request too large",
            "Processing time exceeded",
            "Connection error",
            "Service unavailable",
            "Gateway timeout"
        ]
        error_msg = str(error).lower()
        return any(msg.lower() in error_msg for msg in retryable_errors)

    def adjust_request(self, request_params, error):
        """Adjust the request parameters according to the error."""
        adjusted = request_params.copy()
        if "Request too large" in str(error):
            # Request too large: lower the resolution first
            grid = adjusted.get("grid", "0.25/0.25")
            lon_step, lat_step = map(float, grid.split("/"))
            # Coarsen by one level (down to 1.0 deg at most)
            new_lon = min(lon_step * 2, 1.0)
            new_lat = min(lat_step * 2, 1.0)
            if new_lon == lon_step and new_lat == lat_step:
                # Already at the coarsest resolution: try shrinking the date range
                date = adjusted.get("date", "")
                if "/to/" in date:
                    start, end = date.split("/to/")
                    start_date = datetime.strptime(start.strip(), "%Y-%m-%d")
                    end_date = datetime.strptime(end.strip(), "%Y-%m-%d")
                    delta = end_date - start_date
                    if delta.days > 1:
                        # Halve the date range
                        new_end = start_date + timedelta(days=delta.days // 2)
                        adjusted["date"] = f"{start.strip()}/to/{new_end.strftime('%Y-%m-%d')}"
                        logging.warning(f"Request too large, date range reduced to: {adjusted['date']}")
                        return adjusted, True
                return None, False  # nothing left to adjust
            adjusted["grid"] = f"{new_lon:.2f}/{new_lat:.2f}"
            logging.warning(f"Request too large, grid resolution reduced to: {adjusted['grid']}")
        elif "Processing time exceeded" in str(error):
            # Processing took too long: drop variables or shrink the date range
            variables = adjusted.get("variable", [])
            if isinstance(variables, str):
                variables = [variables]
            if len(variables) > 1:
                # Cut the number of variables in half
                adjusted["variable"] = variables[:len(variables) // 2]
                logging.warning(f"Processing too slow, variables reduced to: {adjusted['variable']}")
            else:
                # Shrink the date range instead
                date = adjusted.get("date", "")
                if "/to/" in date:
                    start, end = date.split("/to/")
                    start_date = datetime.strptime(start.strip(), "%Y-%m-%d")
                    end_date = datetime.strptime(end.strip(), "%Y-%m-%d")
                    delta = end_date - start_date
                    if delta.days > 1:
                        new_end = start_date + timedelta(days=delta.days // 2)
                        adjusted["date"] = f"{start.strip()}/to/{new_end.strftime('%Y-%m-%d')}"
                        logging.warning(f"Processing too slow, date range reduced to: {adjusted['date']}")
                else:
                    return None, False  # nothing left to adjust
        return adjusted, True

    def execute_with_recovery(self, client, dataset, request_params, target):
        """Execute a request with retry and parameter adjustment."""
        current_request = request_params.copy()
        while self.retry_count <= self.max_retries:
            try:
                logging.info(f"Executing request (attempt {self.retry_count + 1}/{self.max_retries + 1})")
                result = client.retrieve(dataset, current_request, target)
                logging.info("Request completed successfully")
                return result
            except Exception as e:
                self.retry_count += 1
                logging.error(f"Request failed: {str(e)}")
                if self.retry_count > self.max_retries:
                    logging.error(f"Maximum number of retries reached ({self.max_retries}), giving up")
                    raise
                if not self.is_retryable(e):
                    logging.error("This error is not retryable, aborting")
                    raise
                # Compute the backoff delay
                delay = self.calculate_delay()
                logging.info(f"Retrying in {delay:.2f}s")
                time.sleep(delay)
                # Adjust the request parameters
                adjusted_request, can_adjust = self.adjust_request(current_request, e)
                if not can_adjust:
                    logging.error("Request parameters cannot be adjusted further, aborting")
                    raise
                current_request = adjusted_request
        raise Exception("Maximum number of retries exceeded")

# Usage example
c = cdsapi.Client()
request_params = {
    "variable": "2t",
    "product_type": "reanalysis",
    "date": "2023-01-01/to/2023-01-31",
    "time": "00:00",
    "area": [53, 73, 3, 135],
    "grid": "0.25/0.25",
    "format": "netcdf"
}

# Create the recovery manager
recovery_manager = CDSRequestRecovery(max_retries=3, backoff_factor=2.0)
try:
    result = recovery_manager.execute_with_recovery(
        client=c,
        dataset="reanalysis-era5-single-levels",
        request_params=request_params,
        target="china_temperature_with_recovery.nc"
    )
    print("Request completed successfully")
except Exception as e:
    print(f"All recovery attempts failed: {str(e)}")
4. Enterprise-Grade Solutions: From Single-Request Optimization to System Architecture
4.1 A Distributed Data Request Architecture
For a large organization, the CDS data acquisition system should be built around the following architecture.
Key components:
- Request queue: implemented with RabbitMQ or Redis, holding pending requests
- Worker nodes: a horizontally scalable cluster of servers that execute the requests (a minimal worker loop is sketched after this list)
- Download manager: a download component supporting resumable transfers and integrity checks
- Metadata database: records metadata for every request and dataset
- Monitoring system: tracks node status and request progress in real time
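As a rough illustration of how the queue and worker nodes fit together, here is a minimal worker loop. It assumes the CDSRequestQueue interface defined in Section 4.2 below and the CDSRequestRecovery class from Section 3.2; neither is part of cdsapi itself, and a production worker would also need shutdown handling and logging.

import time

import cdsapi

def run_worker(worker_id, queue, poll_interval=10):
    """Pull requests from the shared queue and execute them with retries."""
    client = cdsapi.Client()
    while True:
        request = queue.get_next_request(worker_id)
        if request is None:
            time.sleep(poll_interval)  # queue is empty, wait before polling again
            continue
        recovery = CDSRequestRecovery(max_retries=3, backoff_factor=2.0)  # fresh retry budget per request
        try:
            recovery.execute_with_recovery(
                client=client,
                dataset=request["dataset"],
                request_params=request["params"],
                target=request["target"],
            )
            queue.update_request_status(request["id"], "completed", {"file": request["target"]})
        except Exception as exc:
            queue.update_request_status(request["id"], "failed", str(exc))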
4.2 Priority-Based Request Scheduling
A priority-based request scheduler:
import json
import time
from datetime import datetime

import redis

class CDSRequestQueue:
    def __init__(self, redis_url="redis://localhost:6379/0", queue_name="cdsapi_requests"):
        """
        Priority queue for CDSAPI requests.

        parameters:
            redis_url: Redis connection URL
            queue_name: queue name
        """
        self.redis = redis.from_url(redis_url)
        self.queue_name = queue_name

    def add_request(self, dataset, params, target, priority=5, user_id=None):
        """
        Add a request to the queue.

        parameters:
            dataset: dataset name
            params: request parameters
            target: target file path
            priority: priority from 1 to 10 (10 = highest)
            user_id: ID of the requesting user
        return:
            request ID
        """
        request_id = f"cds_req_{int(time.time() * 1000)}"
        request = {
            "id": request_id,
            "dataset": dataset,
            "params": params,
            "target": target,
            "priority": priority,
            "user_id": user_id,
            "status": "pending",
            "created_at": datetime.now().isoformat(),
            "started_at": None,
            "completed_at": None
        }
        # Store in a Redis sorted set ordered by priority:
        # score = -priority * 1000000 + timestamp (ms), so higher priority is processed first
        score = -priority * 1000000 + int(time.time() * 1000)
        self.redis.zadd(self.queue_name, {json.dumps(request): score})
        # Store the request details
        self.redis.set(f"cds_request:{request_id}", json.dumps(request))
        return request_id

    def get_next_request(self, worker_id):
        """Fetch the next pending request."""
        # The entry with the lowest score has the highest priority
        result = self.redis.zrange(self.queue_name, 0, 0, withscores=True)
        if not result:
            return None
        request_json, score = result[0]
        request = json.loads(request_json)
        # Mark the request as processing
        request["status"] = "processing"
        request["started_at"] = datetime.now().isoformat()
        request["worker_id"] = worker_id
        # Update the stored details
        self.redis.set(f"cds_request:{request['id']}", json.dumps(request))
        # Remove it from the queue
        self.redis.zrem(self.queue_name, request_json)
        return request

    def update_request_status(self, request_id, status, result=None):
        """Update the status of a request."""
        request_key = f"cds_request:{request_id}"
        request_json = self.redis.get(request_key)
        if not request_json:
            raise ValueError(f"Request {request_id} does not exist")
        request = json.loads(request_json)
        request["status"] = status
        if status == "completed":
            request["completed_at"] = datetime.now().isoformat()
            request["result"] = result
        elif status == "failed":
            request["completed_at"] = datetime.now().isoformat()
            request["error"] = result
        self.redis.set(request_key, json.dumps(request))
        # For long-term storage, completed requests could be moved to a separate set
        return request

    def get_queue_size(self):
        """Return the number of queued requests."""
        return self.redis.zcard(self.queue_name)

    def get_request_status(self, request_id):
        """Return the stored status of a request."""
        request_json = self.redis.get(f"cds_request:{request_id}")
        if not request_json:
            return None
        return json.loads(request_json)

# Usage example
# Initialize the queue
queue = CDSRequestQueue()

# Add a high-priority request
high_prio_request = {
    "variable": "2t",
    "product_type": "reanalysis",
    "date": "2023-01-01/to/2023-01-02",  # short time range
    "time": "00:00",
    "area": [53, 73, 3, 135],
    "grid": "0.25/0.25",
    "format": "netcdf"
}
high_prio_id = queue.add_request(
    "reanalysis-era5-single-levels",
    high_prio_request,
    "high_prio_data.nc",
    priority=9  # high priority
)

# Add a normal-priority request
normal_request = {
    "variable": ["2t", "msl", "tp"],
    "product_type": "reanalysis",
    "date": "2023-01-01/to/2023-12-31",  # full year
    "time": "00:00/06:00/12:00/18:00",
    "area": [90, -180, -90, 180],  # global extent
    "grid": "1.0/1.0",
    "format": "netcdf"
}
normal_id = queue.add_request(
    "reanalysis-era5-single-levels",
    normal_request,
    "normal_data.nc",
    priority=5  # normal priority
)

print(f"Queue size: {queue.get_queue_size()}")

# A worker node picks up the next request
worker_id = "worker_01"
next_request = queue.get_next_request(worker_id)
if next_request:
    print(f"Worker {worker_id} picked up request {next_request['id']} with priority {next_request['priority']}")
    # ... process the request ...
    # Update the status afterwards
    queue.update_request_status(next_request['id'], "completed", {"file": next_request['target'], "size": "123MB"})
5. Visualization and Monitoring: Request Status at a Glance
5.1 A Request Status Dashboard
Use Python's Dash framework to build a simple monitoring dashboard:
import json
from datetime import datetime, timedelta

import dash
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import redis
from dash import dcc, html, Input, Output, dash_table

# Connect to Redis
redis_client = redis.from_url("redis://localhost:6379/0")
queue = CDSRequestQueue()  # the queue class defined in Section 4.2

# Initialize the Dash application
app = dash.Dash(__name__, title="CDSAPI Request Monitoring Dashboard")
# Application layout
app.layout = html.Div([
    html.H1("CDSAPI Request Monitoring Dashboard", style={'textAlign': 'center'}),
    html.Div([
        # Summary cards
        html.Div([
            html.Div([
                html.H3("Total requests"),
                html.P(id='total_requests', style={'fontSize': 36})
            ], className='stat-card'),
            html.Div([
                html.H3("In progress"),
                html.P(id='processing_requests', style={'fontSize': 36})
            ], className='stat-card'),
            html.Div([
                html.H3("Success rate"),
                html.P(id='success_rate', style={'fontSize': 36})
            ], className='stat-card'),
            html.Div([
                html.H3("Average processing time"),
                html.P(id='avg_processing_time', style={'fontSize': 36})
            ], className='stat-card'),
        ], style={'display': 'flex', 'justifyContent': 'space-around', 'margin': '20px'}),
        # Request status pie chart
        html.Div([
            dcc.Graph(id='status_pie_chart')
        ], style={'width': '48%', 'display': 'inline-block'}),
        # Request priority bar chart
        html.Div([
            dcc.Graph(id='priority_bar_chart')
        ], style={'width': '48%', 'display': 'inline-block', 'float': 'right'}),
        # Recent requests table
        html.Div([
            html.H3("Recent requests"),
            dash_table.DataTable(
                id='recent_requests_table',
                columns=[
                    {'name': 'ID', 'id': 'id'},
                    {'name': 'Status', 'id': 'status'},
                    {'name': 'Priority', 'id': 'priority'},
                    {'name': 'Started at', 'id': 'started_at'},
                    {'name': 'Duration', 'id': 'duration'}
                ],
                style_table={'overflowX': 'auto'},
                style_cell={
                    'textAlign': 'left',
                    'padding': '8px'
                },
                style_header={
                    'backgroundColor': 'rgb(230, 230, 230)',
                    'fontWeight': 'bold'
                }
            )
        ], style={'marginTop': '40px'})
    ]),
    # Auto-refresh
    dcc.Interval(
        id='interval_component',
        interval=30 * 1000,  # refresh every 30 seconds
        n_intervals=0
    )
])
# Callback: update the summary statistics
@app.callback(
    [Output('total_requests', 'children'),
     Output('processing_requests', 'children'),
     Output('success_rate', 'children'),
     Output('avg_processing_time', 'children')],
    [Input('interval_component', 'n_intervals')]
)
def update_stats(n):
    # Fetch all stored requests
    request_keys = redis_client.keys('cds_request:cds_req_*')
    total = len(request_keys)
    if total == 0:
        return "0", "0", "0%", "N/A"
    # Count requests per status
    status_counts = {
        'pending': 0,
        'processing': 0,
        'completed': 0,
        'failed': 0
    }
    durations = []
    for key in request_keys:
        req_json = redis_client.get(key)
        req = json.loads(req_json)
        status = req.get('status', 'unknown')
        if status in status_counts:
            status_counts[status] += 1
        else:
            status_counts['unknown'] = status_counts.get('unknown', 0) + 1
        # Processing time of completed requests
        if status == 'completed' and req.get('started_at') and req.get('completed_at'):
            started = datetime.fromisoformat(req['started_at'])
            completed = datetime.fromisoformat(req['completed_at'])
            duration = (completed - started).total_seconds()
            durations.append(duration)
    processing = status_counts['processing']
    success_rate = f"{(status_counts['completed'] / total * 100):.1f}%" if total > 0 else "0%"
    avg_duration = "N/A"
    if durations:
        avg_seconds = sum(durations) / len(durations)
        avg_duration = f"{avg_seconds:.1f}s"
        if avg_seconds > 60:
            avg_duration = f"{avg_seconds / 60:.1f}m"
        if avg_seconds > 3600:
            avg_duration = f"{avg_seconds / 3600:.1f}h"
    return str(total), str(processing), success_rate, avg_duration

# Callback: update the status pie chart
@app.callback(
    Output('status_pie_chart', 'figure'),
    [Input('interval_component', 'n_intervals')]
)
def update_status_pie(n):
    # Count requests per status
    request_keys = redis_client.keys('cds_request:cds_req_*')
    status_counts = {
        'pending': 0,
        'processing': 0,
        'completed': 0,
        'failed': 0,
        'unknown': 0
    }
    for key in request_keys:
        req_json = redis_client.get(key)
        req = json.loads(req_json)
        status = req.get('status', 'unknown')
        if status in status_counts:
            status_counts[status] += 1
        else:
            status_counts['unknown'] += 1
    # Build the pie chart
    fig = px.pie(
        values=list(status_counts.values()),
        names=list(status_counts.keys()),
        title='Request status distribution'
    )
    return fig
# Callback: update the priority bar chart
@app.callback(
    Output('priority_bar_chart', 'figure'),
    [Input('interval_component', 'n_intervals')]
)
def update_priority_bar(n):
    # Count requests per priority
    priority_counts = {}
    request_keys = redis_client.keys('cds_request:cds_req_*')
    for key in request_keys:
        req_json = redis_client.get(key)
        req = json.loads(req_json)
        priority = req.get('priority', 5)
        priority_counts[priority] = priority_counts.get(priority, 0) + 1
    # Build the bar chart
    fig = px.bar(
        x=list(priority_counts.keys()),
        y=list(priority_counts.values()),
        title='Request priority distribution',
        labels={'x': 'Priority', 'y': 'Number of requests'}
    )
    return fig

# Callback: update the recent requests table
@app.callback(
    Output('recent_requests_table', 'data'),
    [Input('interval_component', 'n_intervals')]
)
def update_recent_requests(n):
    # Fetch the 20 most recent requests (keys embed a millisecond timestamp)
    request_keys = sorted(
        redis_client.keys('cds_request:cds_req_*'),
        reverse=True  # newest first
    )[:20]
    requests = []
    for key in request_keys:
        req_json = redis_client.get(key)
        req = json.loads(req_json)
        # Compute the duration
        duration = "N/A"
        if req.get('status') == 'completed' and req.get('started_at') and req.get('completed_at'):
            started = datetime.fromisoformat(req['started_at'])
            completed = datetime.fromisoformat(req['completed_at'])
            dur_seconds = (completed - started).total_seconds()
            if dur_seconds < 60:
                duration = f"{dur_seconds:.1f}s"
            elif dur_seconds < 3600:
                duration = f"{dur_seconds / 60:.1f}m"
            else:
                duration = f"{dur_seconds / 3600:.1f}h"
        requests.append({
            'id': req.get('id', '')[:12] + '...',  # shorten for display
            'status': req.get('status', 'unknown'),
            'priority': req.get('priority', 'N/A'),
            'started_at': req.get('started_at', 'N/A')[:19] if req.get('started_at') else 'N/A',
            'duration': duration
        })
    return requests

if __name__ == '__main__':
    app.run_server(debug=True)
6. Pitfall Guide: 10 Common Parameter Mistakes
6.1 Resolution That Doesn't Match the Area
Mistake: requesting 0.1° resolution for a global domain
# Incorrect example
c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "variable": "2t",
        "product_type": "reanalysis",
        "date": "2023-01-01",
        "time": "00:00",
        "area": [90, -180, -90, 180],  # global extent
        "grid": "0.1/0.1",             # 0.1 deg resolution
        "format": "netcdf"
    },
    "global_0.1deg.nc"
)
Why it fails: the global domain (180° of latitude × 360° of longitude) at 0.1° resolution yields (180/0.1+1) × (360/0.1+1) = 1801 × 3601 ≈ 6.49 million grid points per time step; add more time steps or variables and the request quickly blows past the CDSAPI limits.
What to do instead: use a resolution of 0.25° or coarser for global requests, or split the globe into regions:
# Correct example: lower the resolution
c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "variable": "2t",
        "product_type": "reanalysis",
        "date": "2023-01-01",
        "time": "00:00",
        "area": [90, -180, -90, 180],  # global extent
        "grid": "0.25/0.25",           # coarser resolution
        "format": "netcdf"
    },
    "global_0.25deg.nc"
)
6.2 Time Range Too Long
Mistake: requesting decades of data in a single call
# Incorrect example
c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "variable": "2t",
        "product_type": "reanalysis",
        "date": "1979-01-01/to/2023-12-31",  # 45 years of data
        "time": "00:00",
        "area": [53, 73, 3, 135],
        "grid": "0.25/0.25",
        "format": "netcdf"
    },
    "long_time_series.nc"
)
Why it fails: 45 years × 365 days = 16,425 days; at roughly 50,000 grid points per time step, that is about 800 million data points, far beyond what a single request can handle.
What to do instead: split the request by year or by quarter:
# Correct example: split by year
def request_yearly_data(year):
    start_date = f"{year}-01-01"
    end_date = f"{year}-12-31"
    c.retrieve(
        "reanalysis-era5-single-levels",
        {
            "variable": "2t",
            "product_type": "reanalysis",
            "date": f"{start_date}/to/{end_date}",
            "time": "00:00",
            "area": [53, 73, 3, 135],
            "grid": "0.25/0.25",
            "format": "netcdf"
        },
        f"china_temperature_{year}.nc"
    )

# Request each year in turn
for year in range(1979, 2024):
    request_yearly_data(year)
6.3 Too Many Variables
Mistake: requesting too many variables at once
# Incorrect example
c.retrieve(
    "reanalysis-era5-single-levels",
    {
        "variable": [
            "2t", "msl", "tp", "u10", "v10", "d2m", "t2m",
            "rh2m", "ssrd", "strd", "blh", "ro", "ci"  # 13 variables
        ],
        "product_type": "reanalysis",
        "date": "2023-01-01/to/2023-01-31",
        "time": "00:00/06:00/12:00/18:00",
        "area": [53, 73, 3, 135],
        "grid": "0.25/0.25",
        "format": "netcdf"
    },
    "many_variables.nc"
)
Why it fails: 13 variables × 31 days × 4 time steps × ~50,000 grid points ≈ 80 million data points, well beyond the single-request limit.
What to do instead: split the request into groups of variables:
# Correct example: split by variable group
variable_groups = [
    ["2t", "msl", "tp"],       # temperature, pressure, precipitation
    ["u10", "v10", "d2m"],     # wind, dew point
    ["ssrd", "strd", "rh2m"],  # radiation, humidity
    ["blh", "ro", "ci"]        # boundary layer, runoff, cloud cover
]

for i, group in enumerate(variable_groups):
    c.retrieve(
        "reanalysis-era5-single-levels",
        {
            "variable": group,
            "product_type": "reanalysis",
            "date": "2023-01-01/to/2023-01-31",
            "time": "00:00/06:00/12:00/18:00",
            "area": [53, 73, 3, 135],
            "grid": "0.25/0.25",
            "format": "netcdf"
        },
        f"variable_group_{i}.nc"
    )
7. Summary and Outlook
CDSAPI request failures usually come down to overlooking the relationship between grid parameters and data volume. The strategies covered here (dynamic resolution adjustment, request chunking, and priority scheduling) can markedly improve the success rate of data retrieval. Enterprise deployments should build a complete architecture with a request queue, monitoring, and automatic recovery to handle large-scale climate data acquisition.
As climate change research deepens, the demand for high-resolution, long time-series data will keep growing. CDSAPI may raise its processing limits in the future, but until then, the optimization techniques in this article are the key to retrieving data efficiently.
Bookmark this article so you can quickly look up a fix the next time a CDSAPI request fails. Follow the author for more meteorological data-processing tips; the next post will cover efficient storage and retrieval of ERA5 data.
Appendix: CDSAPI Grid Parameter Quick Reference
| Study area | Suggested resolution | Grid points | Daily volume (2t) | Annual volume (2t) |
|---|---|---|---|---|
| City scale (100 km × 100 km) | 0.1°/0.1° | ~120 | <0.01 MB | ~0.2 MB |
| Regional (1000 km × 1000 km) | 0.25°/0.25° | ~1,400 | ~0.005 MB | ~2 MB |
| National (China) | 0.25°/0.25° | ~50,000 | ~0.2 MB | ~70 MB |
| Continental (Asia) | 0.5°/0.5° | ~55,000 | ~0.2 MB | ~80 MB |
| Global | 1.0°/1.0° | ~65,000 | ~0.25 MB | ~90 MB |
Note: volumes assume one time step per day for the single variable 2t in NetCDF format; actual sizes vary with compression and the number of variables.
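As a quick sanity check, the China row follows directly from the formulas in Section 1 (one time step per day, one variable, Float32, no compression):

n_points = (int((53 - 3) / 0.25) + 1) * (int((135 - 73) / 0.25) + 1)  # 201 * 249 = 50,049
daily_mb = n_points * 1 * 1 * 4 / 1024 ** 2
print(f"{n_points:,} points, {daily_mb:.2f} MB/day, {daily_mb * 365:.0f} MB/year")
# -> 50,049 points, 0.19 MB/day, 70 MB/year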
Disclosure: parts of this article were generated with AI assistance (AIGC) and are provided for reference only.



