import numpy as np

# Helpers such as on_enter_lane() and in_junction() are assumed to be defined
# elsewhere in the project.

DEFAULT_REWARD_HYPERPARAMS = {
    # 1. Reward component weights (with adaptive adjustment for accident scenarios)
    "weight": {
        "peak": {  # Peak hours: prioritize reducing delay and waiting
"delay_wait": 0.45,
"queue": 0.15,
"approach": 0.15,
"travel": 0.15,
"coordination": 0.10 # 区域协同权重(高峰保持原设计)
},
"non_peak": { # 非高峰时段:提升协同与通行权重
"delay_wait": 0.35,
"queue": 0.15,
"approach": 0.15,
"travel": 0.20,
"coordination": 0.20 # 【优化】非高峰协同权重从0.15→0.20
},
"accident_adjust": { # 【优化】事故时多指标放大(原单一系数→字典)
"queue": 1.5, # 排队权重放大
"delay_wait": 1.2 # 延误-等待权重也放大
}
},
    # 2. Weather impact factors (unchanged)
"weather_impact": {
0: {"delay": 1.0, "waiting": 1.0, "queue": 1.0, "travel": 1.0, "coord": 1.0},
1: {"delay": 1.15, "waiting": 1.1, "queue": 1.15, "travel": 0.85, "coord": 1.15},
2: {"delay": 1.35, "waiting": 1.25, "queue": 1.35, "travel": 0.75, "coord": 1.35},
3: {"delay": 1.5, "waiting": 1.4, "queue": 1.5, "travel": 0.65, "coord": 1.5}
},
    # 3. Smoothing and normalization parameters (unchanged)
"smoothing": {
"alpha": 0.3,
"ema_decay": 0.8,
"sigmoid_sensitivity": 0.8
},
    # 4. Travel reward parameters (unchanged)
"travel_reward": {
"distance_scale": 100.0,
"base_entry_reward": 5.0,
"max_completed_reward": 5,
"count_bonus_max": 0.3,
"potential_value_coeff": 0.5
},
    # 5. Congestion detection and relief rewards (adds regional congestion parameters)
"congestion": {
"global_congest_threshold": 5.0,
"approach_congest_threshold": 0.5,
"queue_drop_steps": 2,
"congest_relief_bonus": 0.05,
"relief_rate_coeff": 0.03,
"region_congest_threshold": 0.5, # 【新增】区域拥堵阈值
"region_congest_penalty_scale": 1.5 # 【新增】区域拥堵时惩罚放大系数
},
    # 6. Regional coordination parameters (raised synergy bonus weight)
"coordination": {
"base_penalty_weight": 0.1,
"synergy_bonus_weight": 0.1 # 【优化】协同奖励权重从0.05→0.1
},
    # 7. Dynamic baseline parameters (unchanged)
"base_reward": {
"initial": 0.2,
"decay_steps": 1000,
"decay_max_ratio": 0.15
},
    # 8. Logging configuration (unchanged)
"log": {
"print_interval": 100
},
    # 9. Vehicle type weights (unchanged)
"vehicle_type_weight": {
1: 1.0,
2: 1.5,
3: 1.2,
4: 0.8,
5: 0.5,
"default": 1.0
},
    # 10. [New] accident delay penalty parameters
    "accident_delay": {
        "threshold": 15.0,  # delay threshold under accident conditions (seconds)
        "penalty": 0.05     # extra reward deducted when the threshold is exceeded
    }
}
}
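
# A minimal sanity-check sketch (assumption: each weight group is intended to
# sum to 1.0; check_weight_sums is a hypothetical helper, not part of the
# original code). "peak" does sum to 1.0, but "non_peak" sums to 1.05 after the
# coordination bump from 0.15 to 0.20; the run-time weight normalization inside
# reward_shaping below is what restores a comparable scale.
def check_weight_sums(config=DEFAULT_REWARD_HYPERPARAMS):
    for mode in ("peak", "non_peak"):
        total = sum(config["weight"][mode].values())
        print(f"{mode} weight sum: {total:.2f}")  # peak: 1.00, non_peak: 1.05
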
def calculate_travel_reward(junction_id, fp, vehicles, invalid_lanes, config):
cfg = config["travel_reward"]
total_distance = 0.0
total_potential_value = 0.0
valid_count = 0
completed_count = 0
rewarded_completed = set()
vehicle_dict = {v["v_id"]: v for v in vehicles}
current_v_ids = set(vehicle_dict.keys())
vehicle_type_weight = config["vehicle_type_weight"]
junction_metrics = fp.junction_metrics.get(junction_id, {})
completed_vehicles = junction_metrics.get("completed_vehicles", set())
for v_id, distance in fp.vehicle_distance_store.items():
if v_id not in current_v_ids or v_id not in vehicle_dict:
continue
vehicle = vehicle_dict[v_id]
if (vehicle.get("target_junction") == junction_id
and (on_enter_lane(vehicle, invalid_lanes) or in_junction(vehicle))
and v_id not in completed_vehicles):
if vehicle.get("lane") in invalid_lanes or fp.vehicle_status.get(v_id, 1) != 0:
continue
v_config_id = vehicle.get("v_config_id")
if not v_config_id:
weight = vehicle_type_weight["default"]
else:
vehicle_config = fp.vehicle_configs.get(v_config_id, {})
v_type = vehicle_config.get("v_type", "default")
weight = vehicle_type_weight.get(v_type, vehicle_type_weight["default"])
total_distance += (distance / cfg["distance_scale"]) * weight
valid_count += 1
            position = vehicle.get("position_in_lane", {})
            distance_to_stop = position.get("y", 50.0)
            # Potential shaping over a 50 m horizon: y = 0 m (at the stop line)
            # yields 0.5 with the default coeff; y >= 50 m yields 0.0.
            potential_value = max(0.0, (50.0 - distance_to_stop) / 50.0) * cfg["potential_value_coeff"]
            total_potential_value += potential_value * weight
        # Completion bonus: note this only fires while the completed vehicle is
        # still present in the current frame, because of the `continue` above.
        if v_id in completed_vehicles and v_id not in rewarded_completed:
            if completed_count < cfg["max_completed_reward"]:
                total_distance += cfg["base_entry_reward"] * 2
                completed_count += 1
                rewarded_completed.add(v_id)
total_reward_value = total_distance + total_potential_value
total_count = valid_count + completed_count
if total_count <= 0:
return 0.1
else:
avg_value = total_reward_value / total_count
count_bonus = min(cfg["count_bonus_max"], total_count * 0.01)
return np.tanh(avg_value + count_bonus)
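
# Illustrative numbers for the aggregation above (hypothetical values): with
# avg_value = 0.4 and total_count = 12, count_bonus = min(0.3, 0.12) = 0.12 and
# the result is tanh(0.52) ≈ 0.48. Because avg_value and count_bonus are both
# non-negative, the travel component always lies in [0, 1), with a 0.1 floor
# when no vehicles qualify: it is a pure reward, never a penalty.
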
def reward_shaping(_obs, _extra_info, act, agent, config=None):
cfg = config or DEFAULT_REWARD_HYPERPARAMS
fp = agent.preprocess
junction_ids = fp.get_sorted_junction_ids()
rewards = {j_id: {'total': 0.0, 'components': {}} for j_id in junction_ids}
frame_state = _obs.get("framestate", {})
vehicles = frame_state.get("vehicles", [])
fp.update_traffic_info(_obs, _extra_info)
all_junction_waiting = fp.get_all_junction_waiting_time(vehicles) if vehicles else {}
invalid_lanes = fp.get_invalid_lanes()
global_avg_queue = fp.get_all_avg_queue()
is_global_congested = global_avg_queue > cfg["congestion"]["global_congest_threshold"]
weather = fp.get_weather()
    weather_map = {0: "Sunny", 1: "Rain", 2: "Snow", 3: "Fog"}
    weather_name = weather_map.get(weather, "Unknown")
weather_factors = cfg["weather_impact"].get(weather, cfg["weather_impact"][0])
is_peak = fp.is_peak_hour()
weight_key = "peak" if is_peak else "non_peak"
if not hasattr(agent, 'prev_metrics'):
agent.prev_metrics = {}
for j_id in junction_ids:
capacity = fp.get_junction_capacity(j_id)
agent.prev_metrics[j_id] = {
"avg_delay_wait": 8.0 + capacity * 0.015,
"avg_queue": 2.0 + capacity * 0.002,
"travel_reward": 0.0,
"prev_queue": 0.0
}
if not hasattr(agent, 'ema_delay_wait'):
agent.ema_delay_wait = {j_id: 0.0 for j_id in junction_ids}
if not hasattr(agent, 'prev_approach_congestion'):
agent.prev_approach_congestion = {j_id: 0.0 for j_id in junction_ids}
if not hasattr(agent, 'prev_metrics_trend'):
agent.prev_metrics_trend = {
j_id: {"queue": []} for j_id in junction_ids
}
    def sigmoid_scale(x, sensitivity=None):
        # Maps x to (-1, 1) with sigmoid_scale(0) == 0. Use an `is None` check so
        # an explicit sensitivity of 0 is not silently replaced by the default.
        sens = sensitivity if sensitivity is not None else cfg["smoothing"]["sigmoid_sensitivity"]
        x_clamped = np.clip(x, -1000, 1000)
        return 2.0 / (1 + np.exp(-sens * x_clamped)) - 1.0
for j_id in junction_ids:
junction = fp.junction_dict.get(j_id, {})
if not junction:
rewards[j_id]['total'] = 0.0
continue
signal_id = junction.get("signal", "")
capacity = fp.get_junction_capacity(j_id)
region_id = fp.get_region(j_id)
region_id = region_id if region_id is not None else -1
enter_lanes = junction.get("cached_enter_lanes", [])
valid_enter_lanes = [lane for lane in enter_lanes if lane not in invalid_lanes]
is_accident = len(valid_enter_lanes) < len(enter_lanes)
        # 1. Combined delay-waiting reward
current_avg_delay = fp.get_junction_avg_delay(j_id)
current_avg_waiting = all_junction_waiting.get(j_id, 0.0)
current_avg_delay_wait = (current_avg_delay + current_avg_waiting) / 2
agent.ema_delay_wait[j_id] = (
cfg["smoothing"]["ema_decay"] * agent.ema_delay_wait[j_id] +
(1 - cfg["smoothing"]["ema_decay"]) * current_avg_delay_wait
)
current_smoothed = agent.ema_delay_wait[j_id]
prev_avg_delay_wait = agent.prev_metrics[j_id]["avg_delay_wait"]
delay_wait_delta = prev_avg_delay_wait - current_smoothed
delay_wait_change = delay_wait_delta / max(1.0, current_avg_delay_wait) * 10
delay_wait_reward = cfg["weight"][weight_key]["delay_wait"] * sigmoid_scale(delay_wait_change)
delay_wait_reward *= weather_factors["delay"]
rewards[j_id]['components']['delay_wait'] = delay_wait_reward
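        # Illustrative magnitude (hypothetical numbers): prev 10 s, smoothed 8 s,
        # raw 8 s -> change = 2 / 8 * 10 = 2.5, sigmoid_scale(2.5) ≈ 0.76, so the
        # peak-hour reward is about 0.45 * 0.76 ≈ 0.34 before weather scaling.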
        # 2. Queue reward (accident adaptation + relief-rate bonus)
current_avg_queue = fp.get_junction_avg_queue(j_id)
prev_avg_queue = agent.prev_metrics[j_id]["avg_queue"]
prev_prev_queue = agent.prev_metrics[j_id]["prev_queue"]
queue_delta = prev_avg_queue - current_avg_queue
normalized_queue = current_avg_queue / max(1.0, capacity)
queue_delta_normalized = queue_delta / max(1.0, capacity)
        queue_reward = (
            cfg["weight"][weight_key]["queue"] * sigmoid_scale(queue_delta_normalized * 5.0) +
            # NOTE: this static term is increasing in normalized_queue, so by
            # itself it pays more reward the longer the standing queue is.
            weather_factors["queue"] * sigmoid_scale(normalized_queue, sensitivity=2.0)
        )
agent.prev_metrics_trend[j_id]["queue"].append(current_avg_queue)
if len(agent.prev_metrics_trend[j_id]["queue"]) > cfg["congestion"]["queue_drop_steps"]:
agent.prev_metrics_trend[j_id]["queue"].pop(0)
if len(agent.prev_metrics_trend[j_id]["queue"]) == cfg["congestion"]["queue_drop_steps"]:
if all(
agent.prev_metrics_trend[j_id]["queue"][i] > agent.prev_metrics_trend[j_id]["queue"][i+1]
for i in range(cfg["congestion"]["queue_drop_steps"] - 1)
):
queue_reward += cfg["congestion"]["congest_relief_bonus"]
relief_rate = agent.prev_metrics_trend[j_id]["queue"][0] - current_avg_queue
rate_bonus = relief_rate * cfg["congestion"]["relief_rate_coeff"]
queue_reward += rate_bonus
        # Accident scenario: amplify both the queue and delay-waiting terms, plus a penalty when delay exceeds the threshold
if is_accident:
queue_reward *= cfg["weight"]["accident_adjust"].get("queue", 1.0)
delay_wait_reward *= cfg["weight"]["accident_adjust"].get("delay_wait", 1.0)
if current_avg_delay > cfg["accident_delay"]["threshold"]:
delay_wait_reward -= cfg["accident_delay"]["penalty"]
rewards[j_id]['components']['queue'] = queue_reward
        rewards[j_id]['components']['delay_wait'] = delay_wait_reward  # re-store delay_wait after the accident adjustment
        # 3. Approach-lane reward
phase_remaining = fp.get_phase_remaining_time(signal_id) if signal_id else 0
approach_queue = sum(len(fp.lane_volume.get(lane, [])) for lane in valid_enter_lanes)
total_possible_queue = sum(fp.get_lane_capacity(lane) for lane in valid_enter_lanes)
demand_ratio = approach_queue / (total_possible_queue + 1e-5) if total_possible_queue > 0 else 0.5
approach_reward = cfg["weight"][weight_key]["approach"] * (demand_ratio + 0.1) * max(0.0, 1 - phase_remaining / 5)
approach_reward *= 1.0 / weather_factors["delay"]
if hasattr(fp, 'lane_congestion'):
approach_congestion = [fp.lane_congestion.get(lane, 0.0) for lane in valid_enter_lanes]
avg_approach_congestion = np.mean(approach_congestion) if approach_congestion else 0.0
if (avg_approach_congestion < cfg["congestion"]["approach_congest_threshold"] and
agent.prev_approach_congestion[j_id] >= cfg["congestion"]["approach_congest_threshold"]):
approach_reward += cfg["congestion"]["congest_relief_bonus"]
agent.prev_approach_congestion[j_id] = avg_approach_congestion
rewards[j_id]['components']['approach'] = approach_reward
        # 4. Travel reward
travel_reward = calculate_travel_reward(
junction_id=j_id, fp=fp, vehicles=vehicles,
invalid_lanes=invalid_lanes, config=cfg
)
travel_reward *= weather_factors["travel"]
rewards[j_id]['components']['travel'] = cfg["weight"][weight_key]["travel"] * travel_reward
        # 5. Regional coordination reward (penalty amplified under regional congestion)
coordination_penalty = 0.0
if region_id != -1 and hasattr(fp, 'region_dict'):
region_dict = getattr(fp, 'region_dict', {})
region_junctions = region_dict.get(region_id, [])
if len(region_junctions) > 1:
region_avg_queue = fp.get_region_avg_queue(region_id)
queue_deviation = abs(current_avg_queue - region_avg_queue)
region_cap = fp.get_region_capacity(region_id)
region_congestion = region_avg_queue / max(1.0, region_cap) if region_cap > 0 else 0.0
                # Amplify the coordination penalty when the region itself is congested
                region_scale = (cfg["congestion"]["region_congest_penalty_scale"]
                                if region_congestion > cfg["congestion"]["region_congest_threshold"] else 1.0)
coordination_factor = (1.0 + 2.0 * min(1.0, region_congestion)) * weather_factors["coord"] * region_scale
coordination_penalty = -cfg["coordination"]["base_penalty_weight"] * coordination_factor * sigmoid_scale(queue_deviation, sensitivity=0.5)
queue_diffs = []
for j_near in region_junctions:
if j_near == j_id:
continue
near_queue = fp.get_junction_avg_queue(j_near)
queue_diffs.append(abs(current_avg_queue - near_queue))
            if queue_diffs:
                avg_queue_diff = np.mean(queue_diffs)
                # sigmoid_scale(-x) <= 0 for x >= 0, so this "synergy bonus" is at
                # most 0, approaching 0 only when neighboring queues match.
                coordination_penalty += cfg["coordination"]["synergy_bonus_weight"] * sigmoid_scale(-avg_queue_diff, sensitivity=0.1)
rewards[j_id]['components']['coordination'] = cfg["weight"][weight_key]["coordination"] * coordination_penalty
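        # Magnitude check (illustrative): coordination_factor tops out at
        # 3.0 * 1.5 * 1.5 = 6.75 (full congestion, fog, regional scaling), so the
        # penalty term lies in (-0.675, 0]; together with the non-positive synergy
        # term, this component can only ever be zero or negative.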
        # 6. Total reward: under accidents, amplify the queue and delay_wait
        # weights directly; the normalization below rescales the full set back to
        # a unit sum, shrinking the other weights proportionally. Note that the
        # queue and delay_wait component values were already multiplied by the
        # same factors above, so the accident emphasis acts on both the component
        # values and their weights.
        weight_set = cfg["weight"][weight_key].copy()
        if is_accident:
            for k in ("queue", "delay_wait"):
                weight_set[k] *= cfg["weight"]["accident_adjust"].get(k, 1.0)
weight_sum = sum(weight_set.values()) or 1.0
normalized_weights = {k: v / weight_sum for k, v in weight_set.items()}
        # Dynamic baseline: decays from 0.2 to 0.2 * (1 - 0.15) = 0.17 over decay_steps
        base_reward = cfg["base_reward"]["initial"]
        if hasattr(agent, 'train_step'):
            decay_ratio = min(agent.train_step / cfg["base_reward"]["decay_steps"], 1.0)
            base_reward -= cfg["base_reward"]["decay_max_ratio"] * cfg["base_reward"]["initial"] * decay_ratio
total_reward = base_reward + sum(
normalized_weights[k] * rewards[j_id]['components'][k]
for k in normalized_weights
)
rewards[j_id]['total'] = np.clip(total_reward, -1.0, 1.0)
        # 7. Update history caches
agent.prev_metrics[j_id] = {
"avg_delay_wait": cfg["smoothing"]["alpha"] * current_avg_delay_wait + (1 - cfg["smoothing"]["alpha"]) * prev_avg_delay_wait,
"avg_queue": cfg["smoothing"]["alpha"] * current_avg_queue + (1 - cfg["smoothing"]["alpha"]) * prev_avg_queue,
"travel_reward": cfg["smoothing"]["alpha"] * travel_reward + (1 - cfg["smoothing"]["alpha"]) * agent.prev_metrics[j_id]["travel_reward"],
"prev_queue": prev_avg_queue
}
if (hasattr(agent, 'train_step')
and agent.train_step % cfg["log"]["print_interval"] == 0
and hasattr(agent, 'logger')):
for j_id in junction_ids:
comp = rewards[j_id]['components']
region_id = fp.get_region(j_id)
region_id = region_id if region_id is not None else -1
region_cap = fp.get_region_capacity(region_id)
region_congestion = fp.get_region_avg_queue(region_id) / max(1.0, region_cap) if region_cap > 0 else 0.0
enter_lanes = fp.junction_dict.get(j_id, {}).get("cached_enter_lanes", [])
valid_enter_lanes = [lane for lane in enter_lanes if lane not in invalid_lanes]
is_accident = len(valid_enter_lanes) < len(enter_lanes)
agent.logger.info(
f"Step {agent.train_step} | Junc {j_id} (Region {region_id}) - "
f"DelayWait: {comp['delay_wait']:.2f}, Queue: {comp['queue']:.2f}, "
f"Approach: {comp['approach']:.2f}, Travel: {comp['travel']:.2f}, "
f"Coord: {comp['coordination']:.2f} | "
f"Congestion: {region_congestion:.2f}, Peak: {is_peak}, Weather: {weather_name} "
f"(Factors: D:{weather_factors['delay']}, Q:{weather_factors['queue']}) | "
f"Accident: {is_accident}, Total: {rewards[j_id]['total']:.2f}"
)
    return tuple(rewards[j_id]['total'] for j_id in junction_ids)

Is the balance between rewards and penalties in this reward function reasonable?
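
One way to probe the balance empirically is to evaluate individual terms in isolation. The minimal sketch below (an assumption-laden illustration, not part of the original code; it only needs numpy and the config above, and re-declares sigmoid_scale standalone) sweeps the static queue-occupancy term and shows that it is positive and increasing, i.e. on its own it adds reward as queues grow:

import numpy as np

def sigmoid_scale(x, sens):
    # Standalone copy of the helper nested inside reward_shaping.
    return 2.0 / (1 + np.exp(-sens * np.clip(x, -1000, 1000))) - 1.0

wq = DEFAULT_REWARD_HYPERPARAMS["weather_impact"][0]["queue"]  # clear weather: 1.0
for q in (0.0, 0.25, 0.5, 1.0):  # normalized queue occupancy
    print(f"occupancy {q:.2f} -> static queue term {wq * sigmoid_scale(q, 2.0):+.3f}")
# occupancy 0.00 -> +0.000, 0.50 -> +0.462, 1.00 -> +0.762

If the intent is to penalize standing queues, negating the argument (sigmoid_scale(-normalized_queue, 2.0)) would flip this into a penalty. Combined with the always-positive baseline and the non-negative travel component, the function as written carries a noticeable positive bias before the final clip to [-1, 1].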