# 加载必要的包
library(mgcv)
library(readxl)
library(rms)
library(devEMF)
library(broom)
library(car)
library(ggplot2)
library(patchwork)
library(viridis)
# 设置输出目录
output_dir <- "F:/eddies_data/gam1"
# 如果目录不存在,则创建
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
cat("已创建输出目录:", output_dir, "\n")
}
# 读取数据
data <- read_excel("F:/eddies_data/fishery_eddies1.xlsx")
# 查看数据结构
str(data)
head(data)
# 检查数据中是否有缺失值
cat("数据缺失值统计:\n")
print(colSums(is.na(data)))
# 创建对数变换后的响应变量
data$log_cpue <- log(data$cpue + 1)
# 创建因子变量:正就是反气旋涡,负就是气旋涡
data$eddy_type_fac <- factor(data$eddy_type,
levels = c(-1, 1),
labels = c("气旋涡", "反气旋涡"))
# 查看数据分布
cat("气旋涡数据点数量:", sum(data$eddy_type == -1 & !is.na(data$eddy_type)), "\n")
cat("反气旋涡数据点数量:", sum(data$eddy_type == 1 & !is.na(data$eddy_type)), "\n")
# 使用交互效应模型
final_model <- gam(log_cpue ~
eddy_type_fac +
te(longitude, latitude) + # 经纬度交互项,不区分涡旋类型
s(relative_distance, by = eddy_type_fac) +
s(amplitude, by = eddy_type_fac) +
s(speed_average, by = eddy_type_fac) +
as.factor(year) + # 年份作为固定效应
as.factor(month), # 月份作为固定效应
# 查看模型摘要
cat("\n=== 交互效应模型摘要 ===\n")
print(summary(final_model))
# 创建分别显示每个涡旋类型偏效应图的函数,优化纵坐标范围
plot_effect_by_type_optimized <- function(model, variable, eddy_types = c("气旋涡", "反气旋涡")) {
# 创建一个新的数据框用于预测
plot_data <- expand.grid(
eddy_type_fac = factor(eddy_types, levels = c("气旋涡", "反气旋涡")),
stringsAsFactors = FALSE
)
# 为每个变量设置合理的范围
if (variable == "longitude") {
plot_data[[variable]] <- seq(min(data$longitude, na.rm = TRUE),
max(data$longitude, na.rm = TRUE), length = 100)
} else if (variable == "latitude") {
plot_data[[variable]] <- seq(min(data$latitude, na.rm = TRUE),
max(data$latitude, na.rm = TRUE), length = 100)
} else if (variable == "relative_distance") {
plot_data[[variable]] <- seq(min(data$relative_distance, na.rm = TRUE),
max(data$relative_distance, na.rm = TRUE), length = 100)
} else if (variable == "amplitude") {
plot_data[[variable]] <- seq(min(data$amplitude, na.rm = TRUE),
max(data$amplitude, na.rm = TRUE), length = 100)
} else if (variable == "speed_average") {
plot_data[[variable]] <- seq(min(data$speed_average, na.rm = TRUE),
max(data$speed_average, na.rm = TRUE), length = 100)
}
# 设置其他变量为均值
other_vars <- c("longitude", "latitude", "relative_distance", "amplitude", "speed_average")
other_vars <- setdiff(other_vars, variable)
for (var in other_vars) {
plot_data[[var]] <- mean(data[[var]], na.rm = TRUE)
}
# 设置年份和月份为最常见的水平
plot_data$year <- as.integer(names(sort(table(data$year), decreasing = TRUE)[1]))
plot_data$month <- as.integer(names(sort(table(data$month), decreasing = TRUE)[1]))
# 预测 - 使用terms获取偏效应
pred <- predict(model, newdata = plot_data, type = "terms", se.fit = TRUE)
# 提取对应变量的偏效应
term_names <- colnames(pred$fit)
var_terms <- grep(variable, term_names, value = TRUE)
# 为每个eddy_type提取对应的项
plot_data$fit <- NA
plot_data$se <- NA
for (i in 1:nrow(plot_data)) {
eddy_type <- as.character(plot_data$eddy_type_fac[i])
# 查找对应的项名称
target_term <- grep(paste0(variable, ".+", eddy_type), term_names, value = TRUE)
if (length(target_term) == 0) {
# 如果找不到,尝试其他匹配方式
target_term <- grep(paste0(variable, ".*", substr(eddy_type, 1, 2)), term_names, value = TRUE)
}
if (length(target_term) > 0) {
plot_data$fit[i] <- pred$fit[i, target_term[1]]
plot_data$se[i] <- pred$se.fit[i, target_term[1]]
}
}
plot_data$upper <- plot_data$fit + 1.96 * plot_data$se
plot_data$lower <- plot_data$fit - 1.96 * plot_data$se
# 计算纵坐标的合理范围,让曲线变化更明显
y_range <- range(plot_data$fit, na.rm = TRUE)
y_span <- diff(y_range)
# 如果变化范围太小,扩大纵坐标范围以显示变化
if (y_span < 0.1) {
y_center <- mean(y_range)
y_range <- c(y_center - 0.1, y_center + 0.1)
} else {
# 添加一些边距
y_range <- c(y_range[1] - 0.05 * y_span, y_range[2] + 0.05 * y_span)
}
# 绘制图表
p <- ggplot(plot_data, aes_string(x = variable, y = "fit",
color = "eddy_type_fac", fill = "eddy_type_fac")) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, color = NA) +
geom_line(size = 1) +
labs(title = paste0(variable, "对对数CPUE的偏效应"),
x = switch(variable,
"longitude" = "经度",
"latitude" = "纬度",
"relative_distance" = "相对距离",
"amplitude" = "振幅",
"speed_average" = "平均速度",
variable),
y = "对对数CPUE的影响",
color = "涡旋类型", fill = "涡旋类型") +
theme_minimal() +
theme(legend.position = "bottom") +
coord_cartesian(ylim = y_range) # 关键:限制纵坐标范围
return(p)
}
# 使用更简单但有效的方法 - 直接使用plot.gam但调整纵坐标
plot_smooth_effects <- function(model, data) {
# 获取所有平滑项
smooth_terms <- summary(model)$s.table
# 为每个变量创建偏效应图
variables <- c("relative_distance", "amplitude", "speed_average") # 移除了经纬度
for (var in variables) {
png(file.path(output_dir, paste0("optimized_", var, "_effect.png")),
width = 800, height = 600)
# 创建预测数据
new_data <- data.frame(eddy_type_fac = factor(rep(c("气旋涡", "反气旋涡"), each = 100)))
# 设置变量范围
var_range <- range(data[[var]], na.rm = TRUE)
new_data[[var]] <- rep(seq(var_range[1], var_range[2], length = 100), 2)
# 设置其他变量为均值
other_vars <- c("longitude", "latitude", "relative_distance", "amplitude", "speed_average")
other_vars <- setdiff(other_vars, var)
for (v in other_vars) {
new_data[[v]] <- mean(data[[v]], na.rm = TRUE)
}
# 设置年份和月份为最常见的水平
new_data$year <- as.integer(names(sort(table(data$year), decreasing = TRUE)[1]))
new_data$month <- as.integer(names(sort(table(data$month), decreasing = TRUE)[1]))
# 预测 - 使用terms获取偏效应
pred <- predict(model, newdata = new_data, type = "terms", se.fit = TRUE)
# 提取对应变量的偏效应
term_names <- colnames(pred$fit)
var_terms <- grep(var, term_names, value = TRUE)
# 为每个eddy_type提取对应的项
new_data$fit <- NA
new_data$se <- NA
for (i in 1:nrow(new_data)) {
eddy_type <- as.character(new_data$eddy_type_fac[i])
# 查找对应的项名称
target_term <- grep(paste0(var, ".+", eddy_type), term_names, value = TRUE)
if (length(target_term) == 0) {
# 如果找不到,尝试其他匹配方式
target_term <- grep(paste0(var, ".*", substr(eddy_type, 1, 2)), term_names, value = TRUE)
}
if (length(target_term) > 0) {
new_data$fit[i] <- pred$fit[i, target_term[1]]
new_data$se[i] <- pred$se.fit[i, target_term[1]]
}
}
new_data$upper <- new_data$fit + 1.96 * new_data$se
new_data$lower <- new_data$fit - 1.96 * new_data$se
# 计算纵坐标范围 - 聚焦于变化部分
y_range <- range(new_data$fit, na.rm = TRUE)
y_span <- diff(y_range)
# 如果变化太小,扩大范围
if (y_span < 0.5) {
y_center <- mean(y_range)
y_range <- c(y_center - 0.25, y_center + 0.25)
}
# 使用ggplot绘制偏效应图
p <- ggplot(new_data, aes_string(x = var, y = "fit",
color = "eddy_type_fac", fill = "eddy_type_fac")) +
geom_ribbon(aes(ymin = lower, ymax = upper),
alpha = 0.2, color = NA) +
geom_line(size = 1) +
labs(title = paste0("偏效应: ",
switch(var,
"longitude" = "经度",
"latitude" = "纬度",
"relative_distance" = "相对距离",
"amplitude" = "振幅",
"speed_average" = "平均速度",
var)),
x = switch(var,
"longitude" = "经度",
"latitude" = "纬度",
"relative_distance" = "相对距离",
"amplitude" = "振幅",
"speed_average" = "平均速度",
var),
y = "对对数CPUE的偏效应",
color = "涡旋类型", fill = "涡旋类型") +
theme_minimal() +
theme(legend.position = "bottom") +
coord_cartesian(ylim = y_range) # 关键:聚焦于变化部分
print(p)
dev.off()
}
}
# 修改后的经纬度交互偏效应图
plot_spatial_partial_effect <- function(model, data) {
# 创建经纬度网格
lon_seq <- seq(min(data$longitude, na.rm = TRUE),
max(data$longitude, na.rm = TRUE), length = 50)
lat_seq <- seq(min(data$latitude, na.rm = TRUE),
max(data$latitude, na.rm = TRUE), length = 50)
# 创建预测数据 - 不再区分涡旋类型
grid_data <- expand.grid(
longitude = lon_seq,
latitude = lat_seq
)
# 设置其他变量为均值
grid_data$relative_distance <- mean(data$relative_distance, na.rm = TRUE)
grid_data$amplitude <- mean(data$amplitude, na.rm = TRUE)
grid_data$speed_average <- mean(data$speed_average, na.rm = TRUE)
grid_data$year <- as.integer(names(sort(table(data$year), decreasing = TRUE)[1]))
grid_data$month <- as.integer(names(sort(table(data$month), decreasing = TRUE)[1]))
# 设置涡旋类型为最常见的水平
grid_data$eddy_type_fac <- factor(
names(sort(table(data$eddy_type_fac), decreasing = TRUE)[1]),
levels = c("气旋涡", "反气旋涡")
)
# 预测 - 使用terms获取偏效应
pred <- predict(model, newdata = grid_data, type = "terms", se.fit = TRUE)
# 提取经纬度交互项的偏效应
term_names <- colnames(pred$fit)
spatial_term <- grep("te\\(longitude,latitude\\)", term_names, value = TRUE)
if (length(spatial_term) > 0) {
grid_data$partial_effect <- pred$fit[, spatial_term[1]]
# 绘制偏效应图
p <- ggplot(grid_data, aes(x = longitude, y = latitude, fill = partial_effect)) +
geom_tile() +
scale_fill_viridis_c(name = "偏效应") +
labs(title = "经纬度交互项对对数CPUE的偏效应",
x = "经度", y = "纬度") +
theme_minimal() +
theme(legend.position = "bottom")
ggsave(file.path(output_dir, "spatial_partial_effect.png"), p, width = 10, height = 8)
return(p)
} else {
cat("警告:未找到经纬度交互项\n")
return(NULL)
}
}
# 使用第二种方法绘制优化后的偏效应图
plot_smooth_effects(final_model, data[!is.na(data$eddy_type), ])
# 绘制经纬度交互偏效应图
plot_spatial_partial_effect(final_model, data[!is.na(data$eddy_type), ])
# 创建组合对比图
create_comparison_plots <- function(model, data) {
combined_data <- data[!is.na(data$eddy_type), ]
# 相对距离对比
p1 <- ggplot(combined_data, aes(x = relative_distance, y = log_cpue, color = eddy_type_fac)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "gam", formula = y ~ s(x)) +
labs(title = "相对距离 vs 对数CPUE", x = "相对距离", y = "对数CPUE") +
theme_minimal() +
theme(legend.position = "none")
# 振幅对比
p2 <- ggplot(combined_data, aes(x = amplitude, y = log_cpue, color = eddy_type_fac)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "gam", formula = y ~ s(x)) +
labs(title = "振幅 vs 对数CPUE", x = "振幅", y = "对数CPUE") +
theme_minimal() +
theme(legend.position = "none")
# 速度对比
p3 <- ggplot(combined_data, aes(x = speed_average, y = log_cpue, color = eddy_type_fac)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "gam", formula = y ~ s(x)) +
labs(title = "平均速度 vs 对数CPUE", x = "平均速度", y = "对数CPUE") +
theme_minimal() +
theme(legend.position = "none")
# 获取图例
legend_plot <- ggplot(combined_data, aes(x = relative_distance, y = log_cpue, color = eddy_type_fac)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "gam", formula = y ~ s(x)) +
labs(color = "涡旋类型") +
theme_minimal() +
theme(legend.position = "bottom")
legend <- cowplot::get_legend(legend_plot)
# 组合图表
top_row <- p1 + p2
bottom_row <- p3 + plot_spacer() # 占位符保持对齐
combined <- top_row / bottom_row / legend +
plot_layout(heights = c(2, 2, 0.5))
ggsave(file.path(output_dir, "comparison_scatter_plots.png"), combined, width = 12, height = 10)
}
# 创建对比散点图
create_comparison_plots(final_model, data)
# 模型诊断
cat("\n=== 模型诊断 ===\n")
png(file.path(output_dir, "model_diagnostics.png"), width = 800, height = 600)
par(mfrow = c(2, 2))
gam.check(final_model)
dev.off()
# 保存模型结果
sink(file.path(output_dir, "model_summary.txt"))
cat("GAM模型分析结果\n")
cat("===============\n\n")
print(summary(final_model))
sink()
cat("\n分析完成!优化后的图表已保存到:", output_dir, "\n") 帮我分析一下这个代码正确吗