# ==============================================================================
# 终极兼容版:上证指数HMM分析(适配HiddenMarkov所有版本,包括最旧版)
# 核心:放弃dthmm,直接使用initHMM+baumWelch+viterbi基础函数
# ==============================================================================
# 1. 安装加载包(强制安装稳定版)
install.packages("HiddenMarkov", version = "1.8", dependencies = TRUE)
install.packages(c("dplyr", "ggplot2", "lubridate", "gridExtra", "moments", "zoo"))
# 加载包
library(dplyr)
library(ggplot2)
library(lubridate)
library(gridExtra)
library(moments)
library(zoo)
library(HiddenMarkov)
# 2. 数据读取与预处理
sz_data <- read.csv("C:/Users/DELL/Desktop/概率模型论文 马尔科夫/sj.csv",
stringsAsFactors = FALSE, fileEncoding = "UTF-8")
# 计算对数收益率(确保存在)
if (!"Log_Return" %in% colnames(sz_data)) {
sz_data <- sz_data %>%
arrange(Trddt) %>%
mutate(Log_Return = log(Clsindex / lag(Clsindex)))
}
# 数据清洗
sz_data <- sz_data %>%
mutate(Trddt = as.Date(Trddt, format = "%Y/%m/%d")) %>%
arrange(Trddt) %>%
filter(!is.na(Log_Return))
# 核心变量(离散化处理,适配基础版HMM)
returns_pct <- sz_data$Log_Return * 100
# 收益率离散化(解决连续型分布兼容问题)
returns_discrete <- cut(returns_pct,
breaks = c(-Inf, -0.5, 0.5, Inf),
labels = c("下跌", "波动", "上涨"))
n_obs <- length(returns_discrete)
cat("有效观测数:", n_obs, "\n")
# 3. 描述性统计
desc_stats <- sz_data %>%
summarise(
总观测数 = n(),
日期范围 = paste(min(Trddt), "至", max(Trddt)),
收盘价均值 = round(mean(Clsindex), 2),
日收益率均值_百分比 = round(mean(Log_Return)*100, 4),
日收益率标准差_百分比 = round(sd(Log_Return)*100, 4),
年化收益率_百分比 = round(mean(Log_Return)*252*100, 4),
年化波动率_百分比 = round(sd(Log_Return)*sqrt(252)*100, 4)
)
cat("=== 描述性统计 ===\n")
print(desc_stats)
# 4. HMM模型构建(完全兼容所有版本的基础方法)
set.seed(1234)
n_states <- 3 # 三状态
# 4.1 初始化HMM(基础函数,无参数兼容问题)
# 定义隐藏状态和观测符号
hidden_states <- c("State1_波动", "State2_下跌", "State3_上涨")
observation_symbols <- c("下跌", "波动", "上涨")
# 初始化HMM(核心:使用initHMM,兼容所有版本)
hmm_init <- initHMM(
States = hidden_states,
Symbols = observation_symbols,
# 初始状态概率(均匀分布)
startProbs = rep(1/n_states, n_states),
# 初始转移矩阵
transProbs = matrix(
c(0.9, 0.05, 0.05,
0.1, 0.85, 0.05,
0.1, 0.05, 0.85),
nrow = n_states, byrow = TRUE,
dimnames = list(hidden_states, hidden_states)
),
# 初始发射概率
emissionProbs = matrix(
c(0.1, 0.8, 0.1, # 波动状态→观测下跌/波动/上涨
0.8, 0.15, 0.05, # 下跌状态→观测下跌/波动/上涨
0.05, 0.15, 0.8), # 上涨状态→观测下跌/波动/上涨
nrow = n_states, byrow = TRUE,
dimnames = list(hidden_states, observation_symbols)
)
)
# 4.2 Baum-Welch算法训练模型(核心兼容)
cat("\n=== 模型训练(Baum-Welch)===\n")
bw_result <- baumWelch(
hmm = hmm_init,
observation = returns_discrete,
maxIter = 200, # 迭代次数
delta = 1e-3 # 收敛阈值
)
# 输出训练后的参数
cat("1. 初始状态概率:\n")
print(round(bw_result$hmm$startProbs, 4))
cat("\n2. 状态转移矩阵:\n")
trans_matrix_final <- bw_result$hmm$transProbs
rownames(trans_matrix_final) <- colnames(trans_matrix_final) <- c("波动状态", "下跌状态", "上涨状态")
print(round(trans_matrix_final, 4))
cat("\n3. 发射概率矩阵:\n")
emission_matrix_final <- bw_result$hmm$emissionProbs
rownames(emission_matrix_final) <- c("波动状态", "下跌状态", "上涨状态")
print(round(emission_matrix_final, 4))
# 4.3 Viterbi算法识别隐藏状态(核心兼容)
cat("\n=== 状态识别(Viterbi)===\n")
viterbi_states <- viterbi(
hmm = bw_result$hmm,
observation = returns_discrete
)
# 状态标签映射
state_mapping <- c(
"State1_波动" = "波动状态",
"State2_下跌" = "下跌状态",
"State3_上涨" = "上涨状态"
)
sz_data <- sz_data %>%
mutate(
hmm_state_raw = viterbi_states,
hmm_state = recode(hmm_state_raw, !!!state_mapping),
obs_state = returns_discrete
)
# 5. 状态统计分析
state_summary <- sz_data %>%
group_by(hmm_state) %>%
summarise(
天数 = n(),
占比_百分比 = round(n()/nrow(sz_data)*100, 2),
平均收益率_百分比 = round(mean(Log_Return)*100, 4),
收益率标准差_百分比 = round(sd(Log_Return)*100, 4),
胜率_百分比 = round(sum(Log_Return>0)/n()*100, 2)
)
cat("=== 状态统计特征 ===\n")
print(state_summary)
# 6. 可视化(极简版,确保运行)
# 6.1 收盘价+状态图
p1 <- ggplot(sz_data, aes(x = Trddt, y = Clsindex)) +
geom_line(color = "black", linewidth = 1) +
geom_point(aes(color = hmm_state), alpha = 0.7, size = 1) +
scale_color_manual(values = c("波动状态" = "blue", "下跌状态" = "red", "上涨状态" = "orange")) +
labs(title = "上证指数收盘价与HMM状态", x = "日期", y = "收盘价", color = "市场状态") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
# 6.2 状态占比柱状图
p2 <- ggplot(state_summary, aes(x = hmm_state, y = 天数, fill = hmm_state)) +
geom_bar(stat = "identity", alpha = 0.8) +
geom_text(aes(label = paste0(占比_百分比, "%")), vjust = -0.3, size = 4) +
scale_fill_manual(values = c("波动状态" = "blue", "下跌状态" = "red", "上涨状态" = "orange")) +
labs(title = "各市场状态天数占比", x = "市场状态", y = "天数", fill = "市场状态") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
# 6.3 收益率分布(按状态)
p3 <- ggplot(sz_data, aes(x = Log_Return*100, fill = hmm_state)) +
geom_histogram(bins = 30, alpha = 0.6, position = "dodge") +
scale_fill_manual(values = c("波动状态" = "blue", "下跌状态" = "red", "上涨状态" = "orange")) +
labs(title = "各状态收益率分布", x = "收益率(%)", y = "频次", fill = "市场状态") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
# 6.4 状态转移矩阵热力图
trans_df <- as.data.frame(trans_matrix_final)
trans_df$从状态 <- rownames(trans_df)
trans_long <- tidyr::pivot_longer(trans_df, cols = -从状态, names_to = "到状态", values_to = "概率")
p4 <- ggplot(trans_long, aes(x = 到状态, y = 从状态, fill = 概率)) +
geom_tile(color = "white") +
geom_text(aes(label = round(概率, 4)), size = 4) +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = "状态转移概率矩阵", x = "转移到状态", y = "从状态转移") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
# 组合图表并保存
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)
ggsave("sz_index_hmm_ultimate.png", width = 16, height = 12, dpi = 300, bg = "white")
# 7. 结果保存
result_data <- sz_data %>%
select(Trddt, Clsindex, Log_Return, hmm_state, obs_state) %>%
rename(
日期 = Trddt,
收盘价 = Clsindex,
对数收益率 = Log_Return,
HMM识别状态 = hmm_state,
观测状态 = obs_state
)
write.csv(result_data, "sz_index_hmm_final_results.csv", row.names = FALSE, fileEncoding = "UTF-8")
# 保存模型
save(bw_result, file = "sz_hmm_final_model.RData")
cat("\n=== 分析完成 ===\n")
cat("生成文件:\n")
cat("1. sz_index_hmm_ultimate.png - 可视化结果\n")
cat("2. sz_index_hmm_final_results.csv - 状态数据\n")
cat("3. sz_hmm_final_model.RData - 模型对象\n")
代码报错