# 载入包
if (TRUE){
library(survival)
library(survminer)
library(randomForestSRC)
library(timeROC)
library(data.table)
library(tidyr)
library(dplyr)
library(tibble)
library(caret)
}
# 开始使用临床血液数据吧
# load data
options(stringsAsFactors = F)
rm(list = ls())
pwd = "E:\\process\\wangtao/"
save_pwd = "E:\\process\\wangtao\\picture\\"
setwd(pwd)
library(readr)
library(dplyr)
library(readxl)
# 复发,定义五年复发率状态
data_exp = read.csv("data_excel.csv")
data1 = read_xls("raw_data/脑转移.xls" ,sheet = 1)
data = read_xls("raw_data/脑转移-self.xls" ,sheet = 6)
data$age = data1$`BC确诊年龄(岁)`[match(data$ID号,data1$ID号)]
data = apply(data,2,as.numeric) %>% as.data.frame()
data = na.omit(data)
apply(data[,3:dim(data)[2]],2,table)
# 样本定义
## 定义术后都接受了治疗,剔除绝经为0.5的,剔除
data$therapy = ifelse(data$`既往治疗(辅助)化疗
1-是
2-否` ==1 | data$`既往治疗(辅助)内分泌
1-是
2-否`==1 |data$`既往治疗(辅助)放疗
1-是
2-否`==1,1,0)
data = subset(data,(therapy == 1))
# 血液数据,data_exp time = time - quyang
data_exp = subset(data_exp,id%in%data$ID号)
data_exp$quezhen_quyang = round(data_exp$quezhen_quyang/30,2)
data_exp$fufa_quyang = round(data_exp$fufa_quyang/30,2)
data_exp$zhuanyi_quyang = round(data_exp$zhuanyi_quyang/30,2)
# 定义复发状态
data_exp$quezhen_quyang[data_exp$quezhen_quyang %>% unique() > 0 ]#
data$status = ifelse(data$`DFS(月)`<60,1,0)
colnames(data) = c("ID","DFS","juejing","T","N","M",
"stage","ER","PR","HER2","SUBTYPE","hualiao",
"neifenxi","fangliao","age","therapy","status")
#################################################
###############随机森林生存树####################
#################################################
if(T){
# 分割数据集
set.seed(3)
size <- createDataPartition(data$status,p=0.8,list = F) # 随机选择80%的数据作为Train data
train <- data[size,]
validation <- data[-size,]
## 训练模型
##构建随机生存森林
if (T){
attach(data)
rsf_t <- rfsrc(Surv(DFS,status)~.,data = train[,c("juejing","T","N","M",
"stage","SUBTYPE","DFS","status",
"age")],
ntree = 11,nodesize = 30,##该值建议多调整
splitrule = 'logrank',
importance = TRUE,
proximity = TRUE,
forest = TRUE,
seed = 8)
rsf_t
plot(rsf_t)
score_t <- data.frame(train[,c(1,2,17)],Score=rsf_t$predicted)
cut <- surv_cutpoint(score_t,'DFS','status','Score')
cut
plot(cut)
## 生存分析
cat <- surv_categorize(cut)
fit <- survfit(Surv(DFS,status)~Score,cat)
mytheme <- theme_survminer(font.legend = c(14,"plain", "black"),
font.x = c(14,"plain", "black"),
font.y = c(14,"plain", "black")) ## 自定义主题
ggsurvplot(fit,cat,
palette = 'jco',
pval=TRUE,
size=1.3,
legend.labs=c("High","Low"),
legend.title='Score',
xlab="Time(months)",
ylab='Disease free survival',
ggtheme = mytheme
)
ggsave(filename = paste0(save_pwd,"train.png"))
## 验证集
rsf_v<- predict(rsf_t,newdata = validation[,c("juejing","T","N","M",
"stage","SUBTYPE","DFS","status",
"age")])
score_v <- data.frame(validation[,c(1,2,17)],Score=rsf_v$predicted)
score_v$Group <- ifelse(score_v$Score>cut$cutpoint$cutpoint,'High','Low')
fit <- survfit(Surv(DFS,status)~Group,score_v)
ggsurvplot(fit,
palette = 'jco',
size=1.3,
pval=TRUE,
legend.labs=c("High","Low"),
legend.title='Score',
xlab="Time(months)",
ylab='Disease free survival',
ggtheme = mytheme)
fit$n
ggsave(filename = paste0(save_pwd,"validate.png"))
}
## 比较训练验证集在血液数据上的差异
data$Score = predict(rsf_t,newdata = data[,c("juejing","T","N","M",
"stage","SUBTYPE","DFS","status",
"age")])$predicted
data$Group <- ifelse(data$Score>cut$cutpoint$cutpoint,'High','Low')
##对血液数据进行预处理
data_exp$quezhen_quyang[data_exp$quezhen_quyang %>% unique() > 0 ]#没有确诊前的数据
unique_month = data_exp$quezhen_quyang %>% unique()
hist(unique_month)
hist(data_exp$quezhen_quyang)
##这两张图不管是从有没有去重复的来看,应该选择的项目时间点都应该在150天过后
data_exp = subset(data_exp,quezhen_quyang>(-150))
unique_month = data_exp$quezhen_quyang %>% unique()
hist(unique_month)
hist(data_exp$quezhen_quyang)
# 构造分级计算样本数
unique_month = data_exp$quezhen_quyang %>% unique()
##按12个月分组吧
group = abs(data_exp$quezhen_quyang /12) %>% ceiling()
###人数
a = tapply(data_exp$id,group,unique)
sample_hist = sapply(a,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = 1: length(sample_hist)),
count = sample_hist %>% as.numeric() )
library(ggplot2)
ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+5), position=position_dodge(0.9))
###项目数
b = tapply(data_exp$quezhen_quyang,group,unique)
xiangmu_hist = sapply(b,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = 1: length(sample_hist)),
count=xiangmu_hist %>% as.numeric() )
ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+5), position=position_dodge(0.9))
}
#################################################
###############三年五年AUC值####################
#################################################
if(T){
# 全部数据集
rsf_v<- predict(rsf_t,newdata = data[,c("juejing","T","N","M",
"stage","SUBTYPE","DFS","status",
"age")])
score_v <- data.frame(data[,c(1,2,17)],Score=rsf_v$predicted)
result <-timeROC(score_v$DFS,score_v$status,score_v$Score,
cause = 1,weighting = 'marginal',
times = c(12,36,60),ROC = TRUE, iid = TRUE)
dat = data.frame(fpr = as.numeric(result$FP),
tpr = as.numeric(result$TP),
time = rep(as.factor(c(365,1095,1825)),each = nrow(result$TP)))
library(ggplot2)
ggplot() +
geom_line(data = dat,aes(x = fpr, y = tpr,color = time),size = 1) +
scale_color_manual(name = NULL,values = c("#92C5DE", "#F4A582", "#66C2A5"),
labels = paste0("AUC of ",c(1,3,5),"-years survival: ",
format(round(result$AUC,2),nsmall = 2)))+
geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
theme_bw()+
theme(panel.grid = element_blank(),
legend.background = element_rect(linetype = 1, size = 0.2, colour = "black"),
legend.position = c(0.765,0.125))+
scale_x_continuous(expand = c(0.005,0.005))+
scale_y_continuous(expand = c(0.005,0.005))+
labs(x = "1 - Specificity",
y = "Sensitivity")+
coord_fixed()
ggsave(filename = paste0(save_pwd,"auc.png"),width = 6,height = 6)# 直接保存画板好像有问题,因为画板太小了,所以需要将照片设置的大一点
}
#################################################
###########治疗方式对复发高低风险的影响##########
#################################################
library(ezcox)
temp = subset(data,(hualiao%in%c(1,2))&(neifenxi%in%c(1,2))&fangliao%in%c(1,2))
res = ezcox(temp, covariates = c("fangliao", "hualiao", "neifenxi"),
time = "DFS", status = "status")
# 其中内分泌与复发相关,那么所谓的分组对于治疗的影响并不很明显
library(forestmodel)
res = ezcox(data, covariates = c("juejing","T","N","M",
"stage","SUBTYPE","DFS","status",
"age","Group"),
time = "DFS", status = "status",return_models = TRUE)
forest_model(res$models$model)
mySurv=with(temp,Surv(DFS,status))
# 经过校正后也是如此
cox1 <- coxph(mySurv~juejing + age + T + N + stage + hualiao + fangliao +neifenxi ,
data = temp)
cox1 %>% summary()
data$Score = predict(rsf_t,newdata = data[,c("juejing","T","N","M",
"stage","SUBTYPE","DFS","status",
"age")])$predicted
data$Group <- ifelse(data$Score>cut$cutpoint$cutpoint,'High','Low')
if (T){
# temp = subset(data,(neifenxi==2)&(fangliao==2))
# temp = subset(data,(hualiao==2)&(fangliao==2))
# temp = subset(data,(hualiao==2)&(neifenxi==2))
# 前面的条件过于苛刻,所以卡下来后只有一种样本
high_risk = subset(temp,Group=="High")
ggsurvplot(survfit(Surv(DFS,status)~fangliao,high_risk),
palette = 'jco',
size=1.3,
pval=TRUE,
legend.labs=c("radiation therapy","non-radiation therapy"),
legend.title='High-risk',
xlab="Time(months)",
ylab='Disease free survival',
ggtheme = mytheme)
ggsave(filename = paste0(save_pwd,"high_risk_rt.png"))
ggsurvplot(survfit(Surv(DFS,status)~hualiao,high_risk),
palette = 'jco',
size=1.3,
pval=TRUE,
legend.labs=c("chemotherapy","non-chemotherapy"),
legend.title='High-risk',
xlab="Time(months)",
ylab='Disease free survival',
ggtheme = mytheme)
ggsave(filename = paste0(save_pwd,"high_risk_ct.png"))
ggsurvplot(survfit(Surv(DFS,status)~neifenxi,high_risk),
palette = 'jco',
size=1.3,
pval=TRUE,
legend.labs=c("endocrine therapy","non-endocrine therapy"),
legend.title='High-risk',
xlab="Time(months)",
ylab='Disease free survival',
ggtheme = mytheme)
ggsave(filename = paste0(save_pwd,"high_risk_et.png"))
low_risk = subset(temp,Group=="Low")
ggsurvplot(survfit(Surv(DFS,status)~fangliao,low_risk),
palette = 'jco',
size=1.3,
pval=TRUE,
legend.labs=c("radiation therapy","non-radiation therapy"),
legend.title='Low-risk',
xlab="Time(months)",
ylab='Disease free survival',
ggtheme = mytheme)
ggsave(filename = paste0(save_pwd,"low_risk_rt.png"))
ggsurvplot(survfit(Surv(DFS,status)~hualiao,low_risk),
palette = 'jco',
size=1.3,
pval=TRUE,
legend.labs=c("chemotherapy","non-chemotherapy"),
legend.title='Low-risk',
xlab="Time(months)",
ylab='Disease free survival',
ggtheme = mytheme)
ggsave(filename = paste0(save_pwd,"low_risk_ct.png"))
ggsurvplot(survfit(Surv(DFS,status)~neifenxi,low_risk),
palette = 'jco',
size=1.3,
pval=TRUE,
legend.labs=c("endocrine therapy","non-endocrine therapy"),
legend.title='Low-risk',
xlab="Time(months)",
ylab='Disease free survival',
ggtheme = mytheme)
ggsave(filename = paste0(save_pwd,"low_risk_nt.png"))
}
#################################################
###############检验差异数据######################
#################################################
# 第一年
years = 1
if (T){
data_jianyan = data_exp[which(group==years),]
data_temp = subset(data,data$ID%in%data_jianyan$id,select = c("ID","Group"))
table(data_temp$Group)
group_years = abs(data_jianyan$quezhen_quyang) %>% ceiling()
a = tapply(data_jianyan$id,group_years,unique)
sample_hist = sapply(a,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = 1: length(sample_hist)),
count=sample_hist %>% as.numeric() )
library(ggplot2)
a=ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+1), position=position_dodge(0.9))
print(a)
b = tapply(data_jianyan$quezhen_quyang,group_years,unique)
xiangmu_hist = sapply(b,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = 1: length(sample_hist)),
count=xiangmu_hist %>% as.numeric() )
a=ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+1), position=position_dodge(0.9))
print(a)
#data_jianyan1 = data_jianyan
if (T){
#data_jianyan = data_jianyan1
data_jianyan$Group = data$Group[match(data_jianyan$id,data$ID)]
## 查看哪些项目比较多的人数
xiangmu = tapply(data_jianyan$id,data_jianyan$xiangmu,function(x){
length(unique(x))
}) %>% as.numeric()
con_xiangmu = names(tapply(data_jianyan$id,data_jianyan$xiangmu,function(x){
length(unique(x))
}))[ which(xiangmu > 35)]
data_jianyan = subset(data_jianyan,xiangmu%in%con_xiangmu)
## 提取一个样本有多个时间点时取最早时间点
data_jianyan = subset(data_jianyan,quezhen_quyang<0)
days1 = tapply(data_jianyan$quezhen_quyang,data_jianyan$id,max)%>% as.numeric()
names1 = names(tapply(data_jianyan$quezhen_quyang,data_jianyan$id,max))
temp1 = cbind(names1,days1) %>% apply(.,1,function(x)
{subset(data_jianyan,(id==x[1])&(quezhen_quyang==x[2]))})
data_jianyan = do.call(rbind,temp1)
#批量循环,每个组别找高低风险对比
for (i in con_xiangmu)
{
#i = con_xiangmu[1]
temp = subset(data_jianyan,xiangmu%in%i)
a = temp[temp$Group=="High",4] %>% as.numeric() %>% na.omit()
b = temp[temp$Group=="Low",4] %>% as.numeric() %>% na.omit()
var_identical = data.frame(value = c(a,b),
group = c(rep(1,length(a)),rep(2,length(b))))
result = ifelse(length(a)>3&length(b)>3&!identical(a,b)&length(unique(a))>1&length(unique(b))>1,
#ifelse(shapiro.test(a)$p.value>0.05&shapiro.test(b)$p.value>0.05,
#t.test(a,b,paired = FALSE)$p.value,
wilcox.test(a,b,paired = FALSE)$p.value,1)
if(result<0.05){
print(i)
d= ggplot(data= data.frame(value = c(a,b),
group = c(rep("High",length(a)),rep("Low",length(b)))),
aes(group,value,fill = group))+
geom_boxplot()+
ggsignif::geom_signif(comparisons = list(c("High","Low"))) +
labs(title="New plot title")+ggtitle(i)
print(d)
}
}
}
}
# 第三年
years = 3
if (T){
data_jianyan = data_exp[which(group==years),]
data_temp = subset(data,data$ID%in%data_jianyan$id,select = c("ID","Group"))
table(data_temp$Group)
group_years = abs(data_jianyan$quezhen_quyang) %>% ceiling()
a = tapply(data_jianyan$id,group_years,unique)
sample_hist = sapply(a,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = names(sample_hist) %>% as.numeric()),
count=sample_hist %>% as.numeric() )
library(ggplot2)
a = ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+1), position=position_dodge(0.9))
print(a)
b = tapply(data_jianyan$quezhen_quyang,group_years,unique)
xiangmu_hist = sapply(b,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = names(sample_hist) %>% as.numeric()),
count=xiangmu_hist %>% as.numeric() )
b = ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+1), position=position_dodge(0.9))
print(b)
#data_jianyan1 = data_jianyan
if (T){
#data_jianyan = data_jianyan1
data_jianyan$Group = data$Group[match(data_jianyan$id,data$ID)]
## 查看哪些项目比较多的人数
xiangmu = tapply(data_jianyan$id,data_jianyan$xiangmu,function(x){
length(unique(x))
}) %>% as.numeric()
con_xiangmu = names(tapply(data_jianyan$id,data_jianyan$xiangmu,function(x){
length(unique(x))
}))[ which(xiangmu > 35)]
data_jianyan = subset(data_jianyan,xiangmu%in%con_xiangmu)
## 提取一个样本有多个时间点时取最早时间点
data_jianyan = subset(data_jianyan,quezhen_quyang<0)
days1 = tapply(data_jianyan$quezhen_quyang,data_jianyan$id,max)%>% as.numeric()
names1 = names(tapply(data_jianyan$quezhen_quyang,data_jianyan$id,max))
temp1 = cbind(names1,days1) %>% apply(.,1,function(x)
{subset(data_jianyan,(id==x[1])&(quezhen_quyang==x[2]))})
data_jianyan = do.call(rbind,temp1)
#批量循环,每个组别找高低风险对比
result_static = matrix(NA,nrow=length(con_xiangmu),ncol=4)
k = 1
for (i in con_xiangmu)
{
#i = con_xiangmu[1]
temp = subset(data_jianyan,xiangmu%in%i)
a = temp[temp$Group=="High",4] %>% as.numeric() %>% na.omit()
b = temp[temp$Group=="Low",4] %>% as.numeric() %>% na.omit()
var_identical = data.frame(value = c(a,b),
group = c(rep(1,length(a)),rep(2,length(b))))
result = ifelse(length(a)>3&length(b)>3&!identical(a,b)&length(unique(a))>1&length(unique(b))>1,
#ifelse(shapiro.test(a)$p.value>0.05&shapiro.test(b)$p.value>0.05,
#t.test(a,b,paired = FALSE)$p.value,
wilcox.test(a,b,paired = FALSE)$p.value,1)
if(result<0.05){
print(i)
d= ggplot(data= data.frame(value = c(a,b),
group = c(rep("High",length(a)),rep("Low",length(b)))),
aes(group,value,fill = group))+
geom_boxplot()+
ggsignif::geom_signif(comparisons = list(c("High","Low"))) +
labs(title="New plot title")+ggtitle(i)
print(d)
result_static[k,1] = i
result_static[k,2] = median(a)
result_static[k,3] = median(b)
result_static[k,4] = result
k = k+1
}
}
}
}
result_static = na.omit(result_static)
which(result_static[,2]>result_static[,3]) %>% result_static[.,1]
which(result_static[,2]<result_static[,3]) %>% result_static[.,1]
# 第五年
years = 5
if (T){
data_jianyan = data_exp[which(group==years),]
data_temp = subset(data,data$ID%in%data_jianyan$id,select = c("ID","Group"))
table(data_temp$Group)
group_years = abs(data_jianyan$quezhen_quyang) %>% ceiling()
a = tapply(data_jianyan$id,group_years,unique)
sample_hist = sapply(a,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = names(sample_hist) %>% as.numeric()),
count=sample_hist %>% as.numeric() )
library(ggplot2)
a = ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+1), position=position_dodge(0.9))
print(a)
b = tapply(data_jianyan$quezhen_quyang,group_years,unique)
xiangmu_hist = sapply(b,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = names(sample_hist) %>% as.numeric()),
count=xiangmu_hist %>% as.numeric() )
b = ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+1), position=position_dodge(0.9))
print(b)
#data_jianyan1 = data_jianyan
if (T){
#data_jianyan = data_jianyan1
data_jianyan$Group = data$Group[match(data_jianyan$id,data$ID)]
## 查看哪些项目比较多的人数
xiangmu = tapply(data_jianyan$id,data_jianyan$xiangmu,function(x){
length(unique(x))
}) %>% as.numeric()
con_xiangmu = names(tapply(data_jianyan$id,data_jianyan$xiangmu,function(x){
length(unique(x))
}))[ which(xiangmu > 35)]
data_jianyan = subset(data_jianyan,xiangmu%in%con_xiangmu)
## 提取一个样本有多个时间点时取最早时间点
data_jianyan = subset(data_jianyan,quezhen_quyang<0)
days1 = tapply(data_jianyan$quezhen_quyang,data_jianyan$id,max)%>% as.numeric()
names1 = names(tapply(data_jianyan$quezhen_quyang,data_jianyan$id,max))
temp1 = cbind(names1,days1) %>% apply(.,1,function(x)
{subset(data_jianyan,(id==x[1])&(quezhen_quyang==x[2]))})
data_jianyan = do.call(rbind,temp1)
#批量循环,每个组别找高低风险对比
result_static = matrix(NA,nrow=length(con_xiangmu),ncol=4)
k = 1
for (i in con_xiangmu)
{
#i = con_xiangmu[1]
temp = subset(data_jianyan,xiangmu%in%i)
a = temp[temp$Group=="High",4] %>% as.numeric() %>% na.omit()
b = temp[temp$Group=="Low",4] %>% as.numeric() %>% na.omit()
var_identical = data.frame(value = c(a,b),
group = c(rep(1,length(a)),rep(2,length(b))))
result = ifelse(length(a)>3&length(b)>3&!identical(a,b)&length(unique(a))>1&length(unique(b))>1,
#ifelse(shapiro.test(a)$p.value>0.05&shapiro.test(b)$p.value>0.05,
#t.test(a,b,paired = FALSE)$p.value,
wilcox.test(a,b,paired = FALSE)$p.value,1)
if(result<0.05){
print(i)
d= ggplot(data= data.frame(value = c(a,b),
group = c(rep("High",length(a)),rep("Low",length(b)))),
aes(group,value,fill = group))+
geom_boxplot()+
ggsignif::geom_signif(comparisons = list(c("High","Low"))) +
labs(title="New plot title")+ggtitle(i)
print(d)
result_static[k,1] = i
result_static[k,2] = median(a)
result_static[k,3] = median(b)
result_static[k,4] = result
k = k+1
}
}
}
}
result_static = na.omit(result_static)
which(result_static[,2]>result_static[,3]) %>% result_static[.,1]
which(result_static[,2]<result_static[,3]) %>% result_static[.,1]
# 第7年
years = 7
if (T){
data_jianyan = data_exp[which(group==years),]
data_temp = subset(data,data$ID%in%data_jianyan$id,select = c("ID","Group"))
table(data_temp$Group)
group_years = abs(data_jianyan$quezhen_quyang) %>% ceiling()
a = tapply(data_jianyan$id,group_years,unique)
sample_hist = sapply(a,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = names(sample_hist) %>% as.numeric()),
count=sample_hist %>% as.numeric() )
library(ggplot2)
a = ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+1), position=position_dodge(0.9))
print(a)
b = tapply(data_jianyan$quezhen_quyang,group_years,unique)
xiangmu_hist = sapply(b,length)
data_plot = data.frame(month = names(sample_hist) %>% factor(,levels = names(sample_hist) %>% as.numeric()),
count=xiangmu_hist %>% as.numeric() )
b = ggplot(data = data_plot,aes(x=month,y=count))+
geom_bar(stat="identity") +
geom_text(aes(label=count, y=count+1), position=position_dodge(0.9))
print(b)
#data_jianyan1 = data_jianyan
if (T){
#data_jianyan = data_jianyan1
data_jianyan$Group = data$Group[match(data_jianyan$id,data$ID)]
## 查看哪些项目比较多的人数
xiangmu = tapply(data_jianyan$id,data_jianyan$xiangmu,function(x){
length(unique(x))
}) %>% as.numeric()
con_xiangmu = names(tapply(data_jianyan$id,data_jianyan$xiangmu,function(x){
length(unique(x))
}))[ which(xiangmu > 35)]
data_jianyan = subset(data_jianyan,xiangmu%in%con_xiangmu)
## 提取一个样本有多个时间点时取最早时间点
data_jianyan = subset(data_jianyan,quezhen_quyang<0)
days1 = tapply(data_jianyan$quezhen_quyang,data_jianyan$id,max)%>% as.numeric()
names1 = names(tapply(data_jianyan$quezhen_quyang,data_jianyan$id,max))
temp1 = cbind(names1,days1) %>% apply(.,1,function(x)
{subset(data_jianyan,(id==x[1])&(quezhen_quyang==x[2]))})
data_jianyan = do.call(rbind,temp1)
#批量循环,每个组别找高低风险对比
result_static = matrix(NA,nrow=length(con_xiangmu),ncol=4)
k = 1
for (i in con_xiangmu)
{
#i = con_xiangmu[1]
temp = subset(data_jianyan,xiangmu%in%i)
a = temp[temp$Group=="High",4] %>% as.numeric() %>% na.omit()
b = temp[temp$Group=="Low",4] %>% as.numeric() %>% na.omit()
var_identical = data.frame(value = c(a,b),
group = c(rep(1,length(a)),rep(2,length(b))))
result = ifelse(length(a)>3&length(b)>3&!identical(a,b)&length(unique(a))>1&length(unique(b))>1,
#ifelse(shapiro.test(a)$p.value>0.05&shapiro.test(b)$p.value>0.05,
#t.test(a,b,paired = FALSE)$p.value,
wilcox.test(a,b,paired = FALSE)$p.value,1)
if(result<0.05){
print(i)
d= ggplot(data= data.frame(value = c(a,b),
group = c(rep("High",length(a)),rep("Low",length(b)))),
aes(group,value,fill = group))+
geom_boxplot()+
ggsignif::geom_signif(comparisons = list(c("High","Low"))) +
labs(title="New plot title")+ggtitle(i)
print(d)
result_static[k,1] = i
result_static[k,2] = median(a)
result_static[k,3] = median(b)
result_static[k,4] = result
k = k+1
}
}
}
}
result_static = na.omit(result_static)
which(result_static[,2]>result_static[,3]) %>% result_static[.,1]
which(result_static[,2]<result_static[,3]) %>% result_static[.,1]
#################################################
###############森林图########################
#################################################
# 单因素
if(T){
library("survival")
library("dplyr")
pretty_breast <- data[,c("juejing","T","N","M",
"stage","SUBTYPE","DFS","status",
"age","Group")]
colnames(pretty_breast ) = c("Menstrual period","T","N","M",
"stage","subtype","DFS","status",
"age","group")
pretty_breast$T = ifelse(pretty_breast$T<3,"T0-T2","T3-T4")
pretty_breast$N = ifelse(pretty_breast$N<1,"N0","N1-N3")
pretty_breast$stage = ifelse(pretty_breast$subtype<3,"I-II","III-IV")
pretty_breast$subtype = ifelse(pretty_breast$subtype==4,"TNBC","non-TNBC")
pretty_breast$age = ifelse(pretty_breast$age<=50,"<=30",">30")
pretty_breast$M = ifelse(pretty_breast$M<=0,"M0","M1")
pretty_breast$`Menstrual period` = ifelse(pretty_breast$`Menstrual period`==1,"pausimenia","non-pausimenia")
print(forest_model(coxph(Surv(DFS, status) ~ ., pretty_breast)))
ggsave(filename = paste0(save_pwd,"forest_cox.png"),height = 6)
}
# 控制多因素
if(T){
forest_model(coxph(Surv(DFS, status) ~ group + stage + N, pretty_breast))
ggsave(filename = paste0(save_pwd,"forest_multi_cox.png"))
}