# ======= 加载必要包 清除环境 ==============
# 清除环境
rm(list = ls(all.names = TRUE))
options(stringsAsFactors =F) #避免自动将字符串转换为R语言因子
# Load GEOquery without displaying any messages
suppressMessages(library(GEOquery))
# Load data.table (messages are not suppressed)
library(data.table)
# 解决编码问题
Sys.setlocale(category = "LC_ALL", locale = "English_United States.1252")
# 增加缓冲区大小和超时设置
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 100)
options(timeout = 99999)
# 设置readr兼容模式
readr::local_edition(1)
setwd("C:\\Users\\Administrator\\Desktop\\1")
#### 表达矩阵获取 ####
# 方式一:从GEO下载表达矩阵(同时获取平台注释)
library(GEOquery)
gset <- getGEO("GSE74602", destdir = "GSE74602/", AnnotGPL = TRUE, getGPL = TRUE)
exp <- exprs(gset[[1]]) # 获取表达矩阵
#### 平台注释信息获取 ####
# 方法三:从GEOquery对象中提取(另一个方法)
feature_data <- fData(gset[[1]])
# 查看列名,确定基因符号所在的列
colnames(feature_data)
# 对于GPL6104平台,基因符号列通常是"Gene Symbol"
ids_direct <- feature_data[, c("ID", "Gene Symbol")]
colnames(ids_direct) <- c("ID", "Gene_name")
ids_direct <- subset(ids_direct, Gene_name != "" & !is.na(Gene_name))
head(ids_direct)
# 方法二:使用idmap3包(推荐)
# devtools::install_github("jimzeng/idmap3") # 首次使用需安装
library(idmap3)
IDs <- get_pipe_IDs('GPL6104') # 自动获取探针注释
colnames(IDs) <- c("ID", "Gene_name")
head(IDs)
#### 样本信息获取 ####
pdata <- pData(gset[[1]])
# 提取肿瘤组和正常组
Tumor <- rownames(pdata)[grep("Tumor", pdata$`tissue type:ch1`)]
Normal <- rownames(pdata)[grep("Normal", pdata$`tissue type:ch1`)]
group <- c(rep("Tumor", length(Tumor)), rep("Normal", length(Normal)))
#### 处理表达矩阵 ####
library(dplyr)
# 使用直接从GEOquery对象获取的注释信息(方法三)
exp_df <- as.data.frame(exp)
exp_df$ID <- rownames(exp_df)
exp_annotated <- exp_df %>%
inner_join(ids_direct, by = "ID") %>%
select(-ID)
# 处理重复基因名(取平均值)
exp_processed <- exp_annotated %>%
group_by(Gene_name) %>%
summarise(across(everything(), mean)) %>%
as.data.frame()
# 设置行名
rownames(exp_processed) <- exp_processed$Gene_name
exp_processed <- exp_processed[, -1]
# 筛选肿瘤和正常样本
exp_final <- exp_processed[, c(Tumor, Normal)]
#### 备选方法:从补充文件获取表达矩阵 ####
# 方式二:从补充文件中下载表达矩阵
library(data.table)
exp1 <- fread("GSE74602/GSE74602_non_normalized.txt.gz")
rownames(exp1) <- exp1$ID_REF
exp1 <- exp1[, -1] # 移除ID_REF列
# 使用idmap注释(方法二)
exp1_df <- as.data.frame(exp1)
exp1_df$ID <- rownames(exp1_df)
exp1_annotated <- exp1_df %>%
inner_join(IDs, by = "ID") %>%
select(-ID)
# 处理重复基因名
exp1_processed <- exp1_annotated %>%
group_by(Gene_name) %>%
summarise(across(everything(), mean)) %>%
as.data.frame()
rownames(exp1_processed) <- exp1_processed$Gene_name
exp1_processed <- exp1_processed[, -1]
# 确保样本名一致
colnames(exp1_processed) <- colnames(exp_final)
#### 保存数据 ####
save(exp_final, exp1_processed, Tumor, Normal, group, file = "GSE74602/GSE74602数据集.RData")
探针注释信息的完整获取方法
setwd(“C:\Users\Administrator\Desktop\1”)
表达矩阵获取
library(GEOquery)
gset <- getGEO(“GSE74602”, destdir = “GSE74602/”, AnnotGPL = FALSE, getGPL = FALSE)
exp <- exprs(gset[[1]]) # 获取表达矩阵
平台注释信息获取 - 多种方法
方法一:直接从GEO下载GPL文件
gpl <- getGEO(‘GPL6104’, destdir = “GSE74602/”)
gpl_table <- Table(gpl)
ids <- gpl_table[, c(1, 12)]
colnames(ids) <- c(“ID”, “Gene_name”)
ids <- subset(ids, Gene_name != “”) # 过滤空基因名
head(ids)
方法二:使用idmap3包(推荐)
devtools::install_github(“jimzeng/idmap3”) # 首次使用需安装
library(idmap3)
IDs <- get_pipe_IDs(‘GPL6104’) # 自动获取探针注释
colnames(IDs) <- c(“ID”, “Gene_name”)
head(IDs)
方法三:从GEOquery对象中提取(另一个方法)
此方法需要下载GPL信息,但更直接
gset_with_gpl <- getGEO(“GSE74602”, destdir = “GSE74602/”, AnnotGPL = TRUE, getGPL = TRUE)
feature_data <- fData(gset_with_gpl[[1]])
ids_direct <- feature_data[, c(“ID”, “Gene Symbol”)]
colnames(ids_direct) <- c(“ID”, “Gene_name”)
ids_direct <- subset(ids_direct, Gene_name != “”)
head(ids_direct)
方法四:使用Bioconductor注释包(最可靠)
首先确定GPL6104对应的Bioconductor注释包
GPL6104对应Illumina HumanHT-12 V3.0芯片
安装对应注释包:biocLite(“illuminaHumanv3.db”)
library(illuminaHumanv3.db)
获取探针到基因的映射
probe_ids <- rownames(exp)
gene_symbols <- mapIds(illuminaHumanv3.db,
keys = probe_ids,
column = “SYMBOL”,
keytype = “PROBEID”)
ids_bioc <- data.frame(ID = probe_ids, Gene_name = gene_symbols)
ids_bioc <- na.omit(ids_bioc) # 移除无注释的探针
head(ids_bioc)
样本信息获取
pdata <- pData(gset[[1]])
提取肿瘤组和正常组
Tumor <- rownames(pdata)[grep(“Tumor”, pdata‘tissuetype:ch1‘)]Normal<−rownames(pdata)[grep("Normal",pdata`tissue type:ch1`)]
Normal <- rownames(pdata)[grep("Normal", pdata‘tissuetype:ch1‘)]Normal<−rownames(pdata)[grep("Normal",pdatatissue type:ch1)]
group <- c(rep(“Tumor”, length(Tumor)), rep(“Normal”, length(Normal)))
处理表达矩阵
library(dplyr)
使用Bioconductor注释(方法四)处理表达矩阵
exp_df <- as.data.frame(exp)
exp_df$ID <- rownames(exp_df)
exp_annotated <- exp_df %>%
inner_join(ids_bioc, by = “ID”) %>%
select(-ID)
处理重复基因名(取平均值)
exp_processed <- exp_annotated %>%
group_by(Gene_name) %>%
summarise(across(everything(), mean)) %>%
as.data.frame()
设置行名
rownames(exp_processed) <- exp_processed$Gene_name
exp_processed <- exp_processed[, -1]
筛选肿瘤和正常样本
exp_final <- exp_processed[, c(Tumor, Normal)]
保存数据
save(exp_final, Tumor, Normal, group, file = “GSE74602/GSE74602数据集.RData”)
9183

被折叠的 条评论
为什么被折叠?



