CUT and Tag 数据分析与整理

原创已于 2025-03-04 17:55:32 修改 · 1.3k 阅读

10 ·

CC 4.0 BY-SA版权

文章标签：

#linux #运维 #服务器

于 2025-03-04 15:13:01 首次发布

参考文献：

https://zhuanlan.zhihu.com/p/686382493

CUT&Tag数据分析笔记（1） - 简书

CUT&Tag数据分析笔记（2） - 简书

https://zhuanlan.zhihu.com/p/356333135

CUT&Tag Data Processing and Analysis Tutorial

1. 数据清洗与质控

官网建议不需要进行数据去接头

Critical step: There is no need to trim reads from out standard 25x25 PE sequencing, as adapter sequences will not be included in reads of inserts >25 bp. However, for users performing longer sequencing, reads will need to be trimmed by Cutadapt and mapped by --local --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 to ignore any remaining adapter sequence at the 3’ ends of reads during mapping

但是我们测试发现不去接头比对率非常低，所以此处用fastp去接头

#fastqc.sh
##== linux command ==##
### 质控脚本如下
projPath=/home/xintong/CutTag
echo -e " \n \n \n  111# qc 1 !!! \n \n \n "      
mkdir -p ${projPath}/fastqFileQC/qc1
cd  ${projPath}/fastqFileQC/qc1
pwd
ls ${projPath}/raw/*fq.gz | xargs fastqc -t 12 -o  ./
multiqc ./

echo -e  " \n 111#  ALL  Work Done!!! \n "

###fastp.sh
cores=20
projPath=/home/xintong/CutTag
mkdir -p ${projPath}/clean
for histName in 370-1  EV-1 ;do
echo "I am processing ${projPath}/raw/${histName}"
fastp -w ${cores} -i ${projPath}/raw/${histName}_R1.fq.gz -I ${projPath}/raw/${histName}_R2.fq.gz -o ${projPath}/clean/${histName}_clean_R1.fq.gz -O ${projPath}/clean/${histName}_clean_R2.fq.gz
echo "Now I finished processing ${projPath}/raw/${histName}"
done

2. 数据比对至参考基因组上

比对时设置参数--local, 局部比对

###clean_align.sh
##== linux command ==##
## Build the bowtie2 reference genome index if needed:
## bowtie2-build path/to/hg38/fasta/hg38.fa /path/to/bowtie2Index/hg38
cores=20
ref=/home/xintong/ATAC_Project/ATAC_EDDvsEMvsLEAF/reference/ricegenome           # 这个是我构建好的基因组的位置，也可以直接下载
projPath=/home/xintong/CutTag
#mkdir -p ${projPath}/alignment/sam/bowtie2_summary
#mkdir -p ${projPath}/alignment/bam
#mkdir -p ${projPath}/alignment/bed
#mkdir -p ${projPath}/alignment/bedgraph
for histName in 370-1  EV-1 ;do
echo I am processing ${projPath}/clean/${histName}
bowtie2 --local --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p ${cores} -x ${ref} -1 ${projPath}/clean/${histName}_clean_R1.fq.gz -2 ${projPath}/clean/${histName}_clean_R2.fq.gz -S ${projPath}/alignment/sam/${histName}_clean_bowtie2.sam &> ${projPath}/alignment/sam/bowtie2_summary/${histName}_clean_bowtie2.txt;
done

将clean data同时比对至E coli参考基因组上，可以进行后续spike in calibration

#spike_in_clean.sh
##== linux command ==##
projPath=/home/xintong/CutTag
spikeInRef="/home/xintong/CutTag/Ecoli_ref/ecoli/ecoli"
cores=20
#chromSize="/fh/fast/gottardo_r/yezheng_working/SupplementaryData/hg38/chromSize/hg38.chrom.size"

## bowtie2-build path/to/Ecoli/fasta/Ecoli.fa /path/to/bowtie2Index/Ecoli
for histName in 370-1  EV-1;do
bowtie2 --end-to-end --very-sensitive --no-overlap --no-dovetail --no-mixed --no-discordant --phred33 -I 10 -X 700 -p ${cores} -x ${spikeInRef} -1 ${projPath}/clean/${histName}_clean_R1.fq.gz -2 ${projPath}/clean/${histName}_clean_R2.fq.gz -S $projPath/alignment/sam/${histName}_bowtie2_spikeIn_clean.sam &> $projPath/alignment/sam/bowtie2_summary/${histName}_bowtie2_spikeIn_clean.txt

seqDepthDouble=`samtools view -F 0x04 $projPath/alignment/sam/${histName}_bowtie2_spikeIn_clean.sam | wc -l`
seqDepth=$((seqDepthDouble/2))

echo $seqDepth >$projPath/alignment/sam/bowtie2_summary/${histName}_bowtie2_spikeIn_clean.seqDepth
done

3 清洗比对至参考基因组上的sam数据

#filter_sam.sh
##== linux 命令 ==##
#通过回帖质量来筛选回贴上的 reads（可选）
minQualityScore=2
projPath=/home/xintong/CutTag
for histName in 370-1 EV-1 ; do 
echo Now I am processing ${projPath}/alignment/sam/${histName}

# 使用 samtools 的 view 子命令，-h 选项表示在输出中包含头部信息，-q $minQualityScore 选项用于筛选出比对质量得分大于等于 $minQualityScore 的比对记录。
# 从 ${projPath}/alignment/sam/${histName}_clean_bowtie2.sam 文件中筛选出符合条件的记录，并将结果输出到 ${projPath}/alignment/sam/${histName}_clean_bowtie2_mQ.sam 文件中。
samtools view -h -q $minQualityScore ${projPath}/alignment/sam/${histName}_clean_bowtie2.sam >${projPath}/alignment/sam/${histName}_clean_bowtie2_mQ.sam

##== linux 命令 ==##
## Filter and keep the mapped read pairs

#本节是为 Peak calling 和 可视化做准备 所必需的，其中有一些过滤和文件格式转换需要完成。
## 筛选和保留比对上的双端 reads 
# 使用 samtools 的 view 子命令，-b 选项表示将输入的 SAM 格式文件转换为 BAM 格式（二进制的 SAM 文件），-S 选项指明输入文件为 SAM 格式。
# -F 0x04 选项用于过滤掉未比对上的记录（0x04 是 SAM 标志中表示未比对上的标志位）。
# 从 ${projPath}/alignment/sam/${histName}_clean_bowtie2_mQ.sam 这个 SAM 文件中筛选出已比对上的记录，并将其转换为 BAM 格式，
# 最后将结果输出到 ${projPath}/alignment/bam/${histName}_bowtie2.mapped.bam 文件中。
samtools view -bS -F 0x04 ${projPath}/alignment/sam/${histName}_clean_bowtie2_mQ.sam > ${projPath}/alignment/bam/${histName}_bowtie2.mapped.bam
## Convert into bed file format
## 将 BAM 文件转换为 bed 文件格式
bedtools bamtobed -bedpe -i ${projPath}/alignment/bam/${histName}_bowtie2.mapped.bam >  ${projPath}/alignment/bed/${histName}_bowtie2.bed
## Keep the read pairs that are on the same chromosome and fragment length less than 1000bp.
## 保留那些在同一条染色体且片段长度小于 1000bp 的双端 reads
awk '$1==$4 && $6-$2 < 1000 {print $0}' ${projPath}/alignment/bed/${histName}_bowtie2.bed >  ${projPath}/alignment/bed/${histName}_bowtie2.clean.bed
## Only extract the fragment related columns
## 仅提取片段相关的列 
cut -f 1,2,6 ${projPath}/alignment/bed/${histName}_bowtie2.clean.bed | sort -k1,1 -k2,2n -k3,3n  >${projPath}/alignment/bed/${histName}_bowtie2.fragments.bed

done

4. Callpeaks

官网推荐 seacr 这里用的macs2

projPath=/home/xintong/CutTag
#mkdir -p $projPath/peakCalling
histName=370-1
controlName=EV-1
macs2 callpeak -t ${projPath}/alignment/bam/${histName}_bowtie2.mapped.bam \
      -c ${projPath}/alignment/bam/${controlName}_bowtie2.mapped.bam \
      -g hs -f BAMPE -n macs2_peak_q0.1 --outdir $projPath/peakCalling/MACS2 -q 0.1 --keep-dup all 2>${projPath}/peakCalling/MACS2/macs2Peak_summary.txt

5. 生成bw文件用于IGV可视化

projPath=/home/xintong/CutTag
cd ${projPath}/alignment/bam/
for histName in 370-1  EV-1;do
 #samtools view -@ 20 -S ${projPath}/alignment/sam/${histName}_bowtie2.sam -b > ${histName}.bam
 samtools sort -@ 20 -l 5 -o ${histName}.sort.bam ${histName}_bowtie2.mapped.bam
 samtools index -@ 25 ${histName}.sort.bam
 samtools flagstat ${histName}.sort.bam >  ${histName}.stat
 bamCoverage -p 20 -v --binSize 10  --normalizeUsing RPKM  --extendReads -b ${histName}.sort.bam -o ${histName}.bam.sort.bw;
done

6. 对Peak进行基因注释，R中进行

#在monocle环境的R下运行
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") 
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
setwd("/home/ruiqing/peak250304")
library(BiocManager)
#install.packages('ggupset')
library(ggupset)
library(ggplot2)
library(GenomicFeatures)
#BiocManager::install('ChIPseeker')
library(ChIPseeker)
#BiocManager::install('ChIPpeakAnno')
library(ChIPpeakAnno)
library(clusterProfiler)
library(org.Osativa.eg.db)
org.oryza=org.Osativa.eg.db

txdb.oryza <- makeTxDbFromGFF("/home/xintong/ATAC_Project/ATAC_EDDvsEMvsLEAF/reference/all.gff3")
#save(txdb.oryza,file = '/home/xintong/ATAC_Project/ATAC_EDDvsEMvsLEAF/txdb_oryza.RData')
#load('/home/xintong/ATAC_Project/ATAC_EDDvsEMvsLEAF/txdb_oryza.RData')

# 指定目录路径
target_dir <- "/home/ruiqing/peak250304"

# 列出指定目录下以 .bed 结尾的文件
bed_files <- list.files(path = target_dir, pattern = "\\.bed$", full.names = FALSE)

if (length(bed_files) == 0) {
  stop("No .bed files found in the specified directory.")
}
# 打印结果
print(bed_files)

promoter <- getPromoters(TxDb=txdb.oryza, 
                         upstream=3000, downstream=0)
                         
for (bedfile in bed_files){
EDD_anno <- annotatePeak(bedfile, # 分别改成2或者3或者4即可，分别对应四个文件
                          tssRegion=c(-3000, 0),
                        TxDb=txdb.oryza, annoDb="org.Osativa.eg.db")
cuttag_annotation=as.data.frame(EDD_anno)
 output_file <- file.path("/home/ruiqing/peak250304/anno/", paste0(tools::file_path_sans_ext(bedfile), ".csv"))
write.csv(cuttag_annotation,output_file)
}