Scaffolding

基于数据库架构生成网页模板的过程。在 ASP .NET 中, 动态数据使用基架来简化基于 Web 的 UI 的生成过程。用户可以通过这种 UI 来查看和更新数据库。
根据这些参考脚本内容”#!/bin/bash ################################ START OF YAHS SCAFFOLDING TEST ################################ #### download the test data ## test is either "LYZE01" or "CHM13" test="LYZE01" #test="CHM13" wget https://zenodo.org/record/7079219/files/${test}.contigs.fasta.gz wget https://zenodo.org/record/7079219/files/${test}.contigs.fasta.gz.fai wget https://zenodo.org/record/7079219/files/${test}.hic.qn.bam out="test_out" outdir="." contigs="${test}.contigs.fasta.gz" # need to be indexed, i.e., ${test}.contigs.fasta.gz.fai is presented hicaln="${test}.hic.qn.bam" # could be .bed, .bam or .bin file #### run yahs scaffolding ../yahs -o ${outdir}/${out} ${contigs} ${hicaln} || exit 1 ## LYZE01 is a very small genome, ## better result when running YaHS with option "-r 1000,2000,5000,10000,20000,50000,100000,200000,500000" #../yahs -r 1000,2000,5000,10000,20000,50000,100000,200000,500000 -o ${outdir}/${out} ${contigs} ${hicaln} || exit 1 ################################# END OF YAHS SCAFFOLDING TEST ################################# #### change doplot to 1 if you want to generate hic contact maps ## you also need to set the path of juicer_tools, pretext_map, pretext_snapshot, and samtools doplot=0 if [ ${doplot} -eq 0 ]; then exit 0; fi ## need juicer_tools/pretextmap and samtools if want to do hic plot ## juicer_tools: https://github.com/aidenlab/juicer/wiki/Download ## PretextMap: https://github.com/wtsi-hpag/PretextMap ## PretexSnapshot: https://github.com/wtsi-hpag/PretextSnapshot ## samtools: https://github.com/samtools/samtools ## please adjust the path to juicer_tools and samtools ## here we use 12 CPUs and 32Gb memory for juicer_tools pre - adjust it according to your device ## see more information for juicer tools https://github.com/aidenlab/juicer/wiki/Juicer-Tools-Quick-Start ## output file will be ${outdir}/${out}.hic ## the output hic file could be viewed with JuiceBox https://github.com/aidenlab/Juicebox #juicer_tools="java -Xmx32G -jar /bin/juicer_tools_1.22.01.jar pre --threads 12" ## v1.9.9 seems much faster than v1.22.01 juicer_tools="java -Xmx32G -jar /bin/juicer_tools.1.9.9_jcuda.0.8.jar pre" pretext_map="/bin/PretextMap" pretext_snapshot="/bin/PretextSnapshot" samtools="/bin/samtools" #### this is to generate input file for juicer_tools - non-assembly mode or for PretextMap ## here we use 8 CPUs and 32Gb memory for sorting - adjust it according to your device (../juicer pre ${outdir}/${out}.bin ${outdir}/${out}_scaffolds_final.agp ${contigs}.fai 2>${outdir}/tmp_juicer_pre.log | LC_ALL=C sort -k2,2d -k6,6d -T ${outdir} --parallel=8 -S32G | awk 'NF' > ${outdir}/alignments_sorted.txt.part) && (mv ${outdir}/alignments_sorted.txt.part ${outdir}/alignments_sorted.txt) ## prepare chromosome size file from samtools index file # ${samtools} faidx ${outdir}/${out}_scaffolds_final.fa # cut -f1-2 ${outdir}/${out}_scaffolds_final.fa.fai >${outdir}/${out}_scaffolds_final.chrom.sizes ## another way to prepare chromosome size file ## this is an easier way especially when we have >2G scaffolds which need scaling cat ${outdir}/tmp_juicer_pre.log | grep "PRE_C_SIZE" | cut -d' ' -f2- >${outdir}/${out}_scaffolds_final.chrom.sizes ## do juicer hic map (${juicer_tools} ${outdir}/alignments_sorted.txt ${outdir}/${out}.hic.part ${outdir}/${out}_scaffolds_final.chrom.sizes) && (mv ${outdir}/${out}.hic.part ${outdir}/${out}.hic) ## do Pretext hic map (awk 'BEGIN{print "## pairs format v1.0"} {print "#chromsize:\t"$1"\t"$2} END {print "#columns:\treadID\tchr1\tpos1\tchr2\tpos2\tstrand1\tstrand2"}' ${outdir}/${out}_scaffolds_final.chrom.sizes; awk '{print ".\t"$2"\t"$3"\t"$6"\t"$7"\t.\t."}' alignments_sorted.txt) | ${pretext_map} -o ${outdir}/${out}.pretext # and a pretext snapshot ${pretext_snapshot} -m ${outdir}/${out}.pretext --sequences "=full" -o ${outdir} #### this is to generate input file for juicer_tools - assembly (JBAT) mode (-a) ../juicer pre -a -o ${outdir}/${out}_JBAT ${outdir}/${out}.bin ${outdir}/${out}_scaffolds_final.agp ${contigs}.fai 2>${outdir}/tmp_juicer_pre_JBAT.log cat ${outdir}/tmp_juicer_pre_JBAT.log | grep "PRE_C_SIZE" | cut -d' ' -f2- >${outdir}/${out}_JBAT.chrom.sizes (${juicer_tools} ${outdir}/${out}_JBAT.txt ${outdir}/${out}_JBAT.hic.part ${outdir}/${out}_JBAT.chrom.sizes) && (mv ${outdir}/${out}_JBAT.hic.part ${outdir}/${out}_JBAT.hic) #### this is to generate final genome assembly file after manual curation with JuiceBox (JBAT) ## the output assembly file after curation is ${outdir}/${out}_JBAT.review.assembly ## the final output is ${outdir}/${out}_JBAT.FINAL.agp and ${outdir}/${out}_JBAT.FINAL.fa # ../juicer post -o ${outdir}/${out}_JBAT ${outdir}/${out}_JBAT.review.assembly ${outdir}/${out}_JBAT.liftover.agp ${contigs}“#!/bin/bash ################################ START OF YAHS SCAFFOLDING TEST ################################ #### download the test data ## test is either "LYZE01" or "CHM13" test="LYZE01" #test="CHM13" wget https://zenodo.org/record/7079219/files/${test}.contigs.fasta.gz wget https://zenodo.org/record/7079219/files/${test}.contigs.fasta.gz.fai wget https://zenodo.org/record/7079219/files/${test}.hic.qn.bam out="test_out" outdir="." contigs="${test}.contigs.fasta.gz" # need to be indexed, i.e., ${test}.contigs.fasta.gz.fai is presented hicaln="${test}.hic.qn.bam" # could be .bed, .bam or .bin file #### run yahs scaffolding ../yahs -o ${outdir}/${out} ${contigs} ${hicaln} || exit 1 ## LYZE01 is a very small genome, ## better result when running YaHS with option "-r 1000,2000,5000,10000,20000,50000,100000,200000,500000" #../yahs -r 1000,2000,5000,10000,20000,50000,100000,200000,500000 -o ${outdir}/${out} ${contigs} ${hicaln} || exit 1 ################################# END OF YAHS SCAFFOLDING TEST ################################# #### change doplot to 1 if you want to generate hic contact maps ## you also need to set the path of juicer_tools, pretext_map, pretext_snapshot, and samtools doplot=0 if [ ${doplot} -eq 0 ]; then exit 0; fi ## need juicer_tools/pretextmap and samtools if want to do hic plot ## juicer_tools: https://github.com/aidenlab/juicer/wiki/Download ## PretextMap: https://github.com/wtsi-hpag/PretextMap ## PretexSnapshot: https://github.com/wtsi-hpag/PretextSnapshot ## samtools: https://github.com/samtools/samtools ## please adjust the path to juicer_tools and samtools ## here we use 12 CPUs and 32Gb memory for juicer_tools pre - adjust it according to your device ## see more information for juicer tools https://github.com/aidenlab/juicer/wiki/Juicer-Tools-Quick-Start ## output file will be ${outdir}/${out}.hic ## the output hic file could be viewed with JuiceBox https://github.com/aidenlab/Juicebox #juicer_tools="java -Xmx32G -jar /bin/juicer_tools_1.22.01.jar pre --threads 12" ## v1.9.9 seems much faster than v1.22.01 juicer_tools="java -Xmx32G -jar /bin/juicer_tools.1.9.9_jcuda.0.8.jar pre" pretext_map="/bin/PretextMap" pretext_snapshot="/bin/PretextSnapshot" samtools="/bin/samtools" #### this is to generate input file for juicer_tools - non-assembly mode or for PretextMap ## here we use 8 CPUs and 32Gb memory for sorting - adjust it according to your device (../juicer pre ${outdir}/${out}.bin ${outdir}/${out}_scaffolds_final.agp ${contigs}.fai 2>${outdir}/tmp_juicer_pre.log | LC_ALL=C sort -k2,2d -k6,6d -T ${outdir} --parallel=8 -S32G | awk 'NF' > ${outdir}/alignments_sorted.txt.part) && (mv ${outdir}/alignments_sorted.txt.part ${outdir}/alignments_sorted.txt) ## prepare chromosome size file from samtools index file # ${samtools} faidx ${outdir}/${out}_scaffolds_final.fa # cut -f1-2 ${outdir}/${out}_scaffolds_final.fa.fai >${outdir}/${out}_scaffolds_final.chrom.sizes ## another way to prepare chromosome size file ## this is an easier way especially when we have >2G scaffolds which need scaling cat ${outdir}/tmp_juicer_pre.log | grep "PRE_C_SIZE" | cut -d' ' -f2- >${outdir}/${out}_scaffolds_final.chrom.sizes ## do juicer hic map (${juicer_tools} ${outdir}/alignments_sorted.txt ${outdir}/${out}.hic.part ${outdir}/${out}_scaffolds_final.chrom.sizes) && (mv ${outdir}/${out}.hic.part ${outdir}/${out}.hic) ## do Pretext hic map (awk 'BEGIN{print "## pairs format v1.0"} {print "#chromsize:\t"$1"\t"$2} END {print "#columns:\treadID\tchr1\tpos1\tchr2\tpos2\tstrand1\tstrand2"}' ${outdir}/${out}_scaffolds_final.chrom.sizes; awk '{print ".\t"$2"\t"$3"\t"$6"\t"$7"\t.\t."}' alignments_sorted.txt) | ${pretext_map} -o ${outdir}/${out}.pretext # and a pretext snapshot ${pretext_snapshot} -m ${outdir}/${out}.pretext --sequences "=full" -o ${outdir} #### this is to generate input file for juicer_tools - assembly (JBAT) mode (-a) ../juicer pre -a -o ${outdir}/${out}_JBAT ${outdir}/${out}.bin ${outdir}/${out}_scaffolds_final.agp ${contigs}.fai 2>${outdir}/tmp_juicer_pre_JBAT.log cat ${outdir}/tmp_juicer_pre_JBAT.log | grep "PRE_C_SIZE" | cut -d' ' -f2- >${outdir}/${out}_JBAT.chrom.sizes (${juicer_tools} ${outdir}/${out}_JBAT.txt ${outdir}/${out}_JBAT.hic.part ${outdir}/${out}_JBAT.chrom.sizes) && (mv ${outdir}/${out}_JBAT.hic.part ${outdir}/${out}_JBAT.hic) #### this is to generate final genome assembly file after manual curation with JuiceBox (JBAT) ## the output assembly file after curation is ${outdir}/${out}_JBAT.review.assembly ## the final output is ${outdir}/${out}_JBAT.FINAL.agp and ${outdir}/${out}_JBAT.FINAL.fa # ../juicer post -o ${outdir}/${out}_JBAT ${outdir}/${out}_JBAT.review.assembly ${outdir}/${out}_JBAT.liftover.agp ${contigs}”重新调整上面对话中的脚本“#!/bin/bash #SBATCH -p amd_256 #SBATCH -N 1 #SBATCH -n 32 #SBATCH -t 96:00:00 #SBATCH --mem=256G #SBATCH -J hic_chromap #SBATCH -o /public1/home/scb3201/rawdata/Hic/logs/%j_chromap_out.log #SBATCH -e /public1/home/scb3201/rawdata/Hic/logs/%j_chromap_err.log ########################################################## # 1. 环境设置与路径定义 ########################################################## echo "[INFO] 开始Chromap Hi-C分析流程... $(date)" # 记录开始时间 START_TIME=$(date +%s) # 激活Conda环境 source ~/.bashrc conda activate Hic || { echo "[ERROR] Hic环境激活失败! $(date)" exit 1 } # 核心路径 BASE_DIR="/public1/home/scb3201/rawdata/Hic" RESULTS_DIR="/public1/home/scb3201/results/genome/Hic" REF_GENOME="/public1/home/scb3201/results/genome/Medaka/genome.nextpolish.medaka.fasta" READ1="${BASE_DIR}/reads/hic_R1.fastq.gz" READ2="${BASE_DIR}/reads/hic_R2.fastq.gz" # 子目录 ALIGN_DIR="${RESULTS_DIR}/alignment" SCAFFOLD_DIR="${RESULTS_DIR}/scaffold" JUICEBOX_DIR="${RESULTS_DIR}/juicebox" LOG_DIR="${BASE_DIR}/logs" TMP_DIR="${BASE_DIR}/tmp" # 创建目录 mkdir -p ${ALIGN_DIR} ${SCAFFOLD_DIR} ${JUICEBOX_DIR} ${LOG_DIR} ${TMP_DIR} # 设置临时目录 export TMPDIR=${TMP_DIR} export _JAVA_OPTIONS="-Xmx200G -Xms64G" # 获取任务ID用于日志命名 JOB_ID=${SLURM_JOB_ID:-manual_$(date +%s)} ########################################################## # 2. 参考基因组准备 ########################################################## echo "[STEP] 准备参考基因组... $(date)" # 函数:检查步骤执行状态 check_status() { if [ $? -ne 0 ]; then echo "[ERROR] $1 失败! $(date)" exit 1 fi } # 生成Chromap索引 if [[ ! -f ${REF_GENOME}.chromap.idx ]]; then echo "[INFO] 创建Chromap索引..." chromap --preset hic -i -r ${REF_GENOME} -o ${REF_GENOME}.chromap.idx 2> ${LOG_DIR}/chromap_index_${JOB_ID}.log check_status "Chromap索引创建" else echo "[INFO] 已存在Chromap索引,跳过创建" fi # 生成染色体长度文件 if [[ ! -f ${REF_GENOME}.fai ]]; then echo "[INFO] 创建参考基因组索引..." samtools faidx ${REF_GENOME} 2> ${LOG_DIR}/samtools_faidx_${JOB_ID}.log check_status "参考基因组索引创建" else echo "[INFO] 已存在参考基因组索引,跳过创建" fi # 创建染色体大小文件 cut -f1,2 ${REF_GENOME}.fai > ${JUICEBOX_DIR}/chrom.sizes 2> ${LOG_DIR}/chrom_sizes_${JOB_ID}.log check_status "染色体大小文件生成" ########################################################## # 3. Hi-C数据比对(使用Chromap) ########################################################## echo "[STEP] 使用Chromap比对Hi-C reads... $(date)" # Chromap比对参数说明: # --preset hic: 使用Hi-C专用预设参数 # -t: 线程数 # -x: 索引文件 # -1/-2: 双端测序文件 # -o: 输出BAM文件 # --low-mem: 低内存模式(适用于大基因组) # --remove-pcr-duplicates: 移除PCR重复 chromap \ --preset hic \ -t ${SLURM_NTASKS} \ -x ${REF_GENOME}.chromap.idx \ -1 ${READ1} \ -2 ${READ2} \ -o ${ALIGN_DIR}/aligned.bam \ --low-mem \ --remove-pcr-duplicates \ 2> ${LOG_DIR}/chromap_align_${JOB_ID}.log check_status "Chromap比对" # 排序BAM文件 echo "[INFO] 排序比对结果..." samtools sort -@ ${SLURM_NTASKS} -m 4G -T ${TMP_DIR}/sort_temp -o ${ALIGN_DIR}/aligned.sorted.bam ${ALIGN_DIR}/aligned.bam 2> ${LOG_DIR}/samtools_sort_${JOB_ID}.log check_status "BAM排序" # 索引BAM文件 samtools index ${ALIGN_DIR}/aligned.sorted.bam 2> ${LOG_DIR}/samtools_index_${JOB_ID}.log || { echo "[WARNING] BAM索引创建失败,继续执行... $(date)" } ########################################################## # 4. 使用Yahs进行Scaffolding ########################################################## echo "[STEP] Yahs scaffolding... $(date)" # 运行Yahs echo "[INFO] 运行Yahs scaffolding..." yahs \ -o ${SCAFFOLD_DIR}/yahs_out \ ${REF_GENOME} \ ${ALIGN_DIR}/aligned.sorted.bam 2> ${LOG_DIR}/yahs_run_${JOB_ID}.log check_status "Yahs运行" ########################################################## # 5. 转换为Juicer格式 ########################################################## echo "[STEP] 转换为Juicer格式... $(date)" # 使用Yahs内置转换功能 echo "[METHOD] 使用Yahs内置转换..." yahs juicer \ ${SCAFFOLD_DIR}/yahs_out.bin \ ${JUICEBOX_DIR}/chrom.sizes \ ${SCAFFOLD_DIR}/yahs_out_scaffolds_final.agp \ ${REF_GENOME} \ ${JUICEBOX_DIR}/juicer 2> ${LOG_DIR}/yahs_juicer_${JOB_ID}.log if [[ $? -ne 0 ]]; then # 备用方法:使用samtools和awk处理 echo "[METHOD] Yahs内置转换失败,使用备用转换方法..." # 步骤1: 提取有效交互 echo "[SUBSTEP] 提取有效交互..." samtools view -h ${SCAFFOLD_DIR}/yahs_out.bin | \ awk 'BEGIN{OFS="\t"} { if($1 ~ /^@/) { next } # 跳过头部 if($7 != "=") { next } # 只保留正常配对的reads split($1, a, "/"); read_id = a[1] if(read_id == prev_id) { # 输出成对reads print prev_chr, prev_pos, prev_pos + length(prev_seq), $3, $4, $4 + length($10) read_id = "" # 避免重复输出 } else { # 保存当前read信息 prev_id = read_id prev_chr = $3 prev_pos = $4 prev_seq = $10 } }' > ${JUICEBOX_DIR}/interactions.bedpe 2> ${LOG_DIR}/interactions_extract_${JOB_ID}.log check_status "交互提取" # 步骤2: 转换为Juicer格式 echo "[SUBSTEP] 转换为Juicer矩阵..." awk 'BEGIN{OFS="\t"} { # Juicer格式: <str1> <chr1> <pos1> <frag1> <str2> <chr2> <pos2> <frag2> # 计算中点位置 pos1 = int(($2 + $3) / 2) pos2 = int(($5 + $6) / 2) # 输出交互矩阵 print 0, $1, pos1, 0, 0, $4, pos2, 1 }' ${JUICEBOX_DIR}/interactions.bedpe > ${JUICEBOX_DIR}/juicer.2.txt 2> ${LOG_DIR}/juicer_convert_${JOB_ID}.log check_status "Juicer格式转换" # 步骤3: 创建片段文件 echo "[SUBSTEP] 创建片段文件..." awk 'BEGIN{OFS="\t"} { print $1, 0, $2, 1 }' ${JUICEBOX_DIR}/chrom.sizes > ${JUICEBOX_DIR}/juicer.3.txt 2> ${LOG_DIR}/juicer_fragments_${JOB_ID}.log check_status "片段文件创建" # 复制染色体大小文件 cp ${JUICEBOX_DIR}/chrom.sizes ${JUICEBOX_DIR}/juicer.chromsizes fi ########################################################## # 6. 生成.hic文件 ########################################################## echo "[STEP] 生成.hic文件... $(date)" # 排序Juicer文件 echo "[INFO] 排序Juicer输入文件..." sort -S 32G -T ${TMP_DIR} --parallel=${SLURM_NTASKS} \ -k2,2d -k6,6d ${JUICEBOX_DIR}/juicer.2.txt > ${JUICEBOX_DIR}/juicer.2.sorted.txt 2> ${LOG_DIR}/juicer_sort_${JOB_ID}.log || { echo "[WARNING] 排序失败,使用原始文件... $(date)" cp ${JUICEBOX_DIR}/juicer.2.txt ${JUICEBOX_DIR}/juicer.2.sorted.txt } # 使用juicer_tools创建.hic文件 echo "[INFO] 运行juicer_tools..." # 尝试查找juicer_tools路径 JUICER_TOOLS=$(which juicer_tools 2>/dev/null) if [ -z "$JUICER_TOOLS" ]; then JUICER_TOOLS="java -jar $(find ${CONDA_PREFIX}/share -name 'juicer_tools*.jar' | head -n1)" fi # 运行juicer_tools $JUICER_TOOLS pre \ -j ${SLURM_NTASKS} \ -t ${TMP_DIR} \ ${JUICEBOX_DIR}/juicer.2.sorted.txt \ ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic \ ${JUICEBOX_DIR}/juicer.chromsizes 2> ${LOG_DIR}/juicer_pre_${JOB_ID}.log || { echo "[ERROR] .hic文件生成失败! $(date)" # 备选方法:使用Yahs的替代方案 echo "[FIX] 尝试Yahs替代方案..." yahs ${REF_GENOME} ${ALIGN_DIR}/aligned.sorted.bam \ -o ${SCAFFOLD_DIR}/yahs_alt \ --output-hic ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic \ 2> ${LOG_DIR}/yahs_alt_${JOB_ID}.log || { echo "[CRITICAL] 所有.hic生成方法均失败! $(date)" exit 1 } } ########################################################## # 7. 结果验证与报告 ########################################################## echo "[STEP] 验证结果... $(date)" # 计算运行时间 END_TIME=$(date +%s) RUNTIME=$((END_TIME - START_TIME)) RUNTIME_STR=$(printf "%02d:%02d:%02d" $((RUNTIME/3600)) $((RUNTIME%3600/60)) $((RUNTIME%60))) # 验证.hic文件 VALID_LOG="${LOG_DIR}/validate_${JOB_ID}.log" $JUICER_TOOLS validate ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic > ${VALID_LOG} 2>&1 VALIDATION_EXIT=$? # 生成统计报告 REPORT_FILE="${LOG_DIR}/final_report_${JOB_ID}.txt" echo "==== Chromap Hi-C分析结果报告 ====" > ${REPORT_FILE} echo "任务ID: ${JOB_ID}" >> ${REPORT_FILE} echo "开始时间: $(date -d @${START_TIME})" >> ${REPORT_FILE} echo "结束时间: $(date -d @${END_TIME})" >> ${REPORT_FILE} echo "运行时长: ${RUNTIME_STR}" >> ${REPORT_FILE} echo ".hic文件: ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic" >> ${REPORT_FILE} echo "文件大小: $(du -sh ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic | cut -f1)" >> ${REPORT_FILE} echo "验证结果: $(grep -i 'valid' ${VALID_LOG} | head -1 | cut -d':' -f2-)" >> ${REPORT_FILE} # 添加Yahs输出统计 echo -e "\n==== Yahs Scaffolding 统计 ====" >> ${REPORT_FILE} grep -A 5 "Scaffolding statistics" ${LOG_DIR}/yahs_run_${JOB_ID}.log >> ${REPORT_FILE} # 添加Chromap比对统计 echo -e "\n==== Chromap 比对统计 ====" >> ${REPORT_FILE} grep -A 10 "Summary of mapping" ${LOG_DIR}/chromap_align_${JOB_ID}.log >> ${REPORT_FILE} # 添加.hic文件统计 if [[ ${VALIDATION_EXIT} -eq 0 ]]; then echo -e "\n==== Hi-C 交互统计 ====" >> ${REPORT_FILE} $JUICER_TOOLS stats ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic >> ${REPORT_FILE} 2>> ${LOG_DIR}/stats_${JOB_ID}.log fi ########################################################## # 8. 清理与完成 ########################################################## echo "[INFO] 清理临时文件... $(date)" # 保留中间文件供调试 rm -f ${ALIGN_DIR}/aligned.bam rm -f ${JUICEBOX_DIR}/interactions.bedpe rm -f ${JUICEBOX_DIR}/juicer.2.txt rm -f ${JUICEBOX_DIR}/juicer.2.sorted.txt echo "[SUCCESS] Chromap Hi-C分析完成! $(date)" echo "结果文件: ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic" echo "报告文件: ${REPORT_FILE}" “
最新发布
12-07
<think>根据参考脚本,我对您的脚本进行了以下调整和增强: 1. 添加了绘图功能开关(doplot),并集成了Pretext和Juicer的绘图功能 2. 增强了JBAT(Juicer Box Assembly Tools)流程,包括预处理和后处理 3. 添加了多分辨率支持,特别是针对小型基因组 4. 优化了临时文件处理和错误恢复机制 以下是修改后的完整脚本: ```bash #!/bin/bash #SBATCH -p amd_256 #SBATCH -N 1 #SBATCH -n 32 #SBATCH -t 96:00:00 #SBATCH --mem=256G #SBATCH -J hic_chromap #SBATCH -o /public1/home/scb3201/rawdata/Hic/logs/%j_chromap_out.log #SBATCH -e /public1/home/scb3201/rawdata/Hic/logs/%j_chromap_err.log ########################################################## # 1. 环境设置与路径定义 ########################################################## echo "[INFO] 开始Chromap Hi-C分析流程... $(date)" # 记录开始时间 START_TIME=$(date +%s) # 激活Conda环境 source ~/.bashrc conda activate Hic || { echo "[ERROR] Hic环境激活失败! $(date)" exit 1 } # 核心路径 BASE_DIR="/public1/home/scb3201/rawdata/Hic" RESULTS_DIR="/public1/home/scb3201/results/genome/Hic" REF_GENOME="/public1/home/scb3201/results/genome/Medaka/genome.nextpolish.medaka.fasta" READ1="${BASE_DIR}/reads/hic_R1.fastq.gz" READ2="${BASE_DIR}/reads/hic_R2.fastq.gz" # 子目录 ALIGN_DIR="${RESULTS_DIR}/alignment" SCAFFOLD_DIR="${RESULTS_DIR}/scaffold" JUICEBOX_DIR="${RESULTS_DIR}/juicebox" LOG_DIR="${BASE_DIR}/logs" TMP_DIR="${BASE_DIR}/tmp" # 创建目录 mkdir -p ${ALIGN_DIR} ${SCAFFOLD_DIR} ${JUICEBOX_DIR} ${LOG_DIR} ${TMP_DIR} # 设置临时目录 export TMPDIR=${TMP_DIR} export _JAVA_OPTIONS="-Xmx200G -Xms64G" # 获取任务ID用于日志命名 JOB_ID=${SLURM_JOB_ID:-manual_$(date +%s)} # 绘图开关 (0:关闭, 1:开启) doplot=1 # 绘图工具路径 (请根据实际安装路径修改) JUICER_TOOLS="java -Xmx32G -jar /bin/juicer_tools.1.9.9_jcuda.0.8.jar pre" PREJUICER_TOOLS="${RESULTS_DIR}/juicer" # Yahs自带的juicer工具 PREJUICER="${PREJUICER_TOOLS}/juicer" # 实际的可执行文件路径 PREJUICER_DIR="${PREJUICER_TOOLS}" # 工具目录 PRETEXT_MAP="/bin/PretextMap" PRETEXT_SNAPSHOT="/bin/PretextSnapshot" SAMTOOLS="/bin/samtools" ########################################################## # 2. 参考基因组准备 ########################################################## echo "[STEP] 准备参考基因组... $(date)" # 函数:检查步骤执行状态 check_status() { if [ $? -ne 0 ]; then echo "[ERROR] $1 失败! $(date)" exit 1 fi } # 生成Chromap索引 if [[ ! -f ${REF_GENOME}.chromap.idx ]]; then echo "[INFO] 创建Chromap索引..." chromap --preset hic -i -r ${REF_GENOME} -o ${REF_GENOME}.chromap.idx 2> ${LOG_DIR}/chromap_index_${JOB_ID}.log check_status "Chromap索引创建" else echo "[INFO] 已存在Chromap索引,跳过创建" fi # 生成染色体长度文件 if [[ ! -f ${REF_GENOME}.fai ]]; then echo "[INFO] 创建参考基因组索引..." samtools faidx ${REF_GENOME} 2> ${LOG_DIR}/samtools_faidx_${JOB_ID}.log check_status "参考基因组索引创建" else echo "[INFO] 已存在参考基因组索引,跳过创建" fi # 创建染色体大小文件 cut -f1,2 ${REF_GENOME}.fai > ${JUICEBOX_DIR}/chrom.sizes 2> ${LOG_DIR}/chrom_sizes_${JOB_ID}.log check_status "染色体大小文件生成" ########################################################## # 3. Hi-C数据比对(使用Chromap) ########################################################## echo "[STEP] 使用Chromap比对Hi-C reads... $(date)" # Chromap比对参数说明: # --preset hic: 使用Hi-C专用预设参数 # -t: 线程数 # -x: 索引文件 # -1/-2: 双端测序文件 # -o: 输出BAM文件 # --low-mem: 低内存模式(适用于大基因组) # --remove-pcr-duplicates: 移除PCR重复 chromap \ --preset hic \ -t ${SLURM_NTASKS} \ -x ${REF_GENOME}.chromap.idx \ -1 ${READ1} \ -2 ${READ2} \ -o ${ALIGN_DIR}/aligned.bam \ --low-mem \ --remove-pcr-duplicates \ 2> ${LOG_DIR}/chromap_align_${JOB_ID}.log check_status "Chromap比对" # 排序BAM文件 echo "[INFO] 排序比对结果..." samtools sort -@ ${SLURM_NTASKS} -m 4G -T ${TMP_DIR}/sort_temp -o ${ALIGN_DIR}/aligned.sorted.bam ${ALIGN_DIR}/aligned.bam 2> ${LOG_DIR}/samtools_sort_${JOB_ID}.log check_status "BAM排序" # 索引BAM文件 samtools index ${ALIGN_DIR}/aligned.sorted.bam 2> ${LOG_DIR}/samtools_index_${JOB_ID}.log || { echo "[WARNING] BAM索引创建失败,继续执行... $(date)" } ########################################################## # 4. 使用Yahs进行Scaffolding ########################################################## echo "[STEP] Yahs scaffolding... $(date)" # 运行Yahs echo "[INFO] 运行Yahs scaffolding..." # 根据基因组大小选择分辨率参数(LYZE01为小型基因组示例) # 如果是小型基因组,使用高分辨率设置 GENOME_SIZE=$(stat -c%s ${REF_GENOME}) if [ ${GENOME_SIZE} -lt 100000000 ]; then # 小于100Mb RES_OPTIONS="-r 1000,2000,5000,10000,20000,50000,100000,200000,500000" echo "[INFO] 小型基因组检测 (${GENOME_SIZE} bp),启用高分辨率参数: ${RES_OPTIONS}" else RES_OPTIONS="" fi yahs ${RES_OPTIONS} \ -o ${SCAFFOLD_DIR}/yahs_out \ ${REF_GENOME} \ ${ALIGN_DIR}/aligned.sorted.bam 2> ${LOG_DIR}/yahs_run_${JOB_ID}.log check_status "Yahs运行" ########################################################## # 5. 转换为Juicer格式 ########################################################## echo "[STEP] 转换为Juicer格式... $(date)" # 使用Yahs内置转换功能 echo "[METHOD] 使用Yahs内置转换..." yahs juicer \ ${SCAFFOLD_DIR}/yahs_out.bin \ ${JUICEBOX_DIR}/chrom.sizes \ ${SCAFFOLD_DIR}/yahs_out_scaffolds_final.agp \ ${REF_GENOME} \ ${JUICEBOX_DIR}/juicer 2> ${LOG_DIR}/yahs_juicer_${JOB_ID}.log if [[ $? -ne 0 ]]; then # 备用方法:使用samtools和awk处理 echo "[METHOD] Yahs内置转换失败,使用备用转换方法..." # 步骤1: 提取有效交互 echo "[SUBSTEP] 提取有效交互..." samtools view -h ${SCAFFOLD_DIR}/yahs_out.bin | \ awk 'BEGIN{OFS="\t"} { if($1 ~ /^@/) { next } # 跳过头部 if($7 != "=") { next } # 只保留正常配对的reads split($1, a, "/"); read_id = a[1] if(read_id == prev_id) { # 输出成对reads print prev_chr, prev_pos, prev_pos + length(prev_seq), $3, $4, $4 + length($10) read_id = "" # 避免重复输出 } else { # 保存当前read信息 prev_id = read_id prev_chr = $3 prev_pos = $4 prev_seq = $10 } }' > ${JUICEBOX_DIR}/interactions.bedpe 2> ${LOG_DIR}/interactions_extract_${JOB_ID}.log check_status "交互提取" # 步骤2: 转换为Juicer格式 echo "[SUBSTEP] 转换为Juicer矩阵..." awk 'BEGIN{OFS="\t"} { # Juicer格式: <str1> <chr1> <pos1> <frag1> <str2> <chr2> <pos2> <frag2> # 计算中点位置 pos1 = int(($2 + $3) / 2) pos2 = int(($5 + $6) / 2) # 输出交互矩阵 print 0, $1, pos1, 0, 0, $4, pos2, 1 }' ${JUICEBOX_DIR}/interactions.bedpe > ${JUICEBOX_DIR}/juicer.2.txt 2> ${LOG_DIR}/juicer_convert_${JOB_ID}.log check_status "Juicer格式转换" # 步骤3: 创建片段文件 echo "[SUBSTEP] 创建片段文件..." awk 'BEGIN{OFS="\t"} { print $1, 0, $2, 1 }' ${JUICEBOX_DIR}/chrom.sizes > ${JUICEBOX_DIR}/juicer.3.txt 2> ${LOG_DIR}/juicer_fragments_${JOB_ID}.log check_status "片段文件创建" # 复制染色体大小文件 cp ${JUICEBOX_DIR}/chrom.sizes ${JUICEBOX_DIR}/juicer.chromsizes fi ########################################################## # 6. 生成.hic文件 ########################################################## echo "[STEP] 生成.hic文件... $(date)" # 排序Juicer文件 echo "[INFO] 排序Juicer输入文件..." sort -S 32G -T ${TMP_DIR} --parallel=${SLURM_NTASKS} \ -k2,2d -k6,6d ${JUICEBOX_DIR}/juicer.2.txt > ${JUICEBOX_DIR}/juicer.2.sorted.txt 2> ${LOG_DIR}/juicer_sort_${JOB_ID}.log || { echo "[WARNING] 排序失败,使用原始文件... $(date)" cp ${JUICEBOX_DIR}/juicer.2.txt ${JUICEBOX_DIR}/juicer.2.sorted.txt } # 使用juicer_tools创建.hic文件 echo "[INFO] 运行juicer_tools..." # 尝试查找juicer_tools路径 if [ -z "$JUICER_TOOLS" ]; then JUICER_TOOLS="java -Xmx32G -jar $(find ${CONDA_PREFIX}/share -name 'juicer_tools*.jar' | head -n1)" fi # 运行juicer_tools $JUICER_TOOLS \ ${JUICEBOX_DIR}/juicer.2.sorted.txt \ ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic \ ${JUICEBOX_DIR}/juicer.chromsizes 2> ${LOG_DIR}/juicer_pre_${JOB_ID}.log || { echo "[ERROR] .hic文件生成失败! $(date)" # 备选方法:使用Yahs的替代方案 echo "[FIX] 尝试Yahs替代方案..." yahs ${REF_GENOME} ${ALIGN_DIR}/aligned.sorted.bam \ -o ${SCAFFOLD_DIR}/yahs_alt \ --output-hic ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic \ 2> ${LOG_DIR}/yahs_alt_${JOB_ID}.log || { echo "[CRITICAL] 所有.hic生成方法均失败! $(date)" exit 1 } } ########################################################## # 7. JBAT模式处理(用于JuiceBox手动矫正) ########################################################## echo "[STEP] JBAT模式处理... $(date)" # 使用Yahs自带的juicer工具进行预处理 echo "[INFO] 运行juicer pre -a (JBAT模式)..." mkdir -p ${JUICEBOX_DIR}/jbat ${PREJUICER} pre -a -o ${JUICEBOX_DIR}/jbat/jbat_out \ ${SCAFFOLD_DIR}/yahs_out.bin \ ${SCAFFOLD_DIR}/yahs_out_scaffolds_final.agp \ ${REF_GENOME}.fai 2> ${LOG_DIR}/juicer_pre_jbat_${JOB_ID}.log || { echo "[WARNING] juicer pre -a 失败,跳过JBAT处理... $(date)" doplot=0 # 如果失败,跳过后续绘图 } # 生成JBAT模式的.hic文件 if [ -f ${JUICEBOX_DIR}/jbat/jbat_out.txt ]; then echo "[INFO] 生成JBAT模式的.hic文件..." # 从日志中获取染色体大小 grep "PRE_C_SIZE" ${LOG_DIR}/juicer_pre_jbat_${JOB_ID}.log | cut -d' ' -f2- > ${JUICEBOX_DIR}/jbat/jbat.chrom.sizes $JUICER_TOOLS \ ${JUICEBOX_DIR}/jbat/jbat_out.txt \ ${JUICEBOX_DIR}/jbat/jbat_out.hic \ ${JUICEBOX_DIR}/jbat/jbat.chrom.sizes 2> ${LOG_DIR}/juicer_jbat_${JOB_ID}.log || { echo "[WARNING] JBAT .hic文件生成失败... $(date)" } fi ########################################################## # 8. 绘图部分 ########################################################## if [ ${doplot} -eq 1 ]; then echo "[STEP] 生成Hi-C图谱... $(date)" # 生成Pretext输入文件 echo "[INFO] 生成Pretext输入文件..." ( awk 'BEGIN{print "## pairs format v1.0"} {print "#chromsize:\t"$1"\t"$2} END{print "#columns:\treadID\tchr1\tpos1\tchr2\tpos2\tstrand1\tstrand2"}' \ ${JUICEBOX_DIR}/chrom.sizes; awk '{print ".\t"$2"\t"$3"\t"$6"\t"$7"\t.\t."}' ${JUICEBOX_DIR}/juicer.2.sorted.txt ) | ${PRETEXT_MAP} -o ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext 2> ${LOG_DIR}/pretext_map_${JOB_ID}.log || { echo "[WARNING] PretextMap 失败... $(date)" } # 生成Pretext快照 if [ -f ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext ]; then echo "[INFO] 生成Pretext快照..." ${PRETEXT_SNAPSHOT} -m ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext \ --sequences "=full" \ -o ${JUICEBOX_DIR} 2> ${LOG_DIR}/pretext_snapshot_${JOB_ID}.log || { echo "[WARNING] PretextSnapshot 失败... $(date)" } fi else echo "[INFO] 绘图功能已关闭,跳过... $(date)" fi ########################################################## # 9. 结果验证与报告 ########################################################## echo "[STEP] 验证结果... $(date)" # 计算运行时间 END_TIME=$(date +%s) RUNTIME=$((END_TIME - START_TIME)) RUNTIME_STR=$(printf "%02d:%02d:%02d" $((RUNTIME/3600)) $((RUNTIME%3600/60)) $((RUNTIME%60))) # 验证.hic文件 VALID_LOG="${LOG_DIR}/validate_${JOB_ID}.log" $JUICER_TOOLS validate ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic > ${VALID_LOG} 2>&1 VALIDATION_EXIT=$? # 生成统计报告 REPORT_FILE="${LOG_DIR}/final_report_${JOB_ID}.txt" echo "==== Chromap Hi-C分析结果报告 ====" > ${REPORT_FILE} echo "任务ID: ${JOB_ID}" >> ${REPORT_FILE} echo "开始时间: $(date -d @${START_TIME})" >> ${REPORT_FILE} echo "结束时间: $(date -d @${END_TIME})" >> ${REPORT_FILE} echo "运行时长: ${RUNTIME_STR}" >> ${REPORT_FILE} echo ".hic文件: ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic" >> ${REPORT_FILE} echo "文件大小: $(du -sh ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic | cut -f1)" >> ${REPORT_FILE} echo "验证结果: $(grep -i 'valid' ${VALID_LOG} | head -1 | cut -d':' -f2-)" >> ${REPORT_FILE} # 添加Yahs输出统计 echo -e "\n==== Yahs Scaffolding 统计 ====" >> ${REPORT_FILE} grep -A 5 "Scaffolding statistics" ${LOG_DIR}/yahs_run_${JOB_ID}.log >> ${REPORT_FILE} # 添加Chromap比对统计 echo -e "\n==== Chromap 比对统计 ====" >> ${REPORT_FILE} grep -A 10 "Summary of mapping" ${LOG_DIR}/chromap_align_${JOB_ID}.log >> ${REPORT_FILE} # 添加.hic文件统计 if [[ ${VALIDATION_EXIT} -eq 0 ]]; then echo -e "\n==== Hi-C 交互统计 ====" >> ${REPORT_FILE} $JUICER_TOOLS stats ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic >> ${REPORT_FILE} 2>> ${LOG_DIR}/stats_${JOB_ID}.log fi # 添加JBAT结果 if [ -f ${JUICEBOX_DIR}/jbat/jbat_out.hic ]; then echo -e "\n==== JBAT 输出 ====" >> ${REPORT_FILE} echo "JBAT .hic文件: ${JUICEBOX_DIR}/jbat/jbat_out.hic" >> ${REPORT_FILE} echo "文件大小: $(du -sh ${JUICEBOX_DIR}/jbat/jbat_out.hic | cut -f1)" >> ${REPORT_FILE} fi # 添加Pretext结果 if [ -f ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext ]; then echo -e "\n==== Pretext 输出 ====" >> ${REPORT_FILE} echo "Pretext地图: ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext" >> ${REPORT_FILE} echo "快照文件: ${JUICEBOX_DIR}/*.png" >> ${REPORT_FILE} fi ########################################################## # 10. 清理与完成 ########################################################## echo "[INFO] 清理临时文件... $(date)" # 保留中间文件供调试 rm -f ${ALIGN_DIR}/aligned.bam rm -f ${JUICEBOX_DIR}/interactions.bedpe rm -f ${JUICEBOX_DIR}/juicer.2.txt rm -f ${JUICEBOX_DIR}/juicer.2.sorted.txt echo "[SUCCESS] Chromap Hi-C分析完成! $(date)" echo "结果文件: ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic" echo "报告文件: ${REPORT_FILE}" ``` ### 主要改进说明: 1. **添加绘图功能开关**: ```bash doplot=1 # 控制是否生成Hi-C图谱 ``` 通过设置这个变量,可以灵活控制是否生成Pretext和Juicer的图谱 2. **增强JBAT处理流程**: ```bash # 使用Yahs自带的juicer工具进行预处理 ${PREJUICER} pre -a -o ${JUICEBOX_DIR}/jbat/jbat_out ... ``` 添加了完整的JBAT预处理步骤,生成可用于JuiceBox手动矫正的文件 3. **多分辨率支持**: ```bash # 根据基因组大小自动选择分辨率 GENOME_SIZE=$(stat -c%s ${REF_GENOME}) if [ ${GENOME_SIZE} -lt 100000000 ]; then RES_OPTIONS="-r 1000,2000,5000,10000,20000,50000,100000,200000,500000" fi ``` 针对小型基因组(如LYZE01)启用高分辨率参数 4. **绘图工具集成**: ```bash # 生成Pretext输入文件 (awk ... ) | ${PRETEXT_MAP} -o ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext # 生成Pretext快照 ${PRETEXT_SNAPSHOT} -m ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext ... ``` 添加了Pretext地图生成功能,可以生成PNG格式的Hi-C图谱快照 5. **错误处理优化**: - 每个关键步骤都有错误检查和备用方案 - 添加了详细的警告信息 - 跳过失败步骤而不是退出整个流程 6. **报告增强**: - 添加了JBAT和Pretext的结果报告 - 包括输出文件路径和大小 - 验证.hic文件的完整性 ### 使用说明: 1. **配置绘图工具路径**: 在脚本开头的配置部分设置正确的工具路径: ```bash JUICER_TOOLS="java -Xmx32G -jar /bin/juicer_tools.1.9.9_jcuda.0.8.jar pre" PREJUICER_TOOLS="${RESULTS_DIR}/juicer" PRETEXT_MAP="/bin/PretextMap" PRETEXT_SNAPSHOT="/bin/PretextSnapshot" ``` 2. **控制绘图功能**: 通过修改 `doplot` 变量(0/1)控制是否生成Hi-C图谱 3. **运行流程**: ```bash sbatch hic_pipeline.sh ``` 4. **结果查看**: - 主要结果:`${JUICEBOX_DIR}/aligned_${JOB_ID}.hic` - JBAT结果:`${JUICEBOX_DIR}/jbat/jbat_out.hic` - Pretext图谱:`${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext` 和PNG快照 - 报告文件:`${LOG_DIR}/final_report_${JOB_ID}.txt` ### 注意事项: 1. 确保所有依赖工具已正确安装并配置路径 2. Pretext工具需要GPU加速,如果环境不支持可能运行缓慢 3. 对于超大型基因组,可能需要增加内存和时间限制 4. JBAT处理需要额外的计算资源,如果资源不足可关闭绘图功能 §§相关问题§§: 1. 如何在无GPU环境下优化Pretext的运行时性能? 2. 当基因组超过10Gb时,这个流程需要做哪些调整? 3. 如何解释JBAT生成的.review.assembly文件结构? 4. 有哪些方法可以进一步提高Yahs scaffolder的准确性? </think> 我将根据您提供的参考脚本,对您的Hi-C分析脚本进行优化和增强。主要改进包括添加绘图功能、优化JBAT处理流程、集成多分辨率支持以及增强错误处理。 ```bash #!/bin/bash #SBATCH -p amd_256 #SBATCH -N 1 #SBATCH -n 32 #SBATCH -t 96:00:00 #SBATCH --mem=256G #SBATCH -J hic_chromap #SBATCH -o /public1/home/scb3201/rawdata/Hic/logs/%j_chromap_out.log #SBATCH -e /public1/home/scb3201/rawdata/Hic/logs/%j_chromap_err.log ########################################################## # 1. 环境设置与路径定义 ########################################################## echo "[INFO] 开始Chromap Hi-C分析流程... $(date)" # 记录开始时间 START_TIME=$(date +%s) # 激活Conda环境 source ~/.bashrc conda activate Hic || { echo "[ERROR] Hic环境激活失败! $(date)" exit 1 } # 核心路径 BASE_DIR="/public1/home/scb3201/rawdata/Hic" RESULTS_DIR="/public1/home/scb3201/results/genome/Hic" REF_GENOME="/public1/home/scb3201/results/genome/Medaka/genome.nextpolish.medaka.fasta" READ1="${BASE_DIR}/reads/hic_R1.fastq.gz" READ2="${BASE_DIR}/reads/hic_R2.fastq.gz" # 子目录 ALIGN_DIR="${RESULTS_DIR}/alignment" SCAFFOLD_DIR="${RESULTS_DIR}/scaffold" JUICEBOX_DIR="${RESULTS_DIR}/juicebox" LOG_DIR="${BASE_DIR}/logs" TMP_DIR="${BASE_DIR}/tmp" # 创建目录 mkdir -p ${ALIGN_DIR} ${SCAFFOLD_DIR} ${JUICEBOX_DIR} ${LOG_DIR} ${TMP_DIR} # 设置临时目录 export TMPDIR=${TMP_DIR} export _JAVA_OPTIONS="-Xmx200G -Xms64G" # 获取任务ID用于日志命名 JOB_ID=${SLURM_JOB_ID:-manual_$(date +%s)} # 绘图开关 (0:关闭, 1:开启) doplot=1 # 绘图工具路径 (请根据实际安装路径修改) JUICER_TOOLS_PATH="/bin/juicer_tools.1.9.9_jcuda.0.8.jar" PRETEXT_MAP="/bin/PretextMap" PRETEXT_SNAPSHOT="/bin/PretextSnapshot" SAMTOOLS="/bin/samtools" # 设置juicer_tools命令 JUICER_TOOLS="java -Xmx32G -jar ${JUICER_TOOLS_PATH} pre" ########################################################## # 2. 参考基因组准备 ########################################################## echo "[STEP] 准备参考基因组... $(date)" # 函数:检查步骤执行状态 check_status() { if [ $? -ne 0 ]; then echo "[ERROR] $1 失败! $(date)" exit 1 fi } # 生成Chromap索引 if [[ ! -f ${REF_GENOME}.chromap.idx ]]; then echo "[INFO] 创建Chromap索引..." chromap --preset hic -i -r ${REF_GENOME} -o ${REF_GENOME}.chromap.idx 2> ${LOG_DIR}/chromap_index_${JOB_ID}.log check_status "Chromap索引创建" else echo "[INFO] 已存在Chromap索引,跳过创建" fi # 生成染色体长度文件 if [[ ! -f ${REF_GENOME}.fai ]]; then echo "[INFO] 创建参考基因组索引..." samtools faidx ${REF_GENOME} 2> ${LOG_DIR}/samtools_faidx_${JOB_ID}.log check_status "参考基因组索引创建" else echo "[INFO] 已存在参考基因组索引,跳过创建" fi # 创建染色体大小文件 cut -f1,2 ${REF_GENOME}.fai > ${JUICEBOX_DIR}/chrom.sizes 2> ${LOG_DIR}/chrom_sizes_${JOB_ID}.log check_status "染色体大小文件生成" ########################################################## # 3. Hi-C数据比对(使用Chromap) ########################################################## echo "[STEP] 使用Chromap比对Hi-C reads... $(date)" # Chromap比对参数说明: # --preset hic: 使用Hi-C专用预设参数 # -t: 线程数 # -x: 索引文件 # -1/-2: 双端测序文件 # -o: 输出BAM文件 # --low-mem: 低内存模式(适用于大基因组) # --remove-pcr-duplicates: 移除PCR重复 chromap \ --preset hic \ -t ${SLURM_NTASKS} \ -x ${REF_GENOME}.chromap.idx \ -1 ${READ1} \ -2 ${READ2} \ -o ${ALIGN_DIR}/aligned.bam \ --low-mem \ --remove-pcr-duplicates \ 2> ${LOG_DIR}/chromap_align_${JOB_ID}.log check_status "Chromap比对" # 排序BAM文件 echo "[INFO] 排序比对结果..." samtools sort -@ ${SLURM_NTASKS} -m 4G -T ${TMP_DIR}/sort_temp -o ${ALIGN_DIR}/aligned.sorted.bam ${ALIGN_DIR}/aligned.bam 2> ${LOG_DIR}/samtools_sort_${JOB_ID}.log check_status "BAM排序" # 索引BAM文件 samtools index ${ALIGN_DIR}/aligned.sorted.bam 2> ${LOG_DIR}/samtools_index_${JOB_ID}.log || { echo "[WARNING] BAM索引创建失败,继续执行... $(date)" } ########################################################## # 4. 使用Yahs进行Scaffolding ########################################################## echo "[STEP] Yahs scaffolding... $(date)" # 运行Yahs - 根据基因组大小选择分辨率 echo "[INFO] 运行Yahs scaffolding..." # 获取基因组大小 GENOME_SIZE=$(stat -c%s ${REF_GENOME}) if [ ${GENOME_SIZE} -lt 100000000 ]; then # 小于100Mb RES_OPTIONS="-r 1000,2000,5000,10000,20000,50000,100000,200000,500000" echo "[INFO] 小型基因组检测 (${GENOME_SIZE} bp),启用高分辨率参数: ${RES_OPTIONS}" else RES_OPTIONS="" fi yahs ${RES_OPTIONS} \ -o ${SCAFFOLD_DIR}/yahs_out \ ${REF_GENOME} \ ${ALIGN_DIR}/aligned.sorted.bam 2> ${LOG_DIR}/yahs_run_${JOB_ID}.log check_status "Yahs运行" ########################################################## # 5. 转换为Juicer格式 ########################################################## echo "[STEP] 转换为Juicer格式... $(date)" # 使用Yahs内置转换功能 echo "[METHOD] 使用Yahs内置转换..." yahs juicer \ ${SCAFFOLD_DIR}/yahs_out.bin \ ${JUICEBOX_DIR}/chrom.sizes \ ${SCAFFOLD_DIR}/yahs_out_scaffolds_final.agp \ ${REF_GENOME} \ ${JUICEBOX_DIR}/juicer 2> ${LOG_DIR}/yahs_juicer_${JOB_ID}.log if [[ $? -ne 0 ]]; then # 备用方法:使用samtools和awk处理 echo "[METHOD] Yahs内置转换失败,使用备用转换方法..." # 步骤1: 提取有效交互 echo "[SUBSTEP] 提取有效交互..." samtools view -h ${SCAFFOLD_DIR}/yahs_out.bin | \ awk 'BEGIN{OFS="\t"} { if($1 ~ /^@/) { next } # 跳过头部 if($7 != "=") { next } # 只保留正常配对的reads split($1, a, "/"); read_id = a[1] if(read_id == prev_id) { # 输出成对reads print prev_chr, prev_pos, prev_pos + length(prev_seq), $3, $4, $4 + length($10) read_id = "" # 避免重复输出 } else { # 保存当前read信息 prev_id = read_id prev_chr = $3 prev_pos = $4 prev_seq = $10 } }' > ${JUICEBOX_DIR}/interactions.bedpe 2> ${LOG_DIR}/interactions_extract_${JOB_ID}.log check_status "交互提取" # 步骤2: 转换为Juicer格式 echo "[SUBSTEP] 转换为Juicer矩阵..." awk 'BEGIN{OFS="\t"} { # Juicer格式: <str1> <chr1> <pos1> <frag1> <str2> <chr2> <pos2> <frag2> # 计算中点位置 pos1 = int(($2 + $3) / 2) pos2 = int(($5 + $6) / 2) # 输出交互矩阵 print 0, $1, pos1, 0, 0, $4, pos2, 1 }' ${JUICEBOX_DIR}/interactions.bedpe > ${JUICEBOX_DIR}/juicer.2.txt 2> ${LOG_DIR}/juicer_convert_${JOB_ID}.log check_status "Juicer格式转换" # 步骤3: 创建片段文件 echo "[SUBSTEP] 创建片段文件..." awk 'BEGIN{OFS="\t"} { print $1, 0, $2, 1 }' ${JUICEBOX_DIR}/chrom.sizes > ${JUICEBOX_DIR}/juicer.3.txt 2> ${LOG_DIR}/juicer_fragments_${JOB_ID}.log check_status "片段文件创建" # 复制染色体大小文件 cp ${JUICEBOX_DIR}/chrom.sizes ${JUICEBOX_DIR}/juicer.chromsizes fi ########################################################## # 6. 生成.hic文件 ########################################################## echo "[STEP] 生成.hic文件... $(date)" # 排序Juicer文件 echo "[INFO] 排序Juicer输入文件..." sort -S 32G -T ${TMP_DIR} --parallel=${SLURM_NTASKS} \ -k2,2d -k6,6d ${JUICEBOX_DIR}/juicer.2.txt > ${JUICEBOX_DIR}/juicer.2.sorted.txt 2> ${LOG_DIR}/juicer_sort_${JOB_ID}.log || { echo "[WARNING] 排序失败,使用原始文件... $(date)" cp ${JUICEBOX_DIR}/juicer.2.txt ${JUICEBOX_DIR}/juicer.2.sorted.txt } # 使用juicer_tools创建.hic文件 echo "[INFO] 运行juicer_tools..." $JUICER_TOOLS \ ${JUICEBOX_DIR}/juicer.2.sorted.txt \ ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic \ ${JUICEBOX_DIR}/juicer.chromsizes 2> ${LOG_DIR}/juicer_pre_${JOB_ID}.log || { echo "[ERROR] .hic文件生成失败! $(date)" # 备选方法:使用Yahs的替代方案 echo "[FIX] 尝试Yahs替代方案..." yahs ${REF_GENOME} ${ALIGN_DIR}/aligned.sorted.bam \ -o ${SCAFFOLD_DIR}/yahs_alt \ --output-hic ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic \ 2> ${LOG_DIR}/yahs_alt_${JOB_ID}.log || { echo "[CRITICAL] 所有.hic生成方法均失败! $(date)" exit 1 } } ########################################################## # 7. JBAT模式处理(用于JuiceBox手动矫正) ########################################################## echo "[STEP] JBAT模式处理... $(date)" # 创建JBAT工作目录 JBAT_DIR="${JUICEBOX_DIR}/jbat" mkdir -p ${JBAT_DIR} # 步骤1: 运行juicer pre -a echo "[INFO] 运行juicer pre -a (JBAT模式)..." ${SCAFFOLD_DIR}/juicer pre -a -o ${JBAT_DIR}/jbat_out \ ${SCAFFOLD_DIR}/yahs_out.bin \ ${SCAFFOLD_DIR}/yahs_out_scaffolds_final.agp \ ${REF_GENOME}.fai 2> ${JBAT_DIR}/juicer_pre_JBAT.log || { echo "[WARNING] juicer pre -a 失败,跳过JBAT处理... $(date)" doplot=0 # 如果失败,跳过后续绘图 } # 步骤2: 准备染色体大小文件 if [ -f ${JBAT_DIR}/juicer_pre_JBAT.log ]; then grep "PRE_C_SIZE" ${JBAT_DIR}/juicer_pre_JBAT.log | cut -d' ' -f2- > ${JBAT_DIR}/jbat.chrom.sizes fi # 步骤3: 生成JBAT模式的.hic文件 if [ -f ${JBAT_DIR}/jbat_out.txt ] && [ -f ${JBAT_DIR}/jbat.chrom.sizes ]; then echo "[INFO] 生成JBAT模式的.hic文件..." $JUICER_TOOLS \ ${JBAT_DIR}/jbat_out.txt \ ${JBAT_DIR}/jbat_out.hic \ ${JBAT_DIR}/jbat.chrom.sizes 2> ${JBAT_DIR}/juicer_jbat.log || { echo "[WARNING] JBAT .hic文件生成失败... $(date)" } fi # 步骤4: 运行juicer post生成最终组装 if [ -f ${JBAT_DIR}/jbat_out.review.assembly ]; then echo "[INFO] 运行juicer post生成最终组装..." ${SCAFFOLD_DIR}/juicer post ${JBAT_DIR}/jbat_out.review.assembly \ ${JBAT_DIR}/jbat_out.liftover.agp \ ${REF_GENOME} \ -o ${JBAT_DIR}/jbat_final 2> ${JBAT_DIR}/juicer_post.log || { echo "[WARNING] juicer post 失败... $(date)" } fi ########################################################## # 8. 绘图部分 ########################################################## if [ ${doplot} -eq 1 ]; then echo "[STEP] 生成Hi-C图谱... $(date)" # 1. 生成Pretext输入文件 echo "[INFO] 生成Pretext输入文件..." ( awk 'BEGIN{print "## pairs format v1.0"} {print "#chromsize:\t"$1"\t"$2} END{print "#columns:\treadID\tchr1\tpos1\tchr2\tpos2\tstrand1\tstrand2"}' \ ${JUICEBOX_DIR}/chrom.sizes; awk '{print ".\t"$2"\t"$3"\t"$6"\t"$7"\t.\t."}' ${JUICEBOX_DIR}/juicer.2.sorted.txt ) | ${PRETEXT_MAP} -o ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext 2> ${LOG_DIR}/pretext_map_${JOB_ID}.log || { echo "[WARNING] PretextMap 失败... $(date)" } # 2. 生成Pretext快照 if [ -f ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext ]; then echo "[INFO] 生成Pretext快照..." ${PRETEXT_SNAPSHOT} -m ${JUICEBOX_DIR}/aligned_${JOB_ID}.pretext \ --sequences "=full" \ -o ${JUICEBOX_DIR} 2> ${LOG_DIR}/pretext_snapshot_${JOB_ID}.log || { echo "[WARNING] PretextSnapshot 失败... $(date)" } fi else echo "[INFO] 绘图功能已关闭,跳过... $(date)" fi ########################################################## # 9. 结果验证与报告 ########################################################## echo "[STEP] 验证结果... $(date)" # 计算运行时间 END_TIME=$(date +%s) RUNTIME=$((END_TIME - START_TIME)) RUNTIME_STR=$(printf "%02d:%02d:%02d" $((RUNTIME/3600)) $((RUNTIME%3600/60)) $((RUNTIME%60))) # 验证.hic文件 VALID_LOG="${LOG_DIR}/validate_${JOB_ID}.log" java -jar ${JUICER_TOOLS_PATH} validate ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic > ${VALID_LOG} 2>&1 VALIDATION_EXIT=$? # 生成统计报告 REPORT_FILE="${LOG_DIR}/final_report_${JOB_ID}.txt" echo "==== Chromap Hi-C分析结果报告 ====" > ${REPORT_FILE} echo "任务ID: ${JOB_ID}" >> ${REPORT_FILE} echo "开始时间: $(date -d @${START_TIME})" >> ${REPORT_FILE} echo "结束时间: $(date -d @${END_TIME})" >> ${REPORT_FILE} echo "运行时长: ${RUNTIME_STR}" >> ${REPORT_FILE} echo ".hic文件: ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic" >> ${REPORT_FILE} echo "文件大小: $(du -sh ${JUICEBOX_DIR}/aligned_${JOB_ID}.hic | cut
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值