帮我优化检查脚本”#!/bin/bash
#SBATCH -p amd_256 # 集群分区(根据实际调整)
#SBATCH -N 1 # 单节点运行
#SBATCH -n 32 # 线程数(与集群资源匹配)
#SBATCH -t 24:00:00 # 运行时间(可调整)
#SBATCH --mem=64G # 内存(可调整)
#SBATCH -J juicebox_hic # 作业名
#SBATCH -o /public1/home/scb3201/rawdata/Hic/logs/%j_hic_out.log
#SBATCH -e /public1/home/scb3201/rawdata/Hic/logs/%j_hic_err.log
##########################################################
# 1. 环境激活与关键工具检查
##########################################################
source ~/.bashrc && conda activate Hic || { echo "[ERROR] Hic环境激活失败!"; exit 1; }
# 检查核心工具是否存在
which samtools >/dev/null 2>&1 || { echo "[ERROR] samtools未安装在Hic环境中"; exit 1; }
which bedtools >/dev/null 2>&1 || { echo "[ERROR] bedtools未安装在Hic环境中"; exit 1; }
which juicer_tools >/dev/null 2>&1 || { echo "[ERROR] juicer_tools未安装在Hic环境中"; exit 1; }
##########################################################
# 2. 路径定义与必要目录创建
##########################################################
# 固定路径(根据用户实际路径保持不变)
REF_GENOME="/public1/home/scb3201/results/genome/Medaka/genome.nextpolish.medaka.fasta"
ALIGN_DIR="/public1/home/scb3201/results/genome/Hic/alignment"
SCAFFOLD_DIR="/public1/home/scb3201/results/genome/Hic/scaffold"
JUICEBOX_DIR="/public1/home/scb3201/results/genome/Hic/juicebox"
LOG_DIR="/public1/home/scb3201/rawdata/Hic/logs"
TMP_DIR="/public1/home/scb3201/rawdata/Hic/tmp" # 临时目录(避免磁盘空间不足)
# 创建所有依赖目录(确保不存在时自动创建)
mkdir -p ${LOG_DIR} ${JUICEBOX_DIR} ${ALIGN_DIR} ${SCAFFOLD_DIR} ${TMP_DIR} || { echo "[ERROR] 目录创建失败"; exit 1; }
##########################################################
# 3. 生成染色体大小文件(chrom.sizes)
##########################################################
echo "[INFO] 生成染色体大小文件..."
if [ ! -f "${REF_GENOME}.fai" ]; then
samtools faidx ${REF_GENOME} 2>> ${LOG_DIR}/chrom_sizes.log || { echo "[ERROR] 参考基因组索引生成失败!"; exit 1; }
fi
cut -f1,2 ${REF_GENOME}.fai > ${JUICEBOX_DIR}/chrom.sizes 2>> ${LOG_DIR}/chrom_sizes.log || { echo "[ERROR] chrom.sizes生成失败!"; exit 1; }
#重命名为medaka_genome.chrom.sizes(匹配基因组ID)
cp ${JUICEBOX_DIR}/chrom.sizes ${JUICEBOX_DIR}/medaka_genome.chrom.sizes || { echo "[ERROR] 染色体大小文件重命名失败"; exit 1; }
#########################################################
# 4. 检查输入BAM文件有效性
##########################################################
echo "[INFO] 检查输入BAM文件..."
INPUT_BAM="${ALIGN_DIR}/aligned_sorted.bam"
if [ ! -f "${INPUT_BAM}" ]; then
echo "[ERROR] 输入BAM文件不存在:${INPUT_BAM}"; exit 1;
fi
##########################################################
# 5. BAM转bedpe格式(juicer_tools输入要求)
##########################################################
echo "[INFO] BAM转换为bedpe格式..."
BEDPE_FILE="${JUICEBOX_DIR}/aligned.bedpe"
bedtools bamtobed -bedpe -i ${INPUT_BAM} > ${BEDPE_FILE} 2>> ${LOG_DIR}/bam_to_bedpe.log || { echo "[ERROR] BAM转bedpe失败!"; exit 1; }
##########################################################
# 6. 生成DPNII限制酶位点文件(适配用户实验酶)
##########################################################
echo "[INFO] 生成DPNII酶位点文件(识别位点:GATC)..."
ENZYME_FILE="${JUICEBOX_DIR}/dpnII_sites.txt"
echo -e "chrom\tsite\tstrand" > ${ENZYME_FILE} # 符合juicer_tools格式要求
# 提取参考基因组中所有GATC位点的字节偏移量(作为位点位置)
samtools faidx ${REF_GENOME} | cut -f1 | while read chr; do
grep -b -o "GATC" ${REF_GENOME} | awk -v chr=${chr} '{print chr"\t"$1"\t+"}' >> ${ENZYME_FILE}
done 2>> ${LOG_DIR}/dpnII_sites.log || { echo "[WARNING] DPNII位点文件生成失败(非必需,可跳过)"; }
echo "[INFO] 生成hic文件(DPNII酶适配+染色体文件修正)..."
# 切换到JUICEBOX_DIR目录,确保染色体大小文件被找到
cd ${JUICEBOX_DIR} || { echo "[ERROR] 无法切换到JUICEBOX_DIR"; exit 1; }
if [ ! -f "medaka_genome.chrom.sizes" ]; then echo "[ERROR] 染色体文件缺失"; exit1; fi
if [ ! -r "medaka_genome.chrom.sizes" ]; then echo "[ERROR] 文件无读权限"; exit1; fi
##########################################################
# 7. juicer_tools生成hic文件(核心参数修正)
##########################################################
echo "[INFO] 生成hic文件(DPNII酶适配版)..."
juicer_tools pre \
--threads ${SLURM_NTASKS} \
-t ${TMP_DIR} \
-f dpnII_sites.txt \
aligned.bedpe \
medaka_hic_final.hic \
medaka_genome \
2>> ${LOG_DIR}/juicer_tools_pre.log || { echo "[ERROR] hic文件生成失败!"; exit 1; }
# 正确指定线程数(利用集群资源)
# 临时目录(避免磁盘溢出)
# DPNII酶位点文件(提升结果分辨率)
# 输入bedpe文件
# 输出hic文件
# 自定义基因组ID(唯一即可)
##########################################################
# 8. 清理临时文件(释放磁盘空间)
##########################################################
echo "[INFO] 清理临时文件..."
rm -f ${BEDPE_FILE} 2>> ${LOG_DIR}/cleanup.log || { echo "[WARNING] 临时文件清理失败,不影响结果"; }
##########################################################
# 9. MultiQC报告生成(含BAM统计信息)
##########################################################
echo "[INFO] 生成MultiQC分析报告..."
MULTIQC_DIR="${JUICEBOX_DIR}/multiqc_report"
mkdir -p ${MULTIQC_DIR} || { echo "[ERROR] MultiQC目录创建失败"; exit 1; }
# 添加BAM统计信息(让报告有实际内容)
samtools stats ${INPUT_BAM} > ${LOG_DIR}/bam_stats.txt 2>&1
# 生成报告
multiqc ${LOG_DIR} -o ${MULTIQC_DIR} --title "HI-C Analysis Report" --filename "hic_report.html" 2>> ${LOG_DIR}/multiqc.log || { echo "[WARNING] MultiQC报告生成失败,结果保留"; }
##########################################################
# 运行完成提示
##########################################################
echo "[SUCCESS] 全流程运行完成!"
echo "结果文件路径:"
echo "1. hic文件:${JUICEBOX_DIR}/medaka_hic_final.hic"
echo "2. MultiQC报告:${MULTIQC_DIR}/hic_report.html"
echo "3. 染色体大小文件:${JUICEBOX_DIR}/chrom.sizes"
“我提供参考内容”juicebox_assembly_converter.py
usage: juicebox_assembly_converter.py [-h] -a ASSEMBLY -f FASTA [-p PREFIX] [-c] [-s] [-v]
juicebox_assembly_converter.py: error: the following arguments are required: -a/--assembly, -f/--fasta
(Hic) [scb3201@ln134%bscc-a6 Hic]$ python degap_assembly.py
python: can't open file '/public1/home/scb3201/rawdata/Hic/degap_assembly.py': [Errno 2] No such file or directory
(Hic) [scb3201@ln134%bscc-a6 Hic]$ degap_assembly.py
Traceback (most recent call last):
File "/public1/home/scb3201/anaconda3/envs/Hic/bin/degap_assembly.py", line 6, in <module>
with open(sys.argv[1]) as file:
IndexError: list index out of range
(Hic) [scb3201@ln134%bscc-a6 Hic]$ makeAgpFromFasta.py
makeAgpFromFasta.py usage: makeAgpFromFasta.py <fasta_file> <agp_out_file>
(Hic) [scb3201@ln134%bscc-a6 Hic]$ juicebox_assembly_converter.py -h
usage: juicebox_assembly_converter.py [-h] -a ASSEMBLY -f FASTA [-p PREFIX] [-c] [-s] [-v]
optional arguments:
-h, --help show this help message and exit
-a ASSEMBLY, --assembly ASSEMBLY
juicebox assembly file
-f FASTA, --fasta FASTA
the fasta file
-p PREFIX, --prefix PREFIX
the prefix to use for writing outputs. Default: the assembly file,
minus the file extension
-c, --contig_mode ignore scaffold specification and just output contigs. useful when
only trying to obtain a fasta reflecting juicebox breaks. Default:
False
-s, --simple_chr_names
use simple chromosome names ("ChromosomeX") for scaffolds instead
of detailed chromosome names
("PGA_scaffold_X__Y_contigs__length_Z"). Has no effect in
contig_mode.
-v, --verbose print summary of processing steps to stdout, otherwise silent.
Default: True
(Hic) [scb3201@ln134%bscc-a6 Hic]$ bwa -h
[main] unrecognized command '-h'
(Hic) [scb3201@ln134%bscc-a6 Hic]$ bwa
Program: bwa (alignment via Burrows-Wheeler transformation)
Version: 0.7.19-r1273
Contact: Heng Li <hli@ds.dfci.harvard.edu>
Usage: bwa <command> [options]
Command: index index sequences in the FASTA format
mem BWA-MEM algorithm
fastmap identify super-maximal exact matches
pemerge merge overlapping paired ends (EXPERIMENTAL)
aln gapped/ungapped alignment
samse generate alignment (single ended)
sampe generate alignment (paired ended)
bwasw BWA-SW for long queries (DEPRECATED)
shm manage indices in shared memory
fa2pac convert FASTA to PAC format
pac2bwt generate BWT from PAC
pac2bwtgen alternative algorithm for generating BWT
bwtupdate update .bwt to the new format
bwt2sa generate SA from BWT and Occ
Note: To use BWA, you need to first index the genome with `bwa index'.
There are three alignment algorithms in BWA: `mem', `bwasw', and
`aln/samse/sampe'. If you are not sure which to use, try `bwa mem'
first. Please `man ./bwa.1' for the manual.
(Hic) [scb3201@ln134%bscc-a6 Hic]$ samtools
Program: samtools (Tools for alignments in the SAM format)
Version: 1.22.1 (using htslib 1.22.1)
Usage: samtools <command> [options]
Commands:
-- Indexing
dict create a sequence dictionary file
faidx index/extract FASTA
fqidx index/extract FASTQ
index index alignment
-- Editing
calmd recalculate MD/NM tags and '=' bases
fixmate fix mate information
reheader replace BAM header
targetcut cut fosmid regions (for fosmid pool only)
addreplacerg adds or replaces RG tags
markdup mark duplicates
ampliconclip clip oligos from the end of reads
-- File operations
collate shuffle and group alignments by name
cat concatenate BAMs
consensus produce a consensus Pileup/FASTA/FASTQ
merge merge sorted alignments
mpileup multi-way pileup
sort sort alignment file
split splits a file by read group
quickcheck quickly check if SAM/BAM/CRAM file appears intact
fastq converts a BAM to a FASTQ
fasta converts a BAM to a FASTA
import Converts FASTA or FASTQ files to SAM/BAM/CRAM
reference Generates a reference from aligned data
reset Reverts aligner changes in reads
-- Statistics
bedcov read depth per BED region
coverage alignment depth and percent coverage
depth compute the depth
flagstat simple stats
idxstats BAM index stats
cram-size list CRAM Content-ID and Data-Series sizes
phase phase heterozygotes
stats generate stats (former bamcheck)
ampliconstats generate amplicon specific stats
checksum produce order-agnostic checksums of sequence content
-- Viewing
flags explain BAM flags
head header viewer
tview text alignment viewer
view SAM<->BAM<->CRAM conversion
depad convert padded BAM to unpadded BAM
samples list the samples in a set of SAM/BAM/CRAM files
-- Misc
help [cmd] display this help message or help for [cmd]
version detailed version information
(Hic) [scb3201@ln134%bscc-a6 Hic]$ bedtools
bedtools is a powerful toolset for genome arithmetic.
Version: v2.31.1
About: developed in the quinlanlab.org and by many contributors worldwide.
Docs: http://bedtools.readthedocs.io/
Code: https://github.com/arq5x/bedtools2
Mail: https://groups.google.com/forum/#!forum/bedtools-discuss
Usage: bedtools <subcommand> [options]
The bedtools sub-commands include:
[ Genome arithmetic ]
intersect Find overlapping intervals in various ways.
window Find overlapping intervals within a window around an interval.
closest Find the closest, potentially non-overlapping interval.
coverage Compute the coverage over defined intervals.
map Apply a function to a column for each overlapping interval.
genomecov Compute the coverage over an entire genome.
merge Combine overlapping/nearby intervals into a single interval.
cluster Cluster (but don't merge) overlapping/nearby intervals.
complement Extract intervals _not_ represented by an interval file.
shift Adjust the position of intervals.
subtract Remove intervals based on overlaps b/w two files.
slop Adjust the size of intervals.
flank Create new intervals from the flanks of existing intervals.
sort Order the intervals in a file.
random Generate random intervals in a genome.
shuffle Randomly redistribute intervals in a genome.
sample Sample random records from file using reservoir sampling.
spacing Report the gap lengths between intervals in a file.
annotate Annotate coverage of features from multiple files.
[ Multi-way file comparisons ]
multiinter Identifies common intervals among multiple interval files.
unionbedg Combines coverage intervals from multiple BEDGRAPH files.
[ Paired-end manipulation ]
pairtobed Find pairs that overlap intervals in various ways.
pairtopair Find pairs that overlap other pairs in various ways.
[ Format conversion ]
bamtobed Convert BAM alignments to BED (& other) formats.
bedtobam Convert intervals to BAM records.
bamtofastq Convert BAM records to FASTQ records.
bedpetobam Convert BEDPE intervals to BAM records.
bed12tobed6 Breaks BED12 intervals into discrete BED6 intervals.
[ Fasta manipulation ]
getfasta Use intervals to extract sequences from a FASTA file.
maskfasta Use intervals to mask sequences from a FASTA file.
nuc Profile the nucleotide content of intervals in a FASTA file.
[ BAM focused tools ]
multicov Counts coverage from multiple BAMs at specific intervals.
tag Tag BAM alignments based on overlaps with interval files.
[ Statistical relationships ]
jaccard Calculate the Jaccard statistic b/w two sets of intervals.
reldist Calculate the distribution of relative distances b/w two files.
fisher Calculate Fisher statistic b/w two feature files.
[ Miscellaneous tools ]
overlap Computes the amount of overlap from two intervals.
igv Create an IGV snapshot batch script.
links Create a HTML page of links to UCSC locations.
makewindows Make interval "windows" across a genome.
groupby Group by common cols. & summarize oth. cols. (~ SQL "groupBy")
expand Replicate lines based on lists of values in columns.
split Split a file into multiple files with equal records or base pairs.
summary Statistical summary of intervals in a file.
[ General Parameters ]
--cram-ref Reference used by a CRAM input
[ General help ]
--help Print this help menu.
--version What version of bedtools are you using?.
--contact Feature requests, bugs, mailing lists, etc.
(Hic) [scb3201@ln134%bscc-a6 Hic]$ assembly-stats
usage: stats [options] <list of fasta/q files>
Reports sequence length statistics from fasta and/or fastq files
options:
-l <int>
Minimum length cutoff for each sequence.
Sequences shorter than the cutoff will be ignored [1]
-s
Print 'grep friendly' output
-t
Print tab-delimited output
-u
Print tab-delimited output with no header line
-v
Print version and exit
(Hic) [scb3201@ln134%bscc-a6 Hic]$ juicer
Usage: juicer <command> <arguments>
Version: 1.2.2
Command: pre generate files compatible with Juicebox toolset
post generate assembly files after Juicebox curation
(Hic) [scb3201@ln134%bscc-a6 Hic]$ juicertools
bash: juicertools: command not found...
(Hic) [scb3201@ln134%bscc-a6 Hic]$ juicer_tools
WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.
WARN [2025-12-06T22:43:06,303] [Globals.java:138] [main] Development mode is enabled
Juicer Tools Version 1.22.01
Usage:
dump <observed/oe> <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize> [outfile]
dump <norm/expected> <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr> <BP/FRAG> <binsize> [outfile]
dump <loops/domains> <hicFile URL> [outfile]
pre [options] <infile> <outfile> <genomeID>
addNorm <input_HiC_file> [input_vector_file]
pearsons [-p] <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr> <BP/FRAG> <binsize> [outfile]
eigenvector -p <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr> <BP/FRAG> <binsize> [outfile]
apa <hicFile(s)> <PeaksFile> <SaveFolder>
arrowhead <hicFile(s)> <output_file>
hiccups <hicFile> <outputDirectory>
hiccupsdiff <firstHicFile> <secondHicFile> <firstLoopList> <secondLoopList> <outputDirectory>
validate <hicFile>
-h, --help print help
-v, --verbose verbose mode
-V, --version print version
Type juicer_tools <commandName> for more detailed usage instructions
(Hic) [scb3201@ln134%bscc-a6 Hic]$ juicer_tools pre
WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.
WARN [2025-12-06T22:43:17,904] [Globals.java:138] [main] Development mode is enabled
Usage: juicer_tools pre [options] <infile> <outfile> <genomeID>
: -d only calculate intra chromosome (diagonal) [false]
: -f <restriction site file> calculate fragment map
: -m <int> only write cells with count above threshold m [0]
: -q <int> filter by MAPQ score greater than or equal to q [not set]
: -c <chromosome ID> only calculate map on specific chromosome [not set]
: -r <comma-separated list of resolutions> Only calculate specific resolutions [not set]
: -t <tmpDir> Set a temporary directory for writing
: -s <statistics file> Add the text statistics file to the Hi-C file header
: -g <graphs file> Add the text graphs file to the Hi-C file header
: -n Don't normalize the matrices
: -z <double> scale factor for hic file
: -a <1, 2, 3, 4, 5> filter based on inner, outer, left-left, right-right, tandem pairs respectively
: --randomize_position randomize positions between fragment sites
: --random_seed <long> for seeding random number generator
: --frag_site_maps <fragment site files> for randomization
: -k normalizations to include
: -j number of CPU threads to use
: --threads <int> number of threads
: --mndindex <filepath> to mnd chr block indices
(Hic) [scb3201@ln134%bscc-a6 Hic]$ juicer_tools dump
WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.
WARN [2025-12-06T22:44:15,812] [Globals.java:138] [main] Development mode is enabled
Usage: juicer_tools dump <observed/oe> <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr1>[:x1:x2] <chr2>[:y1:y2] <BP/FRAG> <binsize> [outfile]
dump <norm/expected> <NONE/VC/VC_SQRT/KR> <hicFile(s)> <chr> <BP/FRAG> <binsize> [outfile]
dump <loops/domains> <hicFile URL> [outfile]“
最新发布