1、c处理vcf文件,初步统计snp位置
位置信息有5种
down_stream是gene下游2k以内,up_stream 是gene上游2k以内,gene_exon是snp在外显子内,gene_intron是gene在内含子内,uninfluncial是gene不在上诉所有区域
SnpType.py
内容:
#这个脚本是用来看snp哪些是在基因中(分为exon和intron),哪些是在基因的上游、下游
#最终输出结果为:scaffold position 位置 geneid
#输入文件1:snpeff结果文件(/public/home/wangwen_lab/zhangjiexiong/Ageing/SNP/snpeff/snpeffC_F/02.runC/mor_imp_strictestfilter_Snp_AncRef_selected_anno4.vcf)
#输入文件2:gff3文件(/public/home/wangwen_lab/zhangjiexiong/Ageing/SNP/genome.gff3)
import sys,re
file1 = sys.argv[1]
file2 = sys.argv[2]
file3 = sys.argv[3]
f1 = open(file1,'r')
f2 = open(file2,'r')
f3 = open(file3,'w')
flag = 'fuck'
f1dick = {
}
arr1 = []
for i in f1:
if re.match("#",i):
continue
a = i.split("\t")
if flag != a[0] and flag is not 'fuck':
f1dick[flag] = arr1
flag = a[0]
arr1 = []
# print(a[0])
arr1.append(a[1])
elif flag is 'fuck':
flag = a[0]
else:
arr1.append(a[1])
f1dick[flag] = arr1
#for i in f1dick:
# print(i,f1dick[i])
sc = "fuck"
f2dick1 = {
}
f2dick2 = {
}
f2arr1 = []
for i in f2:
if re.match("M",i) == None:
f2dick1[geneid] = f2arr1
continue
a = i.split("\t")
if a[2] == 'gene':
f2arr1 = []
f2arr1.append(a[3:5])
if sc != a[0]:
f2dick2[sc] = f2dick1
f2dick1 = {
}
sc = a[0]
geneid = a[8].split(";")[0].split("=")[1]
if a[2] == 'exon':
f2arr1.append(a[3:5])
f2dick1[geneid] = f2arr1
f2dick2[sc] = f2dick1
del f2dick2['fuck']
f2.close()
for i in f1dick.keys():
for j in f1dick[i]:#取到scaffold内的所有snp位点
try:
taq = 0
for k in f2dick2