简单脚本自动查询Phred质量分数编码系统

本文介绍了一个利用脚本查询Phred质量分数编码的步骤,详细讲解了代码的每一部分,包括如何处理输入参数,提取数据,以及通过awk和od命令进行ASCII码到二进制的转换。主要涉及的工具有bash脚本、awk和od命令。


修改自:
从零开始完整学习全基因组测序数据分析:第2节 FASTA和FASTQ

less $1 | head -n 1000 | awk '{if(NR%4==0) printf("%s",$0);}' | od -A n -t u1 -v \
| awk 'BEGIN{min=100;max=0;} \
  {for(i=1;i<=NF;i++) {if($i>max) max=$i; if($i<min) min=$i;}}END \
  {if(max<=126 && min<59) print "Phred33"; \
  else if(max>73 && min>=64) print "Phred64"; \
  else if(min>=59 && min<64 && max>73) print "Solexa64"; \
  else print "Unknown score encoding"; \
  print "( " min ", " max, ")";}'

下面逐步解析代码:

1)less $1

这个不多说,$1代表脚本输入的第一个参数

第 1 章 Unix/Linux操作系统介绍...........................................................................................................1 1.1 远程登陆...................................................................................................................................1 1.2 文件的复制、删除和移动命令................................................................................................7 1.3 目录的创建、删除及更改目录命令........................................................................................9 1.4 文本查看命令.........................................................................................................................11 1.5 文本处理命令.........................................................................................................................13 1.6 改变文件或目录的权限命令..................................................................................................16 1.7 备份与压缩命令.....................................................................................................................18 1.8 磁盘及系统管理.....................................................................................................................20 1.9 软件安装简介.........................................................................................................................22 1.10 其他......................................................................................................................................23 第2 章 数据的基本处理........................................................................................................................25 2.1 测序原理介绍..........................................................................................................................25 2.2 峰图转化 Phred ......................................................................................................................27 2.3 Phd2Fasta ...............................................................................................................................32 2.4 载体屏蔽 cross_match ........
import numpy as np from Bio import SeqIO from sklearn.preprocessing import MinMaxScaler import os import pandas as pd def parse_ab1(file_path): """解析AB1文件获取四通道荧光数据,自动去除首尾25个碱基""" record = SeqIO.read(file_path, "abi") channels = ('DATA9', 'DATA10', 'DATA11', 'DATA12') # A/C/G/T通道 # 获取数据长度并计有效区间 data_length = len(record.annotations['abif_raw']['DATA9']) start_index = 25 end_index = data_length - 25 if data_length > 50 else data_length trace = { 'A': np.array(record.annotations['abif_raw']['DATA9'][start_index:end_index]), 'C': np.array(record.annotations['abif_raw']['DATA10'][start_index:end_index]), 'G': np.array(record.annotations['abif_raw']['DATA11'][start_index:end_index]), 'T': np.array(record.annotations['abif_raw']['DATA12'][start_index:end_index]) } return trace def detect_heterozygotes(trace, window_size=5): """滑动窗口检测双峰区域""" features = [] num_points = len(trace['A']) for i in range(num_points - window_size): window = {base: trace[base][i:i+window_size] for base in 'ACGT'} # 特征工程:峰高比/标准差/极差 ratios = [ np.mean(window['A']) / (np.mean(window['G']) + 1e-6), np.mean(window['C']) / (np.mean(window['T']) + 1e-6) ] values = np.concatenate(list(window.values())) features.append([ max(ratios), np.std(values), np.ptp(values) # Peak-to-peak (max-min) ]) return np.array(features) def create_dataset(ab1_files, labels): """构建训练数据集""" X, y = [], [] scaler = MinMaxScaler() for file, label in zip(ab1_files, labels): trace = parse_ab1(file) features = detect_heterozygotes(trace) if len(features) > 0: X.append(scaler.fit_transform(features)) y.append(label) return np.array(X, dtype=object), np.array(y) # dtype=object处理不等长序列 def parse_ab1_file(file_path): """解析单个AB1文件,返回序列和质量信息""" # 使用SeqIO读取AB1文件 record = SeqIO.read(file_path, "abi") # 提取序列和质量分数 sequence = str(record.seq) qualities = record.letter_annotations["phred_quality"] # 这里可以返回更多信息,根据实际需求 return {"filename": os.path.basename(file_path), "sequence": sequence, "qualities": qualities} def process_ab1_directory(directory): """遍历目录中的所有AB1文件,并解析每个文件""" data = [] # 遍历目录,找到所有以.ab1结尾的文件 for filename in os.listdir(directory): if filename.endswith(".ab1"): file_path = os.path.join(directory, filename) try: parsed_data = parse_ab1_file(file_path) data.append(parsed_data) print(f"成功解析文件: {filename}") except Exception as e: print(f"解析文件{filename}时出错: {str(e)}") return data import os import numpy as np # 假设parse_ab1和detect_heterozygotes已经定义 def build_dataset(ab1_dir, k=5): # 初始化特征列表和标签列表 features = [] labels = [] # 遍历ab1_dir目录下的所有.ab1文件 for file_name in os.listdir(ab1_dir): if not file_name.endswith('.ab1'): continue file_path = os.path.join(ab1_dir, file_name) # 解析AB1文件 data = parse_ab1(file_path) # 检测杂合位点 hetero_sites = detect_heterozygotes(data) # 将hetero_sites转换为字典:键为位置(整数),为(base1, base2, qual1, qual2) hetero_dict = {} for site in hetero_sites: # site: (pos, base1, base2, qual1, qual2) hetero_dict[site[0]] = (site[1], site[2], site[3], site[4]) # 获取序列和质量分数 seq = data['base_sequence'] # 假设返回的是字符串 qual = data['quality_scores'] # 假设返回的是整数列表,长度与序列相同 # 遍历序列中的每个位置(从k到len(seq)-k-1) for i in range(k, len(seq)-k): # 检查位置i是否在hetero_dict中 if i in hetero_dict: label = 1 # 杂合 base1, base2, qual1, qual2 = hetero_dict[i] # 确定主要碱基和次要碱基的质量分数 # 主要碱基就是seq[i] main_base = seq[i] if main_base == base1: secondary_qual = qual2 elif main_base == base2: secondary_qual = qual1 else: # 如果主要碱基不在两个碱基中,说明有误,跳过 continue # 主要碱基质量分数就是qual[i](或者我们也可以使用hetero_dict中给出的质量分数?但parse_ab1已经给出了主要碱基的质量分数,所以用qual[i]) main_qual = qual[i] sec_qual = secondary_qual else: label = 0 # 纯合 main_qual = qual[i] sec_qual = 0 # 表示没有次要碱基 # 提取上下文序列:从i-k到i+k(包括i) context = seq[i-k:i+k+1] # 对上下文进行one-hot编码 context_feature = [] for base in context: # 对每个碱基进行one-hot编码 if base == 'A': context_feature.extend([1,0,0,0]) elif base == 'C': context_feature.extend([0,1,0,0]) elif base == 'G': context_feature.extend([0,0,1,0]) elif base == 'T': context_feature.extend([0,0,0,1]) else: # 其他碱基,用全0表示 context_feature.extend([0,0,0,0]) # 特征向量:包括主要碱基质量分数、次要碱基质量分数和上下文特征 feature_vec = [main_qual, sec_qual] + context_feature features.append(feature_vec) labels.append(label) # 将features和labels转换为numpy数组 features = np.array(features) labels = np.array(labels) return features, labels def build_training_dataset(ab1_dir, output_csv="heterozygotes_dataset.csv"): """从AB1文件目录构建训练数据集""" dataset = [] for filename in os.listdir(ab1_dir): if not filename.endswith('.ab1'): continue file_path = os.path.join(ab1_dir, filename) try: # 步骤1: 解析AB1文件 seq_data = parse_ab1(file_path) # 步骤2: 检测杂合位点 hetero_sites = detect_heterozygotes(seq_data) # 步骤3: 提取特征和标签 for pos, base1, base2, qual1, qual2 in hetero_sites: # 特征工程 features = { 'file': filename, 'position': pos, 'primary_base': base1, 'secondary_base': base2, 'primary_quality': qual1, 'secondary_quality': qual2, 'quality_diff': abs(qual1 - qual2), 'peak_ratio': min(qual1, qual2) / max(qual1, qual2), # 添加上下文特征 (示例: 前3个碱基) 'context': seq_data['sequence'][max(0,pos-3):pos] } # 标签: 1表示杂合位点 dataset.append({**features, 'label': 1}) except Exception as e: print(f"处理 {filename} 失败: {str(e)}") # 转换为DataFrame并保存 df = pd.DataFrame(dataset) df.to_csv(output_csv, index=False) return df if __name__ == "__main__": # 当直接运行此脚本时,执行main函数 dataset = build_training_dataset( ab1_dir="F:\KeziahCHAPLIN", output_csv="training_data.csv" ) # 查看数据集结构 print(f"生成 {len(dataset)} 个样本") print("特征列表:", dataset.columns.tolist()) # build_training_dataset(r'F:\KeziahCHAPLIN\heterozygous-mutation\pass') 全部都提示 处理 1 (10227).ab1 失败: not enough values to unpack (expected 5, got 3)
07-18
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值