import numpy as np
from Bio import SeqIO
from sklearn.preprocessing import MinMaxScaler
import os
import pandas as pd
def parse_ab1(file_path, trim=25):
    """Parse an AB1 trace file and return the four fluorescence channels.

    The first and last ``trim`` data points of every channel are removed
    (end-trimming is skipped when the trace has <= 2*trim points, matching
    the original behavior, although the leading trim is still applied).

    Parameters
    ----------
    file_path : str
        Path to the .ab1 (ABIF) file.
    trim : int, default 25
        Number of raw data points dropped from each end of every channel.

    Returns
    -------
    dict
        Mapping 'A'/'C'/'G'/'T' -> 1-D numpy array of raw trace values.
    """
    record = SeqIO.read(file_path, "abi")
    raw = record.annotations['abif_raw']
    # ABIF tags DATA9..DATA12 hold the A/C/G/T fluorescence traces.
    channel_tags = {'A': 'DATA9', 'C': 'DATA10', 'G': 'DATA11', 'T': 'DATA12'}
    data_length = len(raw['DATA9'])
    start_index = trim
    # Only trim the tail when there is enough signal left to keep.
    end_index = data_length - trim if data_length > 2 * trim else data_length
    return {base: np.array(raw[tag][start_index:end_index])
            for base, tag in channel_tags.items()}
def detect_heterozygotes(trace, window_size=5):
    """Compute sliding-window features for double-peak (heterozygote) detection.

    For each window of ``window_size`` consecutive points a feature row
    [max(A/G ratio, C/T ratio), std, peak-to-peak range] is computed over
    the four channels.

    Parameters
    ----------
    trace : dict
        Mapping 'A'/'C'/'G'/'T' -> 1-D numpy arrays; assumed equal length.
    window_size : int, default 5
        Window length in data points.

    Returns
    -------
    numpy.ndarray
        Shape (n_windows, 3); empty when the trace is shorter than
        ``window_size``.
    """
    features = []
    num_points = len(trace['A'])
    # Bug fix: "+ 1" so the final full window is included; the original
    # range(num_points - window_size) silently dropped the last window.
    for i in range(num_points - window_size + 1):
        window = {base: trace[base][i:i + window_size] for base in 'ACGT'}
        # Mean peak-height ratios of the A-vs-G and C-vs-T channels;
        # 1e-6 guards against division by zero on flat traces.
        ratios = [
            np.mean(window['A']) / (np.mean(window['G']) + 1e-6),
            np.mean(window['C']) / (np.mean(window['T']) + 1e-6),
        ]
        values = np.concatenate(list(window.values()))
        features.append([
            max(ratios),
            np.std(values),
            np.ptp(values),  # peak-to-peak (max - min)
        ])
    return np.array(features)
def create_dataset(ab1_files, labels):
    """Build a training dataset from paired AB1 files and labels.

    Each file is parsed, turned into sliding-window features and scaled to
    [0, 1]; files that yield no features are skipped along with their label.

    Returns
    -------
    (X, y) : tuple of numpy arrays
        X has dtype=object because per-file feature matrices differ in length.
    """
    samples, kept_labels = [], []
    scaler = MinMaxScaler()
    for path, label in zip(ab1_files, labels):
        feats = detect_heterozygotes(parse_ab1(path))
        if len(feats) == 0:
            continue
        # NOTE(review): the scaler is re-fit for every file, so feature scales
        # are not comparable across files -- confirm this is intentional.
        samples.append(scaler.fit_transform(feats))
        kept_labels.append(label)
    # dtype=object keeps variable-length feature matrices in one array
    return np.array(samples, dtype=object), np.array(kept_labels)
def parse_ab1_file(file_path):
    """Parse a single AB1 file; return its base calls and quality scores.

    Returns
    -------
    dict
        Keys: 'filename' (basename), 'sequence' (str of called bases),
        'qualities' (per-base phred quality scores).
    """
    record = SeqIO.read(file_path, "abi")
    return {
        "filename": os.path.basename(file_path),
        "sequence": str(record.seq),
        "qualities": record.letter_annotations["phred_quality"],
    }
def process_ab1_directory(directory):
    """Parse every .ab1 file directly inside *directory*.

    Files that fail to parse are reported and skipped rather than aborting
    the whole run.

    Returns
    -------
    list of dict
        One parse_ab1_file() result per successfully parsed file.
    """
    data = []
    for filename in os.listdir(directory):
        # Guard clause: only process files with the .ab1 extension.
        if not filename.endswith(".ab1"):
            continue
        file_path = os.path.join(directory, filename)
        try:
            data.append(parse_ab1_file(file_path))
            # Bug fix: the original printed a literal "(unknown)" placeholder
            # instead of interpolating the file name.
            print(f"成功解析文件: {filename}")
        except Exception as e:
            print(f"解析文件{filename}时出错: {str(e)}")
    return data
import os
import numpy as np
# Assumes parse_ab1 and detect_heterozygotes are already defined.
# NOTE: the two imports above duplicate the ones at the top of this file.
def build_dataset(ab1_dir, k=5):
    """Build per-position feature vectors and heterozygous/homozygous labels.

    For every .ab1 file in *ab1_dir*, every sequence position i in
    [k, len(seq)-k) yields one sample: [main_qual, sec_qual] followed by a
    one-hot encoding of the (2k+1)-base context around i. label=1 when the
    position is a detected heterozygous site, else 0.

    NOTE(review): this function expects detect_heterozygotes() to return
    (pos, base1, base2, qual1, qual2) tuples and parse_ab1() to return a
    dict with 'base_sequence' / 'quality_scores' keys. The versions defined
    earlier in this file return 3-column feature rows and raw channel
    arrays instead -- it appears to have been written against a different
    revision of those helpers and will fail as-is; reconcile before use.
    """
    # Feature vectors and their matching labels, accumulated across files.
    features = []
    labels = []
    # Walk every .ab1 file in ab1_dir.
    for file_name in os.listdir(ab1_dir):
        if not file_name.endswith('.ab1'):
            continue
        file_path = os.path.join(ab1_dir, file_name)
        # Parse the AB1 file.
        data = parse_ab1(file_path)
        # Detect heterozygous sites.
        hetero_sites = detect_heterozygotes(data)
        # Index the sites by position: pos -> (base1, base2, qual1, qual2).
        hetero_dict = {}
        for site in hetero_sites:
            # site is assumed to be (pos, base1, base2, qual1, qual2) -- see
            # the interface-mismatch note in the docstring.
            hetero_dict[site[0]] = (site[1], site[2], site[3], site[4])
        # Base-call string and per-base quality scores (assumed same length).
        seq = data['base_sequence']
        qual = data['quality_scores']
        # Visit each position that has a full k-base context on both sides.
        for i in range(k, len(seq)-k):
            if i in hetero_dict:
                label = 1  # heterozygous position
                base1, base2, qual1, qual2 = hetero_dict[i]
                # The primary base is the called base seq[i]; the secondary
                # quality is taken from whichever reported base it is NOT.
                main_base = seq[i]
                if main_base == base1:
                    secondary_qual = qual2
                elif main_base == base2:
                    secondary_qual = qual1
                else:
                    # Called base matches neither reported base: data is
                    # inconsistent, skip this position.
                    continue
                # Primary-base quality comes from the per-base quality track.
                main_qual = qual[i]
                sec_qual = secondary_qual
            else:
                label = 0  # homozygous position
                main_qual = qual[i]
                sec_qual = 0  # sentinel: no secondary base present
            # Context window seq[i-k .. i+k] inclusive.
            context = seq[i-k:i+k+1]
            # One-hot encode the context (A/C/G/T; anything else -> all zeros).
            context_feature = []
            for base in context:
                if base == 'A':
                    context_feature.extend([1,0,0,0])
                elif base == 'C':
                    context_feature.extend([0,1,0,0])
                elif base == 'G':
                    context_feature.extend([0,0,1,0])
                elif base == 'T':
                    context_feature.extend([0,0,0,1])
                else:
                    context_feature.extend([0,0,0,0])
            # Feature vector: the two quality scores plus the context encoding.
            feature_vec = [main_qual, sec_qual] + context_feature
            features.append(feature_vec)
            labels.append(label)
    # Convert the accumulated lists to numpy arrays.
    features = np.array(features)
    labels = np.array(labels)
    return features, labels
def build_training_dataset(ab1_dir, output_csv="heterozygotes_dataset.csv"):
    """Build a heterozygote training dataset from a directory of AB1 files.

    Writes the dataset to *output_csv* and returns it as a DataFrame.
    Files that fail to process are reported and skipped.

    NOTE(review): this expects detect_heterozygotes() to yield
    (pos, base1, base2, qual1, qual2) tuples and parse_ab1() to return a
    dict with a 'sequence' key; the versions defined earlier in this file
    return 3-column feature rows / channel arrays instead, which is why
    every file currently fails with "not enough values to unpack
    (expected 5, got 3)". Reconcile the interfaces before use.
    """
    dataset = []
    for filename in os.listdir(ab1_dir):
        if not filename.endswith('.ab1'):
            continue
        file_path = os.path.join(ab1_dir, filename)
        try:
            # Step 1: parse the AB1 file.
            seq_data = parse_ab1(file_path)
            # Step 2: detect heterozygous sites.
            hetero_sites = detect_heterozygotes(seq_data)
            # Step 3: extract features and labels per site.
            for pos, base1, base2, qual1, qual2 in hetero_sites:
                features = {
                    'file': filename,
                    'position': pos,
                    'primary_base': base1,
                    'secondary_base': base2,
                    'primary_quality': qual1,
                    'secondary_quality': qual2,
                    'quality_diff': abs(qual1 - qual2),
                    # NOTE(review): divides by zero when both qualities are 0.
                    'peak_ratio': min(qual1, qual2) / max(qual1, qual2),
                    # Context feature: up to 3 preceding bases.
                    'context': seq_data['sequence'][max(0, pos-3):pos]
                }
                # Label 1 = heterozygous site.
                dataset.append({**features, 'label': 1})
        except Exception as e:
            # Bug fix: the original printed a literal "(unknown)" placeholder
            # instead of interpolating the failing file name.
            print(f"处理 {filename} 失败: {str(e)}")
    # Persist and return as a DataFrame.
    df = pd.DataFrame(dataset)
    df.to_csv(output_csv, index=False)
    return df
if __name__ == "__main__":
    # Build the dataset when the script is run directly.
    # Bug fix: use a raw string -- "F:\KeziahCHAPLIN" contains the invalid
    # escape sequence "\K" (a DeprecationWarning today, a SyntaxError in
    # future Python versions).
    dataset = build_training_dataset(
        ab1_dir=r"F:\KeziahCHAPLIN",
        output_csv="training_data.csv"
    )
    # Inspect the resulting dataset.
    print(f"生成 {len(dataset)} 个样本")
    print("特征列表:", dataset.columns.tolist())
# User report: build_training_dataset(r'F:\KeziahCHAPLIN\heterozygous-mutation\pass')
# fails for every file with "not enough values to unpack (expected 5, got 3)" --
# detect_heterozygotes() returns 3-column feature rows, not the
# (pos, base1, base2, qual1, qual2) tuples this function unpacks.