import itertools
def generate_kmers(k):
    """Return every possible DNA k-mer of length ``k``.

    The k-mers are built over the alphabet A, T, C, G (in that order) and
    are returned in ``itertools.product`` order, i.e. the last position
    varies fastest: for ``k=2`` the list starts ``AA, AT, AC, AG, TA``.

    Args:
        k (int): Length of each k-mer.

    Returns:
        list[str]: All ``4**k`` k-mers; ``['']`` when ``k`` is 0.
    """
    nucleotides = ['A', 'T', 'C', 'G']
    return [''.join(kmer) for kmer in itertools.product(nucleotides, repeat=k)]
def kmer_feature_extraction(sequence, k, normalize=False):
    """Extract the k-mer feature vector of a DNA sequence.

    A window of length ``k`` slides over the upper-cased sequence one
    position at a time. Windows made up only of A, T, C and G are counted;
    any window containing another character (e.g. ``N``) is skipped.

    Args:
        sequence (str): Input DNA sequence (case-insensitive).
        k (int): Length of the k-mers.
        normalize (bool): If true, return frequencies (count divided by the
            number of valid windows) instead of raw counts.

    Returns:
        list: One entry per possible k-mer, in the order produced by
        ``generate_kmers(k)``. Entries are ints when ``normalize`` is false
        and floats when it is true. The vector is all zeros when ``k`` is
        longer than the sequence or when no window is valid.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # Upper-case first so lower-case input is counted as well.
    sequence = sequence.upper()
    n = len(sequence)

    if k < 1:
        raise ValueError("k必须为不小于1的整数")

    # Keep the element type consistent with the requested mode.
    zero = 0.0 if normalize else 0

    # No window fits: every k-mer has a zero count.
    if k > n:
        return [zero] * (4 ** k)

    all_kmers = generate_kmers(k)
    kmer_counts = dict.fromkeys(all_kmers, 0)
    total_valid = 0  # number of windows that are valid k-mers

    for i in range(n - k + 1):
        window = sequence[i:i + k]
        # kmer_counts holds exactly the valid length-k strings, so a
        # membership test replaces a per-character validity scan.
        if window in kmer_counts:
            kmer_counts[window] += 1
            total_valid += 1

    if not normalize:
        return [kmer_counts[kmer] for kmer in all_kmers]
    if total_valid == 0:
        # Avoid dividing by zero when the sequence has no valid window.
        return [zero] * len(all_kmers)
    return [kmer_counts[kmer] / total_valid for kmer in all_kmers]
# Example usage
if __name__ == "__main__":
    dna_sequence = "ATGCTAGCTA"
    k = 3
    # Raw count features
    features = kmer_feature_extraction(dna_sequence, k)
    print(f"k-mer计数特征(k={k}):", features[:10], "...")  # show the first 10 features
    # Normalized frequency features
    normalized_features = kmer_feature_extraction(dna_sequence, k, normalize=True)
    print(f"归一化k-mer特征(k={k}):", normalized_features[:10], "...")
# A small, deliberately simple example; it can be fleshed out later as needed.
499

被折叠的 条评论
为什么被折叠?



