Hand-Written Code for Machine Learning Interviews

A compilation of core machine learning knowledge points

AUC

def get_auc(label,score):
    # AUC via its pairwise definition: the fraction of (positive, negative) pairs
    # in which the positive sample scores higher than the negative (ties count 0.5)
    # label example: [0,0,1,1,0]
    pos=[i for i,item in enumerate(label) if item==1]
    neg=[i for i,item in enumerate(label) if item==0]
    if not pos or not neg:
        return 0.5  # AUC is undefined when only one class is present; fall back to 0.5
    ans=0
    for p in pos:
        for n in neg:
            if score[p]>score[n]:
                ans+=1
            elif score[p]==score[n]:
                ans+=0.5
    return ans/(len(pos)*len(neg))
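
A quick sanity check: on the small example below, 5 of the 6 (positive, negative) pairs are ordered correctly, so the result should be 5/6 ≈ 0.8333; the scikit-learn comparison is optional and assumes the library is installed.

if __name__ == "__main__":
    label = [0, 0, 1, 1, 0]
    score = [0.1, 0.4, 0.35, 0.8, 0.2]
    print(get_auc(label, score))  # 5/6 ≈ 0.8333

    # optional cross-check (skip if scikit-learn is not installed)
    from sklearn.metrics import roc_auc_score
    print(roc_auc_score(label, score))  # should print the same value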

Linear Regression

import numpy as np
class LR:
    def __init__(self,num_input):
        self.w=np.random.normal(scale=0.01,size=(num_input,1))
        self.b=np.random.normal(scale=0.01,size=(1))
    def forward(self,input):
        return np.dot(input,self.w)+self.b
    def mse_loss(self,pre,label):
        label=label.reshape(pre.shape)
        return np.mean((pre-label)**2)/2
    def sgd_update(self,lr,input,pre,label):
        batch_size=input.shape[0]
        error=pre-label.reshape(pre.shape)
        dw=np.dot(input.T,error)/batch_size  # gradient of the MSE/2 loss: mean of (pred - y) * x_i over the batch
        self.w-=lr*dw
        db=np.mean(error)
        self.b-=lr*db
    def train(self,lr,inputs,labels,epochs,batch_size=32):
        inputs=np.array(inputs)
        labels=np.array(labels).reshape(-1,1)
        num_samples=inputs.shape[0]
        num_batches=(num_samples+batch_size-1)//batch_size
        for epoch in range(epochs):
            total_loss=0
            for i in range(0,num_samples,batch_size):
                batch_input=inputs[i:i+batch_size]
                batch_label=labels[i:i+batch_size]
                output=self.forward(batch_input)
                loss=self.mse_loss(output,batch_label)
                total_loss+=loss
                self.sgd_update(lr,batch_input,output,batch_label)
            avg_loss = total_loss / num_batches
            if (epoch + 1) % 100 == 0:  # print every 100 epochs
                print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.6f}")

if __name__ =="__main__":
    np.random.seed(42)
    x=np.random.rand(1000,2)
    y=3*x[:,0]+2*x[:,1]+5+np.random.randn(1000)*0.1
    model=LR(num_input=2)
    model.train(lr=0.01,inputs=x,labels=y,epochs=1000,batch_size=32)
    print(model.w.flatten())
    print(model.b[0])
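
    # Sanity check (a sketch, separate from the training loop): the SGD solution should be close
    # to the closed-form least-squares fit. np.linalg.lstsq solves min ||A w - y||^2, and the
    # appended column of ones recovers the bias term.
    A = np.hstack([x, np.ones((x.shape[0], 1))])
    w_closed, *_ = np.linalg.lstsq(A, y, rcond=None)
    print(w_closed)  # should be close to [3, 2, 5]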

Logistic Regression

import numpy as np
class LogisticRegression:
    def __init__(self,num_features):
        self.w=np.random.normal(scale=0.01,size=(num_features,1))
        self.b=np.random.normal(scale=0.01,size=(1))
    def sigmoid(self,z):
        z=np.clip(z,-100,100)
        return 1/(1+np.exp(-z))
    def forward(self,input):
        return self.sigmoid(np.dot(input,self.w)+self.b)
    def bce(self,pre,label):
        label=label.reshape(pre.shape)
        pre=np.clip(pre,1e-10,1-1e-10)
        return -np.mean(label*np.log(pre)+(1-label)*np.log(1-pre))
    def sgd_update(self,lr,input,pre,label):
        batch_size=input.shape[0]
        error=pre-label.reshape(pre.shape)
        dw=np.dot(input.T,error)/batch_size  # gradient of the BCE loss: mean of (pred - y) * x_i over the batch
        self.w-=lr*dw
        db=np.mean(error)
        self.b-=lr*db
    def train(self,inputs,labels,epochs,lr,batch_size=32):
        inputs=np.array(inputs)
        labels=np.array(labels).reshape(-1,1)
        num_samples=inputs.shape[0]
        num_batches=(num_samples+batch_size-1)//batch_size
        for epoch in range(epochs):
            total_loss=0
            for i in range(0,num_samples,batch_size):
                batch_input=inputs[i:i+batch_size]
                batch_label=labels[i:i+batch_size]
                output=self.forward(batch_input)
                loss=self.bce(output,batch_label)
                total_loss+=loss
                self.sgd_update(lr,batch_input,output,batch_label)
            avg_loss = total_loss / num_batches
            if (epoch + 1) % 100 == 0:  # print every 100 epochs
                print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.6f}")
    def predict(self,x,threshold=0.5):
        y=self.forward(x)
        return (y>=threshold).astype(int)
    def predict_prob(self,x):
        return self.forward(x)

if __name__ =="__main__":
    np.random.seed(42)
    class0 = np.random.randn(500, 2) * 0.5 + np.array([1, 1])
    class1 = np.random.randn(500, 2) * 0.5 + np.array([3, 3])
    
    X = np.vstack([class0, class1])
    y = np.hstack([np.zeros(500), np.ones(500)])  # labels 0 and 1
    # create and train the model
    model = LogisticRegression(num_features=2)
    model.train(X, y, epochs=1000, lr=0.1, batch_size=32)
    
    # evaluate the model
    y_pred = model.predict(X)
    accuracy = np.mean(y_pred.flatten() == y)
    print(f"\nModel accuracy: {accuracy:.4f}")
    print("Learned weights:", model.w.flatten())
    print("Learned bias:", model.b[0])

BatchNorm

import numpy as np

class BN:
    def __init__(self, momentum=0.9, eps=1e-5, feat_dim=2):
        self._running_mean = np.zeros(shape=(feat_dim,))
        self._running_var = np.ones(shape=(feat_dim,))
        self._momentum = momentum  # EMA coefficient; usually close to 1 (e.g. 0.9)
        self._eps = eps
        self._beta = np.zeros(shape=(feat_dim,))
        self._gamma = np.ones(shape=(feat_dim,))
        self.training = True  # the layer's own train/eval flag

    def forward(self, x):
        if self.training:
            # batch mean and (biased) variance, computed per feature (axis=0)
            x_mean = x.mean(axis=0)
            x_var = x.var(axis=0, ddof=0)
            
            # update the running statistics with an exponential moving average
            self._running_mean = self._momentum * self._running_mean + (1 - self._momentum) * x_mean
            self._running_var = self._momentum * self._running_var + (1 - self._momentum) * x_var
            
            # normalize with the current batch statistics
            x_hat = (x - x_mean) / np.sqrt(x_var + self._eps)
        else:
            # at inference time, use the accumulated running statistics
            x_hat = (x - self._running_mean) / np.sqrt(self._running_var + self._eps)
        
        # scale and shift
        return self._gamma * x_hat + self._beta

if __name__ == "__main__":
    x = np.random.randn(5, 2)  # 5 samples, 2 features
    bn = BN(momentum=0.9, feat_dim=2)
    
    # training mode: updates the running statistics
    bn.forward(x)
    
    # switch to inference mode and reuse the running statistics
    bn.training = False  
    print(bn.forward(x))
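
    # Optional cross-check (assumes PyTorch is installed): in training mode torch.nn.BatchNorm1d
    # also normalizes with the batch mean and biased batch variance, and its gamma/beta default
    # to 1/0, so its output should match this BN layer's training-mode output.
    # Note that PyTorch's momentum uses the opposite convention:
    # running = (1 - momentum) * running + momentum * batch_stat.
    import torch
    bn.training = True
    torch_bn = torch.nn.BatchNorm1d(2, eps=1e-5)
    torch_out = torch_bn(torch.tensor(x, dtype=torch.float32)).detach().numpy()
    print("Max difference vs torch.nn.BatchNorm1d (train mode):", np.max(np.abs(bn.forward(x) - torch_out)))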
    

LayerNorm

import numpy as np

class LayerNorm:
    def __init__(self, eps=1e-5, feat_dim=None):
        """
        初始化LayerNorm层
        :param eps: 防止分母为0的小值,默认1e-5
        :param feat_dim: 特征维度,用于初始化可学习参数
        """
        self.eps = eps
        # 可学习的缩放参数gamma和偏移参数beta,初始化为1和0
        self.gamma = np.ones(feat_dim) if feat_dim is not None else None
        self.beta = np.zeros(feat_dim) if feat_dim is not None else None
        self.training = True  # 训练/测试状态(LayerNorm在两种状态下计算逻辑一致)

    def forward(self, x):
        """
        前向传播
        :param x: 输入数据,形状为(..., feat_dim),如(batch_size, seq_len, feat_dim)或(batch_size, feat_dim)
        :return: 归一化并经过缩放偏移的结果,形状与x一致
        """
        # 计算单个样本在所有特征维度上的均值和方差
        # axis=-1表示对最后一个维度(特征维度)计算
        mean = x.mean(axis=-1, keepdims=True)  # 保留维度以便广播,形状为(..., 1)
        var = x.var(axis=-1, keepdims=True, ddof=0)  # 总体方差(ddof=0)
        
        # 归一化:(x - 均值) / sqrt(方差 + eps)
        x_hat = (x - mean) / np.sqrt(var + self.eps)
        
        # 初始化gamma和beta(如果未指定feat_dim,根据输入自动推断)
        if self.gamma is None:
            self.gamma = np.ones(x.shape[-1])
            self.beta = np.zeros(x.shape[-1])
        
        # 应用缩放和平移:gamma * x_hat + beta(通过广播适配输入形状)
        return self.gamma * x_hat + self.beta

# test code
if __name__ == "__main__":
    # test 1: 2D input (batch_size, feat_dim)
    x_2d = np.random.randn(3, 5)  # 3 samples, 5 features each
    ln_2d = LayerNorm(feat_dim=5)
    out_2d = ln_2d.forward(x_2d)
    print("2D output shape:", out_2d.shape)  # should stay (3, 5)
    print("2D per-sample mean (should be close to 0):", out_2d.mean(axis=-1))
    print("2D per-sample variance (should be close to 1):\n", out_2d.var(axis=-1, ddof=0))
    
    # test 2: 3D input (batch_size, seq_len, feat_dim), e.g. a token sequence
    x_3d = np.random.randn(2, 4, 6)  # 2 samples, 4 positions, 6 features per position
    ln_3d = LayerNorm()  # feat_dim not given, inferred from the input
    out_3d = ln_3d.forward(x_3d)
    print("\n3D output shape:", out_3d.shape)  # should stay (2, 4, 6)
    print("3D per-sample mean (should be close to 0):\n", out_3d.mean(axis=-1))

RMSNorm

import numpy as np
class RMSNorm:
    def __init__(self, dim, eps=1e-8):
        self.eps = eps 
        self.scale = np.ones(dim)  
    def forward(self, x):
        # RMSNorm: x / sqrt(mean(x^2) + eps), followed by an elementwise learnable scale.
        # Unlike LayerNorm it does not subtract the mean and has no bias term.
        mean_square = np.mean(x **2, axis=-1, keepdims=True)
        rms = np.sqrt(mean_square + self.eps)
        x_normalized = x / rms
        output = self.scale * x_normalized
        return output
if __name__ == "__main__":
    x = np.random.rand(4, 2)
    rms_norm = RMSNorm(dim=x.shape[-1])
    output = rms_norm.forward(x)
    print("输入形状:", x.shape)
    print("输出形状:", output.shape)
    print("输出结果:\n", output)
    

KL Divergence

import numpy as np
import torch

def kl_torch(p, q, eps=1e-9):
    """KL divergence, PyTorch version (inputs are torch.Tensor logits)"""
    # turn the inputs into probability distributions
    p = torch.softmax(p, dim=-1)  # assumes logits as input; remove if p is already a distribution
    q = torch.softmax(q, dim=-1)
    
    # numerical stability
    p = torch.clamp(p, eps, 1.0)
    q = torch.clamp(q, eps, 1.0)
    
    # KL(p || q) = sum p * (log p - log q)
    kl_div = p * (torch.log(p) - torch.log(q))
    return torch.sum(kl_div, dim=-1)

def kl_numpy(p, q, eps=1e-10):
    """KL divergence, NumPy version (inputs are numpy.ndarray)"""
    # normalize the inputs into probability distributions
    p = p / np.sum(p)
    q = q / np.sum(q)
    
    # numerical stability
    p = np.clip(p, eps, 1.0)
    q = np.clip(q, eps, 1.0)
    
    # KL(p || q) = sum p * log(p / q)
    kl = np.sum(p * np.log(p / q))
    return kl

if __name__ == "__main__":
    # example 1: two identical distributions (NumPy version)
    p1 = np.array([0.2, 0.3, 0.5])
    q1 = np.array([0.2, 0.3, 0.5])
    print(f"NumPy KL, identical distributions: {kl_numpy(p1, q1):.4f}")  # close to 0
    
    # example 2: two different distributions (NumPy version)
    p2 = np.array([0.8, 0.2])
    q2 = np.array([0.5, 0.5])
    print(f"NumPy KL, different distributions: {kl_numpy(p2, q2):.4f}")  # should be positive
    
    # example 3: PyTorch version (inputs are logits; kl_torch normalizes them with softmax)
    torch.manual_seed(42)
    p_torch = torch.randn(3, 5)  # unnormalized logits
    q_torch = torch.randn(3, 5)
    
    # identical distributions (q set equal to p)
    same_kl = kl_torch(p_torch, p_torch)
    print(f"PyTorch KL, identical distributions: {same_kl}")  # tensor of values close to 0
    
    # different distributions
    diff_kl = kl_torch(p_torch, q_torch)
    print(f"PyTorch KL, different distributions: {diff_kl}")  # positive values
    

Cross-Entropy

import torch
logits = torch.randn(3, 5)  # 3 samples, 5 classes
labels = torch.tensor([2, 0, 4])  # ground-truth class index for each sample

def softmax(logits):
    max_vals = torch.max(logits, dim=1, keepdim=True).values
    exp_logits = torch.exp(logits - max_vals)  # subtract the row max to avoid overflow
    return exp_logits / torch.sum(exp_logits, dim=1, keepdim=True)

def cross_entropy(logits, labels):
    probs = softmax(logits)
    probs = torch.clamp(probs, min=1e-9, max=1.0)
    selected_probs = probs.gather(dim=1, index=labels.unsqueeze(1))  # probability of the true class
    loss = -torch.log(selected_probs)
    return torch.mean(loss)  # mean loss over the batch

# test the implementation
custom_loss = cross_entropy(logits, labels)
print(f"Custom cross-entropy loss: {custom_loss.item()}")

# compare against the official PyTorch implementation to verify correctness
official_loss = torch.nn.functional.cross_entropy(logits, labels)
print(f"PyTorch official cross-entropy loss: {official_loss.item()}")

Contrastive Loss (InfoNCE)

import torch
import torch.nn.functional as F

def infonce_loss(anchor, positive, negatives, temperature=0.1):
    """
    实现InfoNCE对比损失
    Args:
        anchor: 锚点样本特征,形状为[batch_size, feature_dim]
        positive: 正样本特征,形状为[batch_size, feature_dim]
        negatives: 负样本特征,形状为[batch_size, num_negatives, feature_dim]
        temperature: 温度系数,控制softmax的陡峭程度
    Returns:
        平均InfoNCE损失
    """
    # 计算锚点与正样本的相似度 (batch_size, 1)
    pos_sim = torch.sum(anchor * positive, dim=1, keepdim=True)  # 内积衡量相似度
    
    # 计算锚点与所有负样本的相似度 (batch_size, num_negatives)
    neg_sim = torch.sum(anchor.unsqueeze(1) * negatives, dim=2)  # 广播后内积
    
    # 拼接正样本和负样本的相似度 (batch_size, 1 + num_negatives)
    logits = torch.cat([pos_sim, neg_sim], dim=1)
    
    # 除以温度系数
    logits /= temperature
    
    # 构造标签:正样本在第0个位置
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=anchor.device)
    
    # 计算交叉熵损失(等价于InfoNCE损失)
    loss = F.cross_entropy(logits, labels)
    
    return loss

# test code
if __name__ == "__main__":
    # toy data: 3 samples, feature dimension 128, 5 negatives per sample
    batch_size = 3
    feature_dim = 128
    num_negatives = 5
    
    anchor = F.normalize(torch.randn(batch_size, feature_dim), dim=1)  # L2-normalized
    positive = F.normalize(anchor + 0.1 * torch.randn_like(anchor), dim=1)  # similar to the anchor
    negatives = F.normalize(torch.randn(batch_size, num_negatives, feature_dim), dim=2)  # random negatives
    
    # compute the loss
    loss = infonce_loss(anchor, positive, negatives, temperature=0.1)
    print(f"InfoNCE loss: {loss.item()}")

SwiGLU

import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLU(nn.Module):
    """
    SwiGLU激活函数实现
    公式:SwiGLU(x) = x1 * Swish(x2),其中x1和x2是输入x拆分后的两部分
          Swish(x) = x * sigmoid(beta * x),通常beta=1
    """
    def __init__(self, dim_in, dim_out=None):
        super().__init__()
        # 输出维度默认与输入维度相同
        dim_out = dim_out or dim_in
        # 线性层将输入映射为2倍维度(用于拆分x1和x2)
        self.w = nn.Linear(dim_in, 2 * dim_out)
        
    def forward(self, x):
        # 输入通过线性层得到2*dim_out维度的输出
        x = self.w(x)  # 形状: [batch_size, ..., 2*dim_out]
        # 拆分为两部分:x1和x2(各占一半维度)
        x1, x2 = torch.chunk(x, 2, dim=-1)  # 各部分形状: [batch_size, ..., dim_out]
        # 计算SwiGLU: x1 * swish(x2),其中swish(x) = x * sigmoid(x)
        return x1 * F.silu(x2)  # F.silu是PyTorch内置的Swish激活(beta=1)

# test code
if __name__ == "__main__":
    # toy input: batch_size=2, sequence length=3, feature dimension=8
    x = torch.randn(2, 3, 8)
    # SwiGLU with input dimension 8 and output dimension 4
    swiglu = SwiGLU(dim_in=8, dim_out=4)
    # forward pass
    output = swiglu(x)
    # the output shape should be [2, 3, 4]
    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")
    print(f"Output:\n{output}")

Focal Loss

import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction="mean"):
        super(FocalLoss, self).__init__()
        # alpha can be None (no class weighting), a scalar (binary case) or a list (per-class weights)
        if alpha is not None:
            if isinstance(alpha, (float, int)):
                self.alpha = torch.tensor([alpha, 1 - alpha])  # binary classification
            else:
                self.alpha = torch.tensor(alpha)  # multi-class
        else:
            self.alpha = None
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        # inputs: [N, C]  logits
        # targets: [N]    ground-truth labels
        ce_loss = F.cross_entropy(inputs, targets, reduction="none")
        pt = torch.exp(-ce_loss)  # predicted probability of the true class
        focal_loss = (1 - pt)** self.gamma * ce_loss

        # apply the per-class alpha weights
        if self.alpha is not None:
            # move alpha to the same device as the inputs
            self.alpha = self.alpha.to(inputs.device)
            # pick the alpha value of each sample's target class
            alpha = self.alpha.gather(0, targets)
            focal_loss = alpha * focal_loss

        # aggregate the loss
        if self.reduction == "mean":
            return focal_loss.mean()
        elif self.reduction == "sum":
            return focal_loss.sum()
        else:
            return focal_loss
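
A minimal usage sketch: with gamma=0 and alpha=None the modulating factor (1 - pt)^gamma becomes 1, so the focal loss should reduce to the plain cross-entropy, which gives a quick correctness check.

if __name__ == "__main__":
    torch.manual_seed(0)
    logits = torch.randn(4, 3)            # 4 samples, 3 classes
    targets = torch.tensor([0, 2, 1, 2])  # ground-truth labels

    focal = FocalLoss(alpha=None, gamma=2.0)
    print(f"Focal loss (gamma=2): {focal(logits, targets).item():.4f}")

    # with gamma=0 and no alpha, the focal loss equals the standard cross-entropy
    focal_g0 = FocalLoss(alpha=None, gamma=0.0)
    print(f"Focal loss (gamma=0): {focal_g0(logits, targets).item():.4f}")
    print(f"F.cross_entropy:      {F.cross_entropy(logits, targets).item():.4f}")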


MoE

import torch
import torch.nn as nn
import torch.nn.functional as F

class Expert(nn.Module):
    """专家网络:简单的两层全连接网络"""
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.activation = nn.ReLU()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.activation(x)
        x = self.fc2(x)
        return x

class MoE(nn.Module):
    """混合专家模型:包含多个专家和一个路由器"""
    def __init__(self, input_dim, output_dim, num_experts=4, hidden_dim=64, top_k=2):
        super().__init__()
        self.num_experts = num_experts  # 专家数量
        self.top_k = top_k  # 每个样本选择的专家数量
        
        # 创建多个专家网络
        self.experts = nn.ModuleList([
            Expert(input_dim, hidden_dim, output_dim) 
            for _ in range(num_experts)
        ])
        
        # 路由器网络:预测每个专家的权重
        self.router = nn.Linear(input_dim, num_experts)
        
    def forward(self, x):
        # x shape: [batch_size, input_dim]
        
        # 1. routing: a weight for every (sample, expert) pair
        router_logits = self.router(x)  # [batch_size, num_experts]
        router_weights = F.softmax(router_logits, dim=1)  # normalized weights
        
        # 2. pick the top-k experts per sample
        top_k_weights, top_k_indices = torch.topk(router_weights, self.top_k, dim=1)
        # renormalize the top-k weights so they sum to 1
        top_k_weights = top_k_weights / torch.sum(top_k_weights, dim=1, keepdim=True)
        
        # 3. run the selected experts and combine their outputs
        batch_size = x.shape[0]
        output = torch.zeros(batch_size, self.output_dim, device=x.device)  # output buffer sized by output_dim
        
        for i in range(self.top_k):
            # index and weight of the i-th selected expert for every sample
            expert_idx = top_k_indices[:, i]  # [batch_size]
            weights = top_k_weights[:, i].unsqueeze(1)  # [batch_size, 1]
            
            # apply each sample's selected expert (simple but slow per-sample loop)
            for batch_idx in range(batch_size):
                expert = self.experts[expert_idx[batch_idx]]
                output[batch_idx] += weights[batch_idx] * expert(x[batch_idx])
        
        return output

# test code
if __name__ == "__main__":
    # configuration
    input_dim = 32
    output_dim = 32
    batch_size = 8
    
    # build the model and the input
    moe = MoE(input_dim, output_dim, num_experts=4, top_k=2)
    x = torch.randn(batch_size, input_dim)
    
    # forward pass
    output = moe(x)
    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")

LoRA

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class LinearLoRALayer(nn.Module):
    def __init__(self, in_features, out_features, merge=False, lora_rank=1, lora_alpha=1, lora_dropout=0.0):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.lora_rank = lora_rank
        self.merge = merge

        self.linear = nn.Linear(in_features, out_features)
        if lora_rank > 0:
            # low-rank factors: the weight update is delta_W = lora_a @ lora_b, shape (out_features, in_features)
            self.lora_a = nn.Parameter(torch.zeros(out_features, lora_rank))
            nn.init.kaiming_normal_(self.lora_a, a=0.01)
            # lora_b starts at zero so the initial update is zero and training starts from the frozen weight
            self.lora_b = nn.Parameter(torch.zeros(lora_rank, in_features))
            self.scaling = lora_alpha / lora_rank
            # freeze the original linear layer; only the LoRA factors are trained
            self.linear.weight.requires_grad = False
            self.linear.bias.requires_grad = False
        self.dropout = nn.Dropout(lora_dropout) if lora_dropout > 0 else nn.Identity()
        if merge:
            self.merge_weights()
    def forward(self, x):
        if self.lora_rank > 0 and not self.merge:
            # frozen weight plus the scaled low-rank update
            output = self.linear(x) + self.scaling * (x @ (self.lora_a @ self.lora_b).T)
        else:
            # either no LoRA, or the update has already been merged into self.linear.weight
            output = self.linear(x)
        return self.dropout(output)

    def merge_weights(self):
        if self.lora_rank > 0 and not self.merge:
            self.linear.weight.data += self.scaling * (self.lora_a @ self.lora_b)
            self.merge = True

    def unmerge_weights(self):
        if self.lora_rank > 0 and self.merge:
            self.linear.weight.data -= self.scaling * (self.lora_a @ self.lora_b)
            self.merge = False

batch_size = 2
seq_len = 4
in_features = 8
out_features = 16
lora_rank = 8
lora_alpha = 16

x = torch.randn(batch_size, seq_len, in_features)
# dropout is set to 0 so the merge/unmerge checks below are deterministic
lora_layer = LinearLoRALayer(in_features, out_features, merge=False,
                             lora_rank=lora_rank, lora_alpha=lora_alpha, lora_dropout=0.0)
# give lora_b non-zero values, otherwise the low-rank update is zero and the checks are trivial
nn.init.normal_(lora_layer.lora_b, std=0.02)
output = lora_layer(x)
print(output.shape)

# merging the update into linear.weight must not change the output
lora_layer.merge_weights()
output_after_merge = lora_layer(x)
print(torch.max(torch.abs(output - output_after_merge)).item())  # ~0

# unmerging restores the original frozen weight and the unmerged forward path
lora_layer.unmerge_weights()
output_after_unmerge = lora_layer(x)
print(torch.max(torch.abs(output - output_after_unmerge)).item())  # ~0

GQA

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class GQA(nn.Module):
    def __init__(self,hidden_dim,nums_head,nums_key_value_head):
        super(GQA, self).__init__()
        self.hidden_dim = hidden_dim
        self.nums_head = nums_head
        self.nums_key_value_head = nums_key_value_head
        self.head_dim = hidden_dim // nums_head
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        self.k_proj = nn.Linear(hidden_dim, nums_key_value_head * self.head_dim)
        self.v_proj = nn.Linear(hidden_dim, nums_key_value_head * self.head_dim)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim)
    def forward(self, x, attention_mask=None):
        batch_size, seq_len, _ = x.size()
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)
        # split heads: q has nums_head heads, k/v only have nums_key_value_head heads
        q = q.view(batch_size, seq_len, self.nums_head, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.nums_key_value_head, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.nums_key_value_head, self.head_dim).transpose(1, 2)
        # repeat the key/value heads so every query head has a matching key/value head
        k = k.repeat_interleave(self.nums_head // self.nums_key_value_head, dim=1)
        v = v.repeat_interleave(self.nums_head // self.nums_key_value_head, dim=1)
        # scaled dot-product attention
        attention_score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            attention_score = attention_score.masked_fill(attention_mask == 0, -1e9)
        attention_weight = F.softmax(attention_score, dim=-1)
        attention_output = torch.matmul(attention_weight, v)
        # merge heads back and project
        attention_output = attention_output.transpose(1, 2).contiguous()
        output = self.out_proj(attention_output.view(batch_size, seq_len, self.hidden_dim))
        return output
    
x=torch.randn(2,4,8)
model=GQA(hidden_dim=8,nums_head=4,nums_key_value_head=2)
output=model(x)
print(output.shape)
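
# Example of passing an attention mask (a sketch): positions where the mask is 0 are filled with
# -1e9 before the softmax, and a causal lower-triangular mask of shape (1, 1, seq_len, seq_len)
# broadcasts over the batch and head dimensions of the attention scores.
causal_mask = torch.tril(torch.ones(4, 4)).view(1, 1, 4, 4)
output_causal = model(x, attention_mask=causal_mask)
print(output_causal.shape)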
