auc
def get_auc(label, score):
    # Pairwise AUC: fraction of (positive, negative) pairs ranked correctly,
    # counting ties as 0.5.
    pos = [i for i, item in enumerate(label) if item == 1]
    neg = [i for i, item in enumerate(label) if item == 0]
    if not pos or not neg:
        return 0.5
    ans = 0
    for p in pos:
        for n in neg:
            if score[p] > score[n]:
                ans += 1
            elif score[p] == score[n]:
                ans += 0.5
    return ans / (len(pos) * len(neg))
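A quick sanity check; the labels and scores below are made-up illustrative values, not from any dataset:

if __name__ == "__main__":
    label = [1, 0, 1, 1, 0]
    score = [0.9, 0.4, 0.35, 0.8, 0.1]
    print(get_auc(label, score))  # 5 of 6 positive-negative pairs ranked correctly -> ~0.8333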
Linear regression
import numpy as np

class LR:
    def __init__(self, num_input):
        self.w = np.random.normal(scale=0.01, size=(num_input, 1))
        self.b = np.random.normal(scale=0.01, size=(1,))

    def forward(self, input):
        return np.dot(input, self.w) + self.b

    def mse_loss(self, pre, label):
        label = label.reshape(pre.shape)
        return np.mean((pre - label) ** 2) / 2

    def sgd_update(self, lr, input, pre, label):
        batch_size = input.shape[0]
        error = pre - label.reshape(pre.shape)
        dw = np.dot(input.T, error) / batch_size
        self.w -= lr * dw
        db = np.mean(error)
        self.b -= lr * db

    def train(self, lr, inputs, labels, epochs, batch_size=32):
        inputs = np.array(inputs)
        labels = np.array(labels).reshape(-1, 1)
        num_samples = inputs.shape[0]
        num_batches = (num_samples + batch_size - 1) // batch_size
        for epoch in range(epochs):
            total_loss = 0
            for i in range(0, num_samples, batch_size):
                batch_input = inputs[i:i + batch_size]
                batch_label = labels[i:i + batch_size]
                output = self.forward(batch_input)
                loss = self.mse_loss(output, batch_label)
                total_loss += loss
                self.sgd_update(lr, batch_input, output, batch_label)
            avg_loss = total_loss / num_batches
            if (epoch + 1) % 100 == 0:
                print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.6f}")

if __name__ == "__main__":
    np.random.seed(42)
    x = np.random.rand(1000, 2)
    y = 3 * x[:, 0] + 2 * x[:, 1] + 5 + np.random.randn(1000) * 0.1
    model = LR(num_input=2)
    model.train(lr=0.01, inputs=x, labels=y, epochs=1000, batch_size=32)
    print(model.w.flatten())
    print(model.b[0])
Logistic regression
import numpy as np

class LogisticRegression:
    def __init__(self, num_features):
        self.w = np.random.normal(scale=0.01, size=(num_features, 1))
        self.b = np.random.normal(scale=0.01, size=(1,))

    def sigmoid(self, z):
        z = np.clip(z, -100, 100)
        return 1 / (1 + np.exp(-z))

    def forward(self, input):
        return self.sigmoid(np.dot(input, self.w) + self.b)

    def bce(self, pre, label):
        label = label.reshape(pre.shape)
        pre = np.clip(pre, 1e-10, 1 - 1e-10)
        return -np.mean(label * np.log(pre) + (1 - label) * np.log(1 - pre))

    def sgd_update(self, lr, input, pre, label):
        batch_size = input.shape[0]
        error = pre - label.reshape(pre.shape)
        dw = np.dot(input.T, error) / batch_size
        self.w -= lr * dw
        db = np.mean(error)
        self.b -= lr * db

    def train(self, inputs, labels, epochs, lr, batch_size=32):
        inputs = np.array(inputs)
        labels = np.array(labels).reshape(-1, 1)
        num_samples = inputs.shape[0]
        num_batches = (num_samples + batch_size - 1) // batch_size
        for epoch in range(epochs):
            total_loss = 0
            for i in range(0, num_samples, batch_size):
                batch_input = inputs[i:i + batch_size]
                batch_label = labels[i:i + batch_size]
                output = self.forward(batch_input)
                loss = self.bce(output, batch_label)
                total_loss += loss
                self.sgd_update(lr, batch_input, output, batch_label)
            avg_loss = total_loss / num_batches
            if (epoch + 1) % 100 == 0:
                print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.6f}")

    def predict(self, x, threshold=0.5):
        y = self.forward(x)
        return (y >= threshold).astype(int)

    def predict_prob(self, x):
        return self.forward(x)

if __name__ == "__main__":
    np.random.seed(42)
    class0 = np.random.randn(500, 2) * 0.5 + np.array([1, 1])
    class1 = np.random.randn(500, 2) * 0.5 + np.array([3, 3])
    X = np.vstack([class0, class1])
    y = np.hstack([np.zeros(500), np.ones(500)])
    model = LogisticRegression(num_features=2)
    model.train(X, y, epochs=1000, lr=0.1, batch_size=32)
    y_pred = model.predict(X)
    accuracy = np.mean(y_pred.flatten() == y)
    print(f"\nAccuracy: {accuracy:.4f}")
    print("Learned weights:", model.w.flatten())
    print("Learned bias:", model.b[0])
batchnorm
import numpy as np

class BN:
    def __init__(self, momentum=0.9, eps=1e-5, feat_dim=2):
        # Running statistics used at inference time
        self._running_mean = np.zeros(shape=(feat_dim,))
        self._running_var = np.ones(shape=(feat_dim,))
        self._momentum = momentum
        self._eps = eps
        # Learnable affine parameters
        self._beta = np.zeros(shape=(feat_dim,))
        self._gamma = np.ones(shape=(feat_dim,))
        self.training = True

    def forward(self, x):
        if self.training:
            # Normalize with batch statistics and update the running estimates
            x_mean = x.mean(axis=0)
            x_var = x.var(axis=0, ddof=0)
            self._running_mean = self._momentum * self._running_mean + (1 - self._momentum) * x_mean
            self._running_var = self._momentum * self._running_var + (1 - self._momentum) * x_var
            x_hat = (x - x_mean) / np.sqrt(x_var + self._eps)
        else:
            # Normalize with the running statistics at inference time
            x_hat = (x - self._running_mean) / np.sqrt(self._running_var + self._eps)
        return self._gamma * x_hat + self._beta

if __name__ == "__main__":
    x = np.random.randn(5, 2)
    bn = BN(momentum=0.9, feat_dim=2)
    bn.forward(x)
    bn.training = False
    print(bn.forward(x))
layernorm
import numpy as np

class LayerNorm:
    def __init__(self, eps=1e-5, feat_dim=None):
        """
        Initialize a LayerNorm layer.
        :param eps: small constant that keeps the denominator away from zero, default 1e-5
        :param feat_dim: feature dimension, used to initialize the learnable parameters
        """
        self.eps = eps
        self.gamma = np.ones(feat_dim) if feat_dim is not None else None
        self.beta = np.zeros(feat_dim) if feat_dim is not None else None
        self.training = True

    def forward(self, x):
        """
        Forward pass.
        :param x: input of shape (..., feat_dim), e.g. (batch_size, seq_len, feat_dim) or (batch_size, feat_dim)
        :return: normalized, scaled and shifted output with the same shape as x
        """
        mean = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True, ddof=0)
        x_hat = (x - mean) / np.sqrt(var + self.eps)
        if self.gamma is None:
            # Lazily create the affine parameters on first use
            self.gamma = np.ones(x.shape[-1])
            self.beta = np.zeros(x.shape[-1])
        return self.gamma * x_hat + self.beta

if __name__ == "__main__":
    x_2d = np.random.randn(3, 5)
    ln_2d = LayerNorm(feat_dim=5)
    out_2d = ln_2d.forward(x_2d)
    print("Output shape for 2D input:", out_2d.shape)
    print("Mean over the feature dimension (should be close to 0):", out_2d.mean(axis=-1))
    print("Variance over the feature dimension (should be close to 1):\n", out_2d.var(axis=-1, ddof=0))
    x_3d = np.random.randn(2, 4, 6)
    ln_3d = LayerNorm()
    out_3d = ln_3d.forward(x_3d)
    print("\nOutput shape for 3D input:", out_3d.shape)
    print("Mean over the feature dimension (should be close to 0):\n", out_3d.mean(axis=-1))
RMSNorm
import numpy as np

class RMSNorm:
    def __init__(self, dim, eps=1e-8):
        self.eps = eps
        self.scale = np.ones(dim)

    def forward(self, x):
        # Normalize by the root mean square over the last dimension (no mean subtraction)
        mean_square = np.mean(x ** 2, axis=-1, keepdims=True)
        rms = np.sqrt(mean_square + self.eps)
        x_normalized = x / rms
        output = self.scale * x_normalized
        return output

if __name__ == "__main__":
    x = np.random.rand(4, 2)
    rms_norm = RMSNorm(dim=x.shape[-1])
    output = rms_norm.forward(x)
    print("Input shape:", x.shape)
    print("Output shape:", output.shape)
    print("Output:\n", output)
KL divergence
import numpy as np
import torch

def kl_torch(p, q, eps=1e-9):
    """KL divergence, PyTorch version (inputs are torch.Tensor logits, converted via softmax)."""
    p = torch.softmax(p, dim=-1)
    q = torch.softmax(q, dim=-1)
    p = torch.clamp(p, eps, 1.0)
    q = torch.clamp(q, eps, 1.0)
    kl_div = p * (torch.log(p) - torch.log(q))
    return torch.sum(kl_div, dim=-1)

def kl_numpy(p, q, eps=1e-10):
    """KL divergence, NumPy version (inputs are numpy.ndarray, normalized to sum to 1 internally)."""
    p = p / np.sum(p)
    q = q / np.sum(q)
    p = np.clip(p, eps, 1.0)
    q = np.clip(q, eps, 1.0)
    kl = np.sum(p * np.log(p / q))
    return kl

if __name__ == "__main__":
    p1 = np.array([0.2, 0.3, 0.5])
    q1 = np.array([0.2, 0.3, 0.5])
    print(f"NumPy KL divergence, identical distributions: {kl_numpy(p1, q1):.4f}")
    p2 = np.array([0.8, 0.2])
    q2 = np.array([0.5, 0.5])
    print(f"NumPy KL divergence, different distributions: {kl_numpy(p2, q2):.4f}")
    torch.manual_seed(42)
    p_torch = torch.randn(3, 5)
    q_torch = torch.randn(3, 5)
    same_kl = kl_torch(p_torch, p_torch)
    print(f"PyTorch KL divergence, identical distributions: {same_kl}")
    diff_kl = kl_torch(p_torch, q_torch)
    print(f"PyTorch KL divergence, different distributions: {diff_kl}")
Cross entropy
import torch

logits = torch.randn(3, 5)
labels = torch.tensor([2, 0, 4])

def softmax(logits):
    # Subtract the row-wise max for numerical stability
    max_vals = torch.max(logits, dim=1, keepdim=True).values
    exp_logits = torch.exp(logits - max_vals)
    return exp_logits / torch.sum(exp_logits, dim=1, keepdim=True)

def cross_entropy(logits, labels):
    probs = softmax(logits)
    probs = torch.clamp(probs, min=1e-9, max=1.0)
    # Pick the predicted probability of the true class for each sample
    selected_probs = probs.gather(dim=1, index=labels.unsqueeze(1))
    loss = -torch.log(selected_probs)
    return torch.mean(loss)

custom_loss = cross_entropy(logits, labels)
print(f"Custom cross-entropy loss: {custom_loss.item()}")
official_loss = torch.nn.functional.cross_entropy(logits, labels)
print(f"PyTorch built-in cross-entropy loss: {official_loss.item()}")
Contrastive loss
import torch
import torch.nn.functional as F

def infonce_loss(anchor, positive, negatives, temperature=0.1):
    """
    InfoNCE contrastive loss.
    Args:
        anchor: anchor features, shape [batch_size, feature_dim]
        positive: positive features, shape [batch_size, feature_dim]
        negatives: negative features, shape [batch_size, num_negatives, feature_dim]
        temperature: temperature coefficient controlling the sharpness of the softmax
    Returns:
        Mean InfoNCE loss
    """
    pos_sim = torch.sum(anchor * positive, dim=1, keepdim=True)
    neg_sim = torch.sum(anchor.unsqueeze(1) * negatives, dim=2)
    # Column 0 holds the positive similarity, the remaining columns are negatives
    logits = torch.cat([pos_sim, neg_sim], dim=1)
    logits /= temperature
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=anchor.device)
    loss = F.cross_entropy(logits, labels)
    return loss

if __name__ == "__main__":
    batch_size = 3
    feature_dim = 128
    num_negatives = 5
    anchor = F.normalize(torch.randn(batch_size, feature_dim), dim=1)
    positive = F.normalize(anchor + 0.1 * torch.randn_like(anchor), dim=1)
    negatives = F.normalize(torch.randn(batch_size, num_negatives, feature_dim), dim=2)
    loss = infonce_loss(anchor, positive, negatives, temperature=0.1)
    print(f"InfoNCE loss: {loss.item()}")
swiglu
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLU(nn.Module):
    """
    SwiGLU activation.
    Formula: SwiGLU(x) = x1 * Swish(x2), where x1 and x2 are the two halves of the projected input
    and Swish(x) = x * sigmoid(beta * x), usually with beta = 1.
    """
    def __init__(self, dim_in, dim_out=None):
        super().__init__()
        dim_out = dim_out or dim_in
        # A single projection produces both the value half and the gate half
        self.w = nn.Linear(dim_in, 2 * dim_out)

    def forward(self, x):
        x = self.w(x)
        x1, x2 = torch.chunk(x, 2, dim=-1)
        return x1 * F.silu(x2)

if __name__ == "__main__":
    x = torch.randn(2, 3, 8)
    swiglu = SwiGLU(dim_in=8, dim_out=4)
    output = swiglu(x)
    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")
    print(f"Output:\n{output}")
Focalloss
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction="mean"):
        super(FocalLoss, self).__init__()
        if alpha is not None:
            if isinstance(alpha, (float, int)):
                # A scalar alpha is interpreted as the class-0 weight of a binary problem
                self.alpha = torch.tensor([alpha, 1 - alpha])
            else:
                self.alpha = torch.tensor(alpha)
        else:
            self.alpha = None
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction="none")
        # pt is the model's probability for the true class
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss
        if self.alpha is not None:
            self.alpha = self.alpha.to(inputs.device)
            alpha = self.alpha.gather(0, targets)
            focal_loss = alpha * focal_loss
        if self.reduction == "mean":
            return focal_loss.mean()
        elif self.reduction == "sum":
            return focal_loss.sum()
        else:
            return focal_loss
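Unlike the other snippets, this one ships without a demo; a minimal usage sketch follows, with made-up logits, targets, and per-class weights chosen purely for illustration:

if __name__ == "__main__":
    torch.manual_seed(0)
    logits = torch.randn(8, 3)                     # 8 samples, 3 classes
    targets = torch.randint(0, 3, (8,))
    criterion = FocalLoss(alpha=[0.2, 0.3, 0.5], gamma=2.0)
    loss = criterion(logits, targets)
    print(f"Focal loss: {loss.item()}")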
MOE
import torch
import torch.nn as nn
import torch.nn.functional as F

class Expert(nn.Module):
    """Expert network: a simple two-layer fully connected network."""
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.activation(x)
        x = self.fc2(x)
        return x

class MoE(nn.Module):
    """Mixture-of-experts model: several experts plus a router."""
    def __init__(self, input_dim, output_dim, num_experts=4, hidden_dim=64, top_k=2):
        super().__init__()
        self.num_experts = num_experts
        self.output_dim = output_dim
        self.top_k = top_k
        self.experts = nn.ModuleList([
            Expert(input_dim, hidden_dim, output_dim)
            for _ in range(num_experts)
        ])
        self.router = nn.Linear(input_dim, num_experts)

    def forward(self, x):
        # Route each sample to its top-k experts and renormalize their weights
        router_logits = self.router(x)
        router_weights = F.softmax(router_logits, dim=1)
        top_k_weights, top_k_indices = torch.topk(router_weights, self.top_k, dim=1)
        top_k_weights = top_k_weights / torch.sum(top_k_weights, dim=1, keepdim=True)
        batch_size = x.shape[0]
        output = torch.zeros(batch_size, self.output_dim, device=x.device)
        for i in range(self.top_k):
            expert_idx = top_k_indices[:, i]
            weights = top_k_weights[:, i].unsqueeze(1)
            # Dispatch sample by sample (simple but not vectorized)
            for batch_idx in range(batch_size):
                expert = self.experts[expert_idx[batch_idx]]
                output[batch_idx] += weights[batch_idx] * expert(x[batch_idx])
        return output

if __name__ == "__main__":
    input_dim = 32
    output_dim = 32
    batch_size = 8
    moe = MoE(input_dim, output_dim, num_experts=4, top_k=2)
    x = torch.randn(batch_size, input_dim)
    output = moe(x)
    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")
LORA
import torch
import torch.nn as nn

class LinearLoRALayer(nn.Module):
    def __init__(self, in_features, out_features, merge=False, lora_rank=1, lora_alpha=1, lora_dropout=0.0):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.lora_rank = lora_rank
        self.merge = False
        self.linear = nn.Linear(in_features, out_features)
        if lora_rank > 0:
            # Low-rank factors: delta_W = lora_a @ lora_b with shape (out_features, in_features)
            self.lora_a = nn.Parameter(torch.zeros(out_features, lora_rank))
            nn.init.kaiming_normal_(self.lora_a, a=0.01)
            # lora_b starts at zero, so the layer initially matches the frozen linear layer
            self.lora_b = nn.Parameter(torch.zeros(lora_rank, in_features))
            self.scaling = lora_alpha / lora_rank
            self.linear.weight.requires_grad = False
            self.linear.bias.requires_grad = False
        self.dropout = nn.Dropout(lora_dropout) if lora_dropout > 0 else nn.Identity()
        if merge:
            self.merge_weights()

    def forward(self, x):
        if self.lora_rank > 0 and not self.merge:
            # Frozen linear output plus the scaled low-rank update
            output = self.linear(x) + self.scaling * (x @ (self.lora_a @ self.lora_b).T)
        else:
            # Either no LoRA, or the update has already been merged into the weights
            output = self.linear(x)
        return self.dropout(output)

    def merge_weights(self):
        if self.lora_rank > 0 and not self.merge:
            self.linear.weight.data += self.scaling * (self.lora_a @ self.lora_b)
            self.merge = True

    def unmerge_weights(self):
        if self.lora_rank > 0 and self.merge:
            self.linear.weight.data -= self.scaling * (self.lora_a @ self.lora_b)
            self.merge = False

batch_size = 2
seq_len = 4
in_features = 8
out_features = 16
lora_rank = 8
lora_alpha = 16
lora_dropout = 0.1
x = torch.randn(batch_size, seq_len, in_features)

lora_layer = LinearLoRALayer(in_features, out_features, merge=False, lora_rank=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout)
lora_layer.eval()  # disable dropout so the consistency checks are deterministic
output = lora_layer(x)
print(output.shape)

layer_merged = LinearLoRALayer(in_features, out_features, merge=True, lora_rank=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout)
output_merged = layer_merged(x)
print(output_merged.shape)

# Merging should not change the layer's output, and unmerging should restore the original weights
lora_layer.merge_weights()
output_after_merge = lora_layer(x)
lora_layer.unmerge_weights()
output_after_unmerge = lora_layer(x)
print(torch.max(torch.abs(output - output_after_merge)).item())
print(torch.max(torch.abs(output - output_after_unmerge)).item())
GQA
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class GQA(nn.Module):
    def __init__(self, hidden_dim, nums_head, nums_key_value_head):
        super(GQA, self).__init__()
        self.hidden_dim = hidden_dim
        self.nums_head = nums_head
        self.nums_key_value_head = nums_key_value_head
        self.head_dim = hidden_dim // nums_head
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)
        # K/V are projected to fewer heads than Q (grouped-query attention)
        self.k_proj = nn.Linear(hidden_dim, nums_key_value_head * self.head_dim)
        self.v_proj = nn.Linear(hidden_dim, nums_key_value_head * self.head_dim)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x, attention_mask=None):
        batch_size, seq_len, _ = x.size()
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)
        q = q.view(batch_size, seq_len, self.nums_head, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.nums_key_value_head, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.nums_key_value_head, self.head_dim).transpose(1, 2)
        # Repeat each K/V head so every group of query heads shares one key/value head
        k = k.repeat_interleave(self.nums_head // self.nums_key_value_head, dim=1)
        v = v.repeat_interleave(self.nums_head // self.nums_key_value_head, dim=1)
        attention_score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            attention_score = attention_score.masked_fill(attention_mask == 0, -1e9)
        attention_weight = F.softmax(attention_score, dim=-1)
        attention_output = torch.matmul(attention_weight, v)
        attention_output = attention_output.transpose(1, 2).contiguous()
        output = self.out_proj(attention_output.view(batch_size, seq_len, self.hidden_dim))
        return output

x = torch.randn(2, 4, 8)
model = GQA(hidden_dim=8, nums_head=4, nums_key_value_head=2)
output = model(x)
print(output.shape)