Let's start with step one.
# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
import re
import math
from typing import Optional, Tuple, Dict, List, Any
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import random
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
# %% [markdown]
# RoPE: Rotary Position Embedding (implemented from scratch)
# %%
def precompute_freqs_cis(
hidden_size: int,
max_seq_len: int,
base: int = 10000,
num_attention_heads: int = 8
) -> torch.Tensor:
"""
预计算复数形式的旋转位置编码 (RoPE)
Args:
hidden_size: 模型维度
max_seq_len: 最大序列长度
base: 频率基数
num_attention_heads: 注意力头数
Returns:
freqs_cis: complex tensor of shape (max_seq_len, head_dim // 2)
"""
head_dim = hidden_size // num_attention_heads
inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
t = torch.arange(max_seq_len) # 位置索引 [0, 1, ..., S-1]
freqs = torch.einsum("s,d->sd", t, inv_freq) # (S, D//2)
# 转为复数角度:cosθ + i*sinθ
freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
return freqs_cis # shape: (max_seq_len, head_dim // 2)
def _rotate_half(x: torch.Tensor) -> torch.Tensor:
"""
在最后一个维度上,将后半部分移到前面并取负。
例如:x = [x0, x1, x2, x3] -> [-x2, -x3, x0, x1]
这对应于乘以 i 的操作(复数旋转90度)
"""
x1, x2 = x.chunk(2, dim=-1) # 分成前后两半
return torch.cat((-x2, x1), dim=-1) # 后半取负放前
def apply_rotary_pos_emb(
q: torch.Tensor,
k: torch.Tensor,
freqs_cis: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
将 RoPE 应用于 query 和 key 张量。
Args:
q: (bsz, n_heads, seq_len, head_dim)
k: (bsz, n_heads, seq_len, head_dim)
freqs_cis: (seq_len, head_dim // 2)
Returns:
q_embed, k_embed
"""
# 提取实部和虚部,并扩展维度以便广播
cos = freqs_cis.real.view(1, 1, -1, 1) # (1, 1, S, 1)
sin = freqs_cis.imag.view(1, 1, -1, 1)
# 使用标准公式: q * cos + rotate_half(q) * sin
q_out = (q * cos[:, :, :q.size(2), :]) + (_rotate_half(q) * sin[:, :, :q.size(2), :])
k_out = (k * cos[:, :, :k.size(2), :]) + (_rotate_half(k) * sin[:, :, :k.size(2), :])
return q_out.type_as(q), k_out.type_as(k)
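# %% [markdown]
# A quick sanity check for the RoPE helpers above (a minimal sketch; the toy sizes below are chosen only for illustration).
# Besides confirming shapes, it verifies the key property of RoPE: the dot product between a rotated query and key
# depends only on their relative offset, so the position pairs (2, 0) and (7, 5) should give the same score.
# %%
_heads, _hdim, _seq = 2, 8, 16                      # toy sizes (assumption, not the model config)
_freqs = precompute_freqs_cis(hidden_size=_heads * _hdim, max_seq_len=_seq,
                              num_attention_heads=_heads)
_q = torch.randn(1, _heads, _seq, _hdim)
_k = torch.randn(1, _heads, _seq, _hdim)
_q_rot, _k_rot = apply_rotary_pos_emb(_q, _k, _freqs)
print(_q_rot.shape, _k_rot.shape)                   # shapes are unchanged: (1, 2, 16, 8)
# Place the same query/key vector at every position, rotate once:
_qv = torch.randn(1, 1, 1, _hdim).expand(1, 1, _seq, _hdim)
_kv = torch.randn(1, 1, 1, _hdim).expand(1, 1, _seq, _hdim)
_qr, _kr = apply_rotary_pos_emb(_qv, _kv, _freqs)
_s1 = (_qr[0, 0, 2] * _kr[0, 0, 0]).sum()           # query at pos 2, key at pos 0 (offset 2)
_s2 = (_qr[0, 0, 7] * _kr[0, 0, 5]).sum()           # query at pos 7, key at pos 5 (offset 2)
print(torch.allclose(_s1, _s2, atol=1e-4))          # expected: True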
# %% [markdown]
# Tokenization layer
# %%
class CharLevelTokenizer:
"""
字符级 Tokenizer
- 所有普通内容(数字、字母、标点)按字符拆解
- 特殊 token [BOS], [EOS], [UNK] 不可拆解,作为原子单位
- 支持自动添加 BOS/EOS
"""
def __init__(self):
self.char_to_id: Dict[str, int] = {}
self.special_tokens = ['[BOS]', '[EOS]', '[PAD]', '[UNK]']
next_id = 0
# Step 1: 数字 0-9
for c in '0123456789':
self.char_to_id[c] = next_id
next_id += 1
# Step 2: 字母 a-z
alphabet = 'abcdefghijklmnopqrstuvwxyz'
for c in alphabet:
if c not in self.char_to_id:
self.char_to_id[c] = next_id
next_id += 1
# Step 3: 标点符号
for c in '?!.,;:':
if c not in self.char_to_id:
self.char_to_id[c] = next_id
next_id += 1
# Step 4: 分隔符(重要!用于 token 间分割)
self.char_to_id[' '] = next_id
next_id += 1
# Step 5: 添加特殊 token(整体,不分拆)
for tok in self.special_tokens:
self.char_to_id[tok] = next_id
next_id += 1
# 构建逆映射
self.id_to_char = {v: k for k, v in self.char_to_id.items()}
# 缓存
self._cache = {}
def encode(
self,
tokens: List[Any],
add_bos: bool = True,
add_eos: bool = False
) -> torch.Tensor:
"""
编码 token 列表为 ID 张量
:param tokens: 输入列表,如 ["the", "next", 260, "?"]
:param add_bos: 是否添加 [BOS]
:param add_eos: 是否添加 [EOS]
:return: (1, L) 形状的 long 张量
"""
input_ids = []
# 添加 BOS
if add_bos:
input_ids.append(self.char_to_id['[BOS]'])
for token in tokens:
key = str(token).lower()
# 判断是否为特殊 token(直接查表)
if key in self.special_tokens:
input_ids.append(self.char_to_id[key])
continue
# 普通 token:逐字符编码
for char in key:
if char in self.char_to_id:
input_ids.append(self.char_to_id[char])
else:
input_ids.append(self.char_to_id['[UNK]'])
# 添加空格分隔符
input_ids.append(self.char_to_id[' '])
# 添加 EOS
if add_eos:
input_ids.append(self.char_to_id['[EOS]'])
return torch.tensor([input_ids], dtype=torch.long)
    def decode(self, token_ids: torch.Tensor) -> str:
        """
        Decode token IDs into a string.
        Note: [BOS], [EOS], [UNK] stay atomic and are never split.
        """
        if token_ids.dim() == 2:
            ids = token_ids.squeeze(0).tolist()
        else:
            ids = token_ids.tolist()
        pieces = []
        for idx in ids:
            if idx in self.id_to_char:
                tok = self.id_to_char[idx]
                # Special tokens are handled separately (kept here; they could also be filtered out)
                if tok in ['[BOS]', '[EOS]', '[PAD]']:
                    pieces.append(tok)
                elif tok == ' ':
                    pieces.append(' ')  # keep the separator so adjacent tokens (e.g. two numbers) do not merge
                else:
                    pieces.append(tok)
            else:
                pieces.append('[UNK]')
        return ''.join(pieces)
    def extract_last_number(self, text: str) -> str | None:
        """Extract the last run of consecutive digits from the text."""
        matches = re.findall(r'\d+', text)
        return matches[-1] if matches else None
    def decode_with_number(self, token_ids: torch.Tensor) -> Tuple[str, str | None]:
        """Decode and extract the predicted number."""
        full_text = self.decode(token_ids)
        number = self.extract_last_number(full_text)
        return full_text, number
@property
def vocab_size(self) -> int:
return len(self.char_to_id)
def __len__(self) -> int:
return self.vocab_size
@property
def bos_token_id(self) -> int:
return self.char_to_id['[BOS]']
@property
def eos_token_id(self) -> int:
return self.char_to_id['[EOS]']
@property
def unk_token_id(self) -> int:
return self.char_to_id['[UNK]']
@property
def pad_token_id(self) -> int:
return self.char_to_id['[PAD]']
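# %% [markdown]
# A small round-trip example for the tokenizer (the token list below is made up for illustration):
# %%
_tok = CharLevelTokenizer()
print("vocab size:", len(_tok))                      # digits + letters + punctuation + space + special tokens
_ids = _tok.encode(["the", "next", 260, "?"], add_bos=True, add_eos=True)
print(_ids.shape)                                    # (1, L)
_text, _num = _tok.decode_with_number(_ids)
print(_text, "->", _num)                             # the last number in the decoded text, here "260"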
# %% [markdown]
# Embedding layer
# %%
class SimpleEmbedding(nn.Module):
def __init__(self, vocab_size: int, hidden_size: int, max_position_embeddings: int):
super().__init__()
self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
self.dropout = nn.Dropout(0.1)
        # Initialization
self.word_embeddings.weight.data.normal_(mean=0.0, std=0.02)
self.position_embeddings.weight.data.normal_(mean=0.0, std=0.02)
def forward(self, input_ids: torch.LongTensor, position_ids: Optional[torch.LongTensor] = None):
batch_size, seq_length = input_ids.shape
if position_ids is None:
position_ids = torch.arange(seq_length, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)
word_embs = self.word_embeddings(input_ids)
pos_embs = self.position_embeddings(position_ids)
embeddings = word_embs + pos_embs
return self.dropout(embeddings)
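# %% [markdown]
# Shape check for the embedding layer (toy sizes, for illustration only):
# %%
_emb = SimpleEmbedding(vocab_size=50, hidden_size=32, max_position_embeddings=64)
_x = torch.randint(0, 50, (2, 10))                   # (batch_size, seq_len)
print(_emb(_x).shape)                                # torch.Size([2, 10, 32])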
# %% [markdown]
# Custom attention layer
# %%
class YiziAttention(nn.Module):
def __init__(self, hidden_size: int, num_heads: int):
super().__init__()
self.hidden_size = hidden_size
self.num_heads = num_heads
self.head_dim = hidden_size // num_heads
        assert self.head_dim * num_heads == hidden_size, "hidden_size must be divisible by num_heads"
        self.scale = self.head_dim ** -0.5  # scaling factor
        # QKV projection layers
        self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.o_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        # Store the most recent attention weights (for visualization)
        self.attn_weights = None
    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        bsz, seq_len, _ = x.shape
        # Project to Q, K, V
        query_states = self.q_proj(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = self.k_proj(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        # Apply RoPE
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, freqs_cis)
        # Scaled dot-product attention
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
        # Causal mask: position i may only attend to positions <= i (required for autoregressive LM training)
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float('-inf'), device=x.device), diagonal=1
        )
        attn_weights = torch.softmax(attn_weights + causal_mask, dim=-1)
        # Store the attention map (useful for visualization)
        self.attn_weights = attn_weights.detach()
        # Weighted sum over values, then merge heads
        attn_output = torch.matmul(attn_weights, value_states)
        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, self.hidden_size)
        return self.o_proj(attn_output)
# %% [markdown]
# Feed-forward network + residual connections
# %%
class YiziBlock(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, intermediate_size: int):
super().__init__()
self.attn = YiziAttention(hidden_size, num_heads)
self.norm1 = nn.LayerNorm(hidden_size)
self.mlp = nn.Sequential(
nn.Linear(hidden_size, intermediate_size),
nn.GELU(),
nn.Linear(intermediate_size, hidden_size)
)
self.norm2 = nn.LayerNorm(hidden_size)
def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        # Attention sub-layer with residual connection
        x = x + self.attn(self.norm1(x), freqs_cis)
        # MLP sub-layer with residual connection
        x = x + self.mlp(self.norm2(x))
return x
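# %% [markdown]
# A forward pass through a single block (toy sizes chosen only for illustration).
# The attention map stored on the block can later be used for visualization.
# %%
_h, _heads = 64, 4
_blk = YiziBlock(hidden_size=_h, num_heads=_heads, intermediate_size=4 * _h)
_freqs = precompute_freqs_cis(hidden_size=_h, max_seq_len=16, num_attention_heads=_heads)
_x = torch.randn(2, 16, _h)                          # (batch, seq_len, hidden)
print(_blk(_x, _freqs).shape)                        # torch.Size([2, 16, 64])
print(_blk.attn.attn_weights.shape)                  # (2, 4, 16, 16): one causal attention map per head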
# %% [markdown]
# Main model: YiziLM
# %%
class YiziLMConfig:
def __init__(
self,
vocab_size: int = 30000,
hidden_size: int = 512,
num_hidden_layers: int = 6,
num_attention_heads: int = 8,
max_position_embeddings: int = 8192,
intermediate_size: int = 2048,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.intermediate_size = intermediate_size
class YiziLM(nn.Module):
def __init__(self, config: YiziLMConfig):
super().__init__()
self.config = config
        # Standard learned token embeddings
self.embed_tokens = SimpleEmbedding(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
max_position_embeddings=config.max_position_embeddings
)
        # Precompute the RoPE frequencies and register them as a buffer
freqs_cis = precompute_freqs_cis(
hidden_size=config.hidden_size,
max_seq_len=config.max_position_embeddings,
base=10000,
num_attention_heads=config.num_attention_heads
)
self.register_buffer("freqs_cis", freqs_cis)
        # Stack of Transformer blocks
self.layers = nn.ModuleList([
YiziBlock(
hidden_size=config.hidden_size,
num_heads=config.num_attention_heads,
intermediate_size=config.intermediate_size
) for _ in range(config.num_hidden_layers)
])
        # Final layer norm
self.norm = nn.LayerNorm(config.hidden_size)
        # Language-model head (the model's only output)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.lm_head.weight = self.embed_tokens.word_embeddings.weight  # weight tying with the token embeddings
        # Initialize weights
self.apply(self._init_weights)
def _init_weights(self, module):
"""递归初始化"""
if isinstance(module, (nn.Linear, nn.Embedding)):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
if hasattr(module, 'bias') and module.bias is not None:
torch.nn.init.zeros_(module.bias)
def forward(
self,
input_ids: torch.LongTensor,
labels: Optional[torch.LongTensor] = None
) -> Dict[str, torch.Tensor]:
seq_len = input_ids.size(1)
freqs_cis = self.freqs_cis[:seq_len]
inputs_embeds = self.embed_tokens(input_ids)
hidden_states = inputs_embeds
for layer in self.layers:
hidden_states = layer(hidden_states, freqs_cis)
hidden_states = self.norm(hidden_states)
logits = self.lm_head(hidden_states)
output = {"logits": logits}
if labels is not None:
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
output["loss"] = loss
return output
@torch.no_grad()
def generate(
self,
input_ids: torch.LongTensor,
max_new_tokens: int = 128,
temperature: float = 0.7,
top_k: int = 50,
eos_token_id: Optional[int] = None,
) -> torch.LongTensor:
"""
纯语言自回归生成
"""
for _ in range(max_new_tokens):
outputs = self.forward(input_ids)
logits = outputs["logits"][:, -1, :] # 取最后一步预测
logit = logits / temperature
if top_k > 0:
v, _ = torch.topk(logit, min(top_k, logit.size(-1)))
pivot = v[:, [-1]]
logit = torch.where(logit < pivot, -float('inf'), logit)
probs = torch.softmax(logit, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
input_ids = torch.cat([input_ids, next_token], dim=1)
if eos_token_id is not None and next_token.item() == eos_token_id:
break
return input_ids
# %% [markdown]
# Initialize the model
# %%
tokenizer = CharLevelTokenizer()
config = YiziLMConfig(
vocab_size=len(tokenizer),
hidden_size=512,
num_hidden_layers=4,
num_attention_heads=8,
intermediate_size=2048,
max_position_embeddings=8192,
)
model = YiziLM(config)
total_params = sum(p.numel() for p in model.parameters())
print(f"🚀 Total Parameters: {total_params:,}")
# %% [markdown]
# # Training
# %% [markdown]
# Helper: encode a sample
# %%
def encode_sample(sample, tokenizer, max_length=128):
"""
将一个样本编码为 input_ids 和 labels
prompt + target 拼接,labels 中 prompt 部分为 -100
"""
prompt = sample["prompt"]
target = sample["target"]
# 转为字符串 token
prompt_tokens = [str(x).lower() for x in prompt]
target_tokens = [str(x). lower() for x in target]
# 编码
input_ids = tokenizer.encode(prompt_tokens, add_bos=True, add_eos=False).squeeze(0) # (L_in,)
label_ids = tokenizer.encode(target_tokens, add_bos=False, add_eos=True).squeeze(0) # (L_out,)
# 拼接完整序列
full_input_ids = torch.cat([input_ids, label_ids], dim=0)
# 构造 labels:仅 label 部分参与 loss 计算
full_labels = full_input_ids.clone()
full_labels[:len(input_ids)] = -100 # ignore prompt
return {
"input_ids": full_input_ids,
"labels": full_labels
}
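# %% [markdown]
# What one encoded sample looks like (the prompt/target below are made-up toy values):
# %%
_sample = {"prompt": ["the", "next", "number", "is"], "target": [260]}
_enc = encode_sample(_sample, tokenizer)
print(_enc["input_ids"].shape, _enc["labels"].shape)            # same length: prompt + target
print((_enc["labels"] == -100).sum().item(), "prompt positions are masked out of the loss")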
# %% [markdown]
# Dataset wrapper
# %%
class GeneralDataset(Dataset):
def __init__(self, data, tokenizer, max_length=128):
self.raw_data = data
self.tokenizer = tokenizer
self.max_length = max_length
def __len__(self):
return len(self.raw_data)
def __getitem__(self, idx):
sample = self.raw_data[idx]
return encode_sample(sample, self.tokenizer, self.max_length)
def collate_fn(batch):
    # Note: uses the module-level `tokenizer` (defined above) for the padding ID
    input_ids_list = [b["input_ids"] for b in batch]
    labels_list = [b["labels"] for b in batch]
    # Dynamic padding: inputs padded with [PAD], labels padded with -100 (ignored by the loss)
    input_ids = pad_sequence(input_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = pad_sequence(labels_list, batch_first=True, padding_value=-100)
return {
"input_ids": input_ids,
"labels": labels
}
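# %% [markdown]
# Batching sketch: the collate function pads input_ids with [PAD] and labels with -100, so padded positions
# are ignored by the loss (the two samples below are made up):
# %%
_toy = [
    {"prompt": ["1", "2", "3"], "target": [4]},
    {"prompt": ["the", "next", "number", "after", "9", "is"], "target": [10]},
]
_loader = DataLoader(GeneralDataset(_toy, tokenizer), batch_size=2, collate_fn=collate_fn)
_batch = next(iter(_loader))
print(_batch["input_ids"].shape, _batch["labels"].shape)        # both padded to the longer sample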
# %% [markdown]
# Training functions
# %%
@torch.no_grad()
def evaluate_generation(model, tokenizer, examples, device):
"""生成式评估:看模型是否能正确输出"""
model.eval()
results = []
for prompt in examples:
tokens = [str(x).lower() for x in prompt]
input_ids = tokenizer.encode(tokens, add_bos=True, add_eos=False).to(device)
output_ids = model.generate(
input_ids,
max_new_tokens=20,
temperature=0.7,
top_k=15,
eos_token_id=tokenizer.eos_token_id
)
text, num = tokenizer.decode_with_number(output_ids.cpu())
results.append((prompt, text, num))
return results
def train_model(
model,
train_data,
tokenizer,
    val_prompts=None,  # list of prompts used for generation checks during training
batch_size=4,
epochs=3,
lr=3e-4,
warmup_steps=100,
log_steps=10,
eval_steps=50,
save_path=None,
device=None
):
"""
    General-purpose training loop.
"""
if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Dataset and dataloader
dataset = GeneralDataset(train_data, tokenizer)
dataloader = DataLoader(
dataset,
batch_size=batch_size,
shuffle=True,
collate_fn=collate_fn
)
    # Prepare the model
model.to(device)
model.train()
    # Optimizer and learning-rate schedule
optimizer = AdamW(model.parameters(), lr=lr)
total_steps = len(dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)
    # Training loop
global_step = 0
for epoch in range(epochs):
print(f"\n{'='*20} Epoch {epoch+1}/{epochs} {'='*20}")
epoch_losses = []
progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")
for batch in progress_bar:
input_ids = batch["input_ids"].to(device) # (B, T)
labels = batch["labels"].to(device) # (B, T)
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.get('loss')
loss.backward()
optimizer.step()
scheduler.step()
optimizer.zero_grad()
epoch_losses.append(loss.item())
global_step += 1
if global_step % log_steps == 0:
avg_loss = np.mean(epoch_losses[-log_steps:])
progress_bar.set_postfix({"loss": f"{avg_loss:.4f}"})
if val_prompts and global_step % eval_steps == 0:
print(f"\n🔍 Step {global_step}: Generation Sample")
model.eval()
results = evaluate_generation(model, tokenizer, val_prompts, device)
for p, out, num in results:
print(f" Prompt: {' '.join(map(str, p))}")
print(f" Output: {out} → Num: {num}")
model.train()
avg_epoch_loss = np.mean(epoch_losses)
print(f"✅ Epoch {epoch+1} 完成 | 平均 Loss: {avg_epoch_loss:.4f}")
    # Final save
if save_path:
torch.save(model.state_dict(), save_path)
print(f"💾 模型权重已保存至: {save_path}")
Next: design a dataset suited to this model.