Beware of an oversized query_cache

When the query cache is very large and holds many cached queries, a single INSERT statement can invalidate a large share of them and hurt performance. This post looks at how to deal with that: shrink the cache, or move caching outside MySQL.

When query_cache_size is set very large, say 256 MB or more, and the cache holds a lot of queries (see the Qcache_queries_in_cache status variable), a single INSERT can invalidate a large number of cached entries for the tables it touches. The INSERT then becomes very slow, because it has to wait for that mass invalidation to finish. You can get a feel for the cost by timing FLUSH QUERY CACHE (which defragments the cache) or RESET QUERY CACHE (which actually empties it) against a cache of that size. The fix is either to reduce the query cache size, or to cache outside MySQL altogether, for example with memcached.
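A quick way to gauge the problem is to look at the query-cache variables and status counters and time FLUSH QUERY CACHE yourself. Below is a minimal sketch of that check from Python, assuming the PyMySQL client is installed and using placeholder connection settings; on servers that still have a query cache (it was removed in MySQL 8.0) the same statements can just as well be run directly in the mysql client.

# Minimal sketch: inspect query-cache size/usage and time FLUSH QUERY CACHE.
# Assumes PyMySQL is available; host/user/password below are placeholders.
import time
import pymysql

conn = pymysql.connect(host="127.0.0.1", user="monitor", password="***", database="mysql")
try:
    with conn.cursor() as cur:
        # How big the cache is configured to be
        cur.execute("SHOW VARIABLES LIKE 'query_cache_size'")
        for name, value in cur.fetchall():
            print(f"{name} = {value}")

        # How many queries it currently holds, free memory, hit/prune counters, ...
        cur.execute("SHOW STATUS LIKE 'Qcache%'")
        for name, value in cur.fetchall():
            print(f"{name} = {value}")

        # Time a defragmentation pass; a long pause here is a warning sign
        start = time.time()
        cur.execute("FLUSH QUERY CACHE")
        print(f"FLUSH QUERY CACHE took {time.time() - start:.2f}s")
finally:
    conn.close()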
So, let's start with step one:

# %%
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
import re
import math
from typing import Optional, Tuple, Dict, List, Any
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import random
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

# Fix random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# %% [markdown]
# RoPE: rotary position embedding (implemented from scratch)

# %%
def precompute_freqs_cis(
    hidden_size: int,
    max_seq_len: int,
    base: int = 10000,
    num_attention_heads: int = 8
) -> torch.Tensor:
    """
    Precompute the rotary position embedding (RoPE) in complex form.

    Args:
        hidden_size: model dimension
        max_seq_len: maximum sequence length
        base: frequency base
        num_attention_heads: number of attention heads

    Returns:
        freqs_cis: complex tensor of shape (max_seq_len, head_dim // 2)
    """
    head_dim = hidden_size // num_attention_heads
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    t = torch.arange(max_seq_len).float()  # position indices [0, 1, ..., S-1]
    freqs = torch.einsum("s,d->sd", t, inv_freq)  # (S, D//2)
    # Convert to complex angles: cos(theta) + i*sin(theta)
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    return freqs_cis  # shape: (max_seq_len, head_dim // 2)


def _rotate_half(x: torch.Tensor) -> torch.Tensor:
    """
    Along the last dimension, move the second half to the front and negate it.
    E.g. x = [x0, x1, x2, x3] -> [-x2, -x3, x0, x1]
    This corresponds to multiplying by i (a 90-degree rotation in the complex plane).
    """
    x1, x2 = x.chunk(2, dim=-1)  # split into front and back halves
    return torch.cat((-x2, x1), dim=-1)  # negated back half goes first


def apply_rotary_pos_emb(
    q: torch.Tensor,
    k: torch.Tensor,
    freqs_cis: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply RoPE to the query and key tensors.

    Args:
        q: (bsz, n_heads, seq_len, head_dim)
        k: (bsz, n_heads, seq_len, head_dim)
        freqs_cis: (seq_len, head_dim // 2)

    Returns:
        q_embed, k_embed
    """
    # Duplicate the per-frequency cos/sin so they cover the full head_dim,
    # matching the half-split convention of _rotate_half, then add broadcast
    # dimensions: (1, 1, S, head_dim)
    cos = torch.cat([freqs_cis.real, freqs_cis.real], dim=-1)[None, None, :, :]
    sin = torch.cat([freqs_cis.imag, freqs_cis.imag], dim=-1)[None, None, :, :]

    # Standard formula: x * cos + rotate_half(x) * sin
    q_out = (q * cos[:, :, :q.size(2), :]) + (_rotate_half(q) * sin[:, :, :q.size(2), :])
    k_out = (k * cos[:, :, :k.size(2), :]) + (_rotate_half(k) * sin[:, :, :k.size(2), :])
    return q_out.type_as(q), k_out.type_as(k)


# %% [markdown]
# Tokenization layer

# %%
class CharLevelTokenizer:
    """
    Character-level tokenizer.
    - All ordinary content (digits, letters, punctuation) is split into characters.
    - Special tokens [BOS], [EOS], [UNK] are atomic and never split.
    - Supports automatically adding BOS/EOS.
    """

    def __init__(self):
        self.char_to_id: Dict[str, int] = {}
        self.special_tokens = ['[BOS]', '[EOS]', '[PAD]', '[UNK]']
        next_id = 0

        # Step 1: digits 0-9
        for c in '0123456789':
            self.char_to_id[c] = next_id
            next_id += 1

        # Step 2: letters a-z
        alphabet = 'abcdefghijklmnopqrstuvwxyz'
        for c in alphabet:
            if c not in self.char_to_id:
                self.char_to_id[c] = next_id
                next_id += 1

        # Step 3: punctuation
        for c in '?!.,;:':
            if c not in self.char_to_id:
                self.char_to_id[c] = next_id
                next_id += 1

        # Step 4: separator (important! used to split between tokens)
        self.char_to_id[' '] = next_id
        next_id += 1

        # Step 5: special tokens (added whole, never split)
        for tok in self.special_tokens:
            self.char_to_id[tok] = next_id
            next_id += 1

        # Build the reverse mapping
        self.id_to_char = {v: k for k, v in self.char_to_id.items()}

        # Cache
        self._cache = {}

    def encode(
        self,
        tokens: List[Any],
        add_bos: bool = True,
        add_eos: bool = False
    ) -> torch.Tensor:
        """
        Encode a list of tokens into an ID tensor.
        :param tokens: input list, e.g. ["the", "next", 260, "?"]
        :param add_bos: whether to add [BOS]
        :param add_eos: whether to add [EOS]
        :return: long tensor of shape (1, L)
        """
        input_ids = []

        # Add BOS
        if add_bos:
            input_ids.append(self.char_to_id['[BOS]'])

        for token in tokens:
            raw = str(token)

            # Special tokens are looked up directly (before lower-casing)
            if raw in self.special_tokens:
                input_ids.append(self.char_to_id[raw])
                continue

            # Ordinary tokens: encode character by character
            for char in raw.lower():
                if char in self.char_to_id:
                    input_ids.append(self.char_to_id[char])
                else:
                    input_ids.append(self.char_to_id['[UNK]'])

            # Add the space separator
            input_ids.append(self.char_to_id[' '])

        # Add EOS
        if add_eos:
            input_ids.append(self.char_to_id['[EOS]'])

        return torch.tensor([input_ids], dtype=torch.long)

    def decode(self, token_ids: torch.Tensor) -> str:
        """
        Decode token IDs into a string.
        Note: [BOS], [EOS], [UNK] are never split apart.
        """
        if token_ids.dim() == 2:
            ids = token_ids.squeeze(0).tolist()
        else:
            ids = token_ids.tolist()

        pieces = []
        for idx in ids:
            if idx in self.id_to_char:
                tok = self.id_to_char[idx]
                # Special tokens are handled separately (kept here; could also be filtered)
                if tok in ['[BOS]', '[EOS]', '[PAD]']:
                    pieces.append(tok)
                elif tok == ' ':
                    pass  # skip spaces
                else:
                    pieces.append(tok)
            else:
                pieces.append('[UNK]')
        return ''.join(pieces)

    def extract_last_number(self, text: str) -> str | None:
        """Extract the last run of digits from the text."""
        matches = re.findall(r'\d+', text)
        return matches[-1] if matches else None

    def decode_with_number(self, token_ids: torch.Tensor) -> Tuple[str, str | None]:
        """Decode and extract the predicted number."""
        full_text = self.decode(token_ids)
        number = self.extract_last_number(full_text)
        return full_text, number

    @property
    def vocab_size(self) -> int:
        return len(self.char_to_id)

    def __len__(self) -> int:
        return self.vocab_size

    @property
    def bos_token_id(self) -> int:
        return self.char_to_id['[BOS]']

    @property
    def eos_token_id(self) -> int:
        return self.char_to_id['[EOS]']

    @property
    def unk_token_id(self) -> int:
        return self.char_to_id['[UNK]']

    @property
    def pad_token_id(self) -> int:
        return self.char_to_id['[PAD]']


# %% [markdown]
# Embedding layer

# %%
class SimpleEmbedding(nn.Module):
    def __init__(self, vocab_size: int, hidden_size: int, max_position_embeddings: int):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.dropout = nn.Dropout(0.1)

        # Initialization
        self.word_embeddings.weight.data.normal_(mean=0.0, std=0.02)
        self.position_embeddings.weight.data.normal_(mean=0.0, std=0.02)

    def forward(self, input_ids: torch.LongTensor, position_ids: Optional[torch.LongTensor] = None):
        batch_size, seq_length = input_ids.shape
        if position_ids is None:
            position_ids = torch.arange(seq_length, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)

        word_embs = self.word_embeddings(input_ids)
        pos_embs = self.position_embeddings(position_ids)
        embeddings = word_embs + pos_embs
        return self.dropout(embeddings)


# %% [markdown]
# Custom attention layer

# %%
class YiziAttention(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        assert self.head_dim * num_heads == hidden_size, "hidden_size must be divisible by num_heads"

        self.scale = self.head_dim ** -0.5  # scaling factor

        # QKV projections
        self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False)
        self.o_proj = nn.Linear(hidden_size, hidden_size, bias=False)

        # Keep the most recent attention weights (for visualization)
        self.attn_weights = None

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        bsz, seq_len, _ = x.shape

        # Project to QKV
        query_states = self.q_proj(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = self.k_proj(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(x).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Apply RoPE
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, freqs_cis)

        # Scaled dot-product attention
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale

        # Causal mask: each position may only attend to itself and earlier positions
        causal_mask = torch.full((seq_len, seq_len), float('-inf'), device=x.device).triu(1)
        attn_weights = attn_weights + causal_mask
        attn_weights = torch.softmax(attn_weights, dim=-1)

        # Keep the attention map (can be used for visualization)
        self.attn_weights = attn_weights.detach()

        # Merge heads
        attn_output = torch.matmul(attn_weights, value_states)
        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, self.hidden_size)

        return self.o_proj(attn_output)


# %% [markdown]
# Feed-forward network + residual connections

# %%
class YiziBlock(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, intermediate_size: int):
        super().__init__()
        self.attn = YiziAttention(hidden_size, num_heads)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size),
            nn.GELU(),
            nn.Linear(intermediate_size, hidden_size)
        )
        self.norm2 = nn.LayerNorm(hidden_size)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        # Attention with residual connection
        x = x + self.attn(self.norm1(x), freqs_cis)
        # MLP with residual connection
        x = x + self.mlp(self.norm2(x))
        return x


# %% [markdown]
# Main model: YiziLM

# %%
class YiziLMConfig:
    def __init__(
        self,
        vocab_size: int = 30000,
        hidden_size: int = 512,
        num_hidden_layers: int = 6,
        num_attention_heads: int = 8,
        max_position_embeddings: int = 8192,
        intermediate_size: int = 2048,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.intermediate_size = intermediate_size


class YiziLM(nn.Module):
    def __init__(self, config: YiziLMConfig):
        super().__init__()
        self.config = config

        # Standard word embeddings
        self.embed_tokens = SimpleEmbedding(
            vocab_size=config.vocab_size,
            hidden_size=config.hidden_size,
            max_position_embeddings=config.max_position_embeddings
        )

        # Precompute RoPE and register it as a buffer
        freqs_cis = precompute_freqs_cis(
            hidden_size=config.hidden_size,
            max_seq_len=config.max_position_embeddings,
            base=10000,
            num_attention_heads=config.num_attention_heads
        )
        self.register_buffer("freqs_cis", freqs_cis)

        # Stack of Transformer layers
        self.layers = nn.ModuleList([
            YiziBlock(
                hidden_size=config.hidden_size,
                num_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size
            )
            for _ in range(config.num_hidden_layers)
        ])

        # Final normalization
        self.norm = nn.LayerNorm(config.hidden_size)

        # Language-model head (the only output head)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.lm_head.weight = self.embed_tokens.word_embeddings.weight  # weight tying

        # Initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Recursive initialization."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if hasattr(module, 'bias') and module.bias is not None:
                torch.nn.init.zeros_(module.bias)

    def forward(
        self,
        input_ids: torch.LongTensor,
        labels: Optional[torch.LongTensor] = None
    ) -> Dict[str, torch.Tensor]:
        seq_len = input_ids.size(1)
        freqs_cis = self.freqs_cis[:seq_len]

        inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = inputs_embeds

        for layer in self.layers:
            hidden_states = layer(hidden_states, freqs_cis)

        hidden_states = self.norm(hidden_states)
        logits = self.lm_head(hidden_states)

        output = {"logits": logits}

        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
            output["loss"] = loss

        return output

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        max_new_tokens: int = 128,
        temperature: float = 0.7,
        top_k: int = 50,
        eos_token_id: Optional[int] = None,
    ) -> torch.LongTensor:
        """
        Plain autoregressive language generation.
        """
        for _ in range(max_new_tokens):
            outputs = self.forward(input_ids)
            logits = outputs["logits"][:, -1, :]  # prediction for the last position

            logit = logits / temperature
            if top_k > 0:
                v, _ = torch.topk(logit, min(top_k, logit.size(-1)))
                pivot = v[:, [-1]]
                logit = torch.where(logit < pivot, -float('inf'), logit)

            probs = torch.softmax(logit, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            input_ids = torch.cat([input_ids, next_token], dim=1)

            if eos_token_id is not None and next_token.item() == eos_token_id:
                break

        return input_ids


# %% [markdown]
# Initialize the model

# %%
tokenizer = CharLevelTokenizer()

config = YiziLMConfig(
    vocab_size=len(tokenizer),
    hidden_size=512,
    num_hidden_layers=4,
    num_attention_heads=8,
    intermediate_size=2048,
    max_position_embeddings=8192,
)

model = YiziLM(config)

total_params = sum(p.numel() for p in model.parameters())
print(f"🚀 Total Parameters: {total_params:,}")


# %% [markdown]
# # Training

# %% [markdown]
# Utility: encode a sample

# %%
def encode_sample(sample, tokenizer, max_length=128):
    """
    Encode one sample into input_ids and labels.
    prompt and target are concatenated; the prompt part of labels is set to -100.
    """
    prompt = sample["prompt"]
    target = sample["target"]

    # Convert to string tokens
    prompt_tokens = [str(x).lower() for x in prompt]
    target_tokens = [str(x).lower() for x in target]

    # Encode
    input_ids = tokenizer.encode(prompt_tokens, add_bos=True, add_eos=False).squeeze(0)  # (L_in,)
    label_ids = tokenizer.encode(target_tokens, add_bos=False, add_eos=True).squeeze(0)  # (L_out,)

    # Concatenate into the full sequence
    full_input_ids = torch.cat([input_ids, label_ids], dim=0)

    # Build labels: only the target part contributes to the loss
    full_labels = full_input_ids.clone()
    full_labels[:len(input_ids)] = -100  # ignore the prompt

    return {
        "input_ids": full_input_ids,
        "labels": full_labels
    }


# %% [markdown]
# Dataset preparation

# %%
class GeneralDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.raw_data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, idx):
        sample = self.raw_data[idx]
        return encode_sample(sample, self.tokenizer, self.max_length)


def collate_fn(batch):
    input_ids_list = [b["input_ids"] for b in batch]
    labels_list = [b["labels"] for b in batch]

    # Dynamic padding
    input_ids = pad_sequence(input_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = pad_sequence(labels_list, batch_first=True, padding_value=-100)

    return {
        "input_ids": input_ids,
        "labels": labels
    }


# %% [markdown]
# Training functions

# %%
@torch.no_grad()
def evaluate_generation(model, tokenizer, examples, device):
    """Generation-based evaluation: check whether the model produces the right output."""
    model.eval()
    results = []
    for prompt in examples:
        tokens = [str(x).lower() for x in prompt]
        input_ids = tokenizer.encode(tokens, add_bos=True, add_eos=False).to(device)
        output_ids = model.generate(
            input_ids,
            max_new_tokens=20,
            temperature=0.7,
            top_k=15,
            eos_token_id=tokenizer.eos_token_id
        )
        text, num = tokenizer.decode_with_number(output_ids.cpu())
        results.append((prompt, text, num))
    return results


def train_model(
    model,
    train_data,
    tokenizer,
    val_prompts=None,   # list of prompts used for generation checks
    batch_size=4,
    epochs=3,
    lr=3e-4,
    warmup_steps=100,
    log_steps=10,
    eval_steps=50,
    save_path=None,
    device=None
):
    """
    Generic training entry point.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Dataset and loader
    dataset = GeneralDataset(train_data, tokenizer)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn
    )

    # Prepare the model
    model.to(device)
    model.train()

    # Optimizer and learning-rate schedule
    optimizer = AdamW(model.parameters(), lr=lr)
    total_steps = len(dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Training loop
    global_step = 0
    for epoch in range(epochs):
        print(f"\n{'='*20} Epoch {epoch+1}/{epochs} {'='*20}")
        epoch_losses = []
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")

        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)  # (B, T)
            labels = batch["labels"].to(device)        # (B, T)

            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.get('loss')

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            epoch_losses.append(loss.item())
            global_step += 1

            if global_step % log_steps == 0:
                avg_loss = np.mean(epoch_losses[-log_steps:])
                progress_bar.set_postfix({"loss": f"{avg_loss:.4f}"})

            if val_prompts and global_step % eval_steps == 0:
                print(f"\n🔍 Step {global_step}: Generation Sample")
                model.eval()
                results = evaluate_generation(model, tokenizer, val_prompts, device)
                for p, out, num in results:
                    print(f"  Prompt: {' '.join(map(str, p))}")
                    print(f"  Output: {out} → Num: {num}")
                model.train()

        avg_epoch_loss = np.mean(epoch_losses)
        print(f"✅ Epoch {epoch+1} finished | average loss: {avg_epoch_loss:.4f}")

    # Final save
    if save_path:
        torch.save(model.state_dict(), save_path)
        print(f"💾 Model weights saved to: {save_path}")

Design a dataset suitable for my model.
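As a minimal sketch of the sample format the code above already expects (dicts with "prompt" and "target" token lists, consumed by encode_sample and train_model), here is one possible toy dataset built around a next-number task. The task itself and the helper name make_counting_dataset are illustrative assumptions, not part of the original notebook; the vocabulary used stays within the tokenizer's digits, letters, and punctuation.

# %%
def make_counting_dataset(n_samples: int = 200, low: int = 0, high: int = 500):
    # Each sample: prompt asks for the successor of x, target is the answer token.
    data = []
    for _ in range(n_samples):
        x = random.randint(low, high)
        data.append({
            "prompt": ["the", "next", "number", "after", x, "is", "?"],
            "target": [x + 1],
        })
    return data

train_data = make_counting_dataset(200)
val_prompts = [["the", "next", "number", "after", 41, "is", "?"]]

train_model(
    model,
    train_data,
    tokenizer,
    val_prompts=val_prompts,
    batch_size=8,
    epochs=3,
)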