import numpy as np
import torch
import time
import math
torch.set_printoptions(8)
def gelu(x):
"""
Task: Use the torch API to implement the approximate calculation formula of the `GELU`
activation function. The formula is as follows (you need to paste it into the latex
online conversion website)
Website: https://www.latexlive.com/
Input: Tensor
Output: Tensor
"""
return 0.5*x*(1+torch.tanh(math.sqrt(2/math.pi)*(x+0.044715*torch.pow(x,3))))
def softmax(x,dim=-1):
"""
Task: Use torch API to implement `softmax` function, search the specific formula by yourself
Input: Tensor
Output: Tensor
"""
x_max=torch.max(x,dim=dim,keepdim=True).values
x_stable=x-x_max
exp_x=torch.exp(x_stable)
return exp_x/torch.sum(exp_x,dim=dim,keepdim=True)
def layer_norm(x, g_b, eps:float = 1e-5):
"""
Task: Use torch API to implement `layernorm` function, search `layernorm` by yourself
Input:
x: Tensor
g_b: dictionary that load from gpt2 weight. g-gamma and b-bias are the keys
Output: Tensor
"""
""" if torch.isnan(x).any():
print("Nan\n")
assert(0)"""
g, b = torch.Tensor(g_b['g']), torch.Tensor(g_b['b'])
x = x.clone().detach().to(torch.float32)
g=g.to(x.device)
b=b.to(x.device)
normalized_shape=g.shape
dims=list(range(-len(normalized_shape),0)) #warning
# if not isinstance(x, torch.Tensor):
# x = torch.tensor(x, dtype=torch.float32)
x = x.float()
# print(x,"/n")
mean=x.mean(dim=-1,keepdim=True)
var=x.var(dim=-1,keepdim=True)
x_=(x-mean)/torch.sqrt(var+eps)
#print(x_*g+b,"/n")
return x_*g+b
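# A minimal sanity-check sketch (not part of the assignment, assuming a PyTorch version that
# supports F.gelu(..., approximate='tanh')): the hand-written ops above should closely match
# torch.nn.functional on random input. Uncomment to try locally:
#   x_test = torch.randn(4, 8)
#   assert torch.allclose(gelu(x_test), torch.nn.functional.gelu(x_test, approximate='tanh'), atol=1e-6)
#   assert torch.allclose(softmax(x_test), torch.nn.functional.softmax(x_test, dim=-1), atol=1e-6)
#   assert torch.allclose(layer_norm(x_test, {'g': torch.ones(8), 'b': torch.zeros(8)}),
#                         torch.nn.functional.layer_norm(x_test, (8,)), atol=1e-5)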
def linear(x, w_b): # [m, in], [in, out], [out] -> [m, out]
"""
Task: implement linear layer
Input:
x: Tensor
w_b: dictionary that load from gpt2 weight. w-weight and b-bias are the keys
Output: Tensor
"""
w, b = torch.Tensor(w_b['w']), torch.Tensor(w_b['b'])
w,b=w.to(x.device),b.to(x.device)
    return torch.matmul(x, w) + b
def ffn(x, mlp): # [n_seq, n_embd] -> [n_seq, n_embd]
"""
Task: use `gelu` `linear` to implement ffn
Notes: x --linear--> --gelu--> --linear--> output
Input:
x: Tensor
mlp: dictionary that load from gpt2 weight. w_b1 and w_b2 are the params of two linear layer
Output: Tensor
"""
w_b1, w_b2 = mlp['c_fc'], mlp['c_proj']
# print(x,"\n")
x=linear(x,w_b1)
x=gelu(x)
x=linear(x,w_b2)
return x
def attention(q, k, v, mask,past_kv=None): # [n_q, d_k], [n_k, d_k], [n_k, d_v], [n_q, n_k] -> [n_q, d_v]
"""
Task: use torch API to implement attention computation according to formula(1) of the following paper
where d_k account for the last dimension of `k`
Paper: https://arxiv.org/abs/1706.03762
Input:
q: Tensor
k: Tensor
v: Tensor
mask: Tensor
mlp: dictionary that load from gpt2 weight. w_b1 and w_b2 are the params of two linear layer
Output: Tensor
"""
    if past_kv is not None:
        # prepend the cached keys/values from previous decoding steps along the sequence dimension
        past_key, past_value = past_kv
        k = torch.cat([past_key, k], dim=0)
        v = torch.cat([past_value, v], dim=0)
    current_kv = (k, v)
    d_k = k.size(-1)
    atten_score = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        if past_kv is not None:
            # with a KV cache the mask must span the full key length; keep only the rows for
            # the current query tokens, which sit at the end of the sequence
            seq_len = k.size(0)
            causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=q.device) * -1e9, diagonal=1)
            causal_mask = causal_mask[-q.size(0):]
        else:
            causal_mask = mask
        # the mask is additive: 0 keeps a position, -1e9 suppresses future positions before the softmax
        atten_score = atten_score + causal_mask
    atten_weights = softmax(atten_score)
    output = torch.matmul(atten_weights, v)
    return output, current_kv
def mha(x, attn, n_head,past_kv=None): # [n_seq, n_embd] -> [n_seq, n_embd]
"""
Task: Complete the code of the multi-head attention
Input:
x: Tensor
attn: dictionary that load from gpt2 weight. c_attn and c_proj are the params of two linear layer
n_head: number of head
Output: Tensorying multi-head attention and linear transformation, shape [n_seq, n_embd].
"""
c_attn, c_proj = attn['c_attn'], attn['c_proj']
# qkv projection
    x = linear(x, c_attn)  # [n_seq, n_embd] -> [n_seq, 3*n_embd]
# Split into qkv
"""
Task: Split the q,k,v matrix from the tensor x
Notes: [n_seq, 3*n_embd] -> 3 * [n_seq, n_embd]
"""
    n_seq, qkv_dim = x.shape          # qkv_dim == 3 * n_embd after the projection above
    n_embd = qkv_dim // 3
    q, k, v = torch.split(x, n_embd, dim=-1)
    qkv = [q, k, v]
# Split into heads
qkv_heads = [qkv_part.chunk(n_head, dim=-1) for qkv_part in qkv] # 3 * [n_seq, n_embd] -> 3 * n_head * [n_seq, n_embd/n_head]
    qkv_heads = list(zip(*qkv_heads))  # -> n_head groups of (q, k, v), each [n_seq, n_embd/n_head]
# Causal mask to hide future inputs from being attended to
"""
Task: Construct mask matrix
Notes:
| 0 -inf -inf ... -inf |
| 0 0 -inf ... -inf |
| 0 0 0 ... -inf |
|... ... ... ... ... |
| 0 0 0 ... 0 |
Mask is a tensor whose dimension is [n_seq, n_seq]
"""
    if past_kv is None:
        past_kv_per_head = [None] * n_head
    else:
        past_kv_per_head = past_kv
    # additive causal mask: 0 on and below the diagonal, -1e9 strictly above it (future positions);
    # when a KV cache is in use, `attention` rebuilds the mask over the full key length
    causal_mask = torch.triu(torch.ones(n_seq, n_seq) * -1e9, diagonal=1)
    # Perform attention over each head, threading through its cached keys/values if present
    out_heads = []
    new_kv_per_head = []
    for i, (q, k, v) in enumerate(qkv_heads):
        out_head, new_kv = attention(q, k, v, causal_mask, past_kv_per_head[i])
        out_heads.append(out_head)
        new_kv_per_head.append(new_kv)
# Merge heads
"""
Task: merge multi-heads results
Notes: n_head * [n_seq, n_embd/n_head] --> [n_seq, n_embd]
"""
# print(x,"/n")
x = torch.cat(out_heads,dim=-1) # need to modify
    # Out projection
    x = linear(x, c_proj)  # [n_seq, n_embd] -> [n_seq, n_embd]
return x,new_kv_per_head
def transformer_block(x, block, n_head,past_kv=None): # [n_seq, n_embd] -> [n_seq, n_embd]
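    """
    Pre-LayerNorm transformer block: x = x + MHA(LN_1(x)) followed by x = x + FFN(LN_2(x)).
    Returns the updated hidden states and the block's new per-head KV cache.
    """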
mlp, attn, ln_1, ln_2 = block['mlp'], block['attn'], block['ln_1'], block['ln_2']
# print(x,"/n")
# multi-head causal self attention
#print(x,"/n")
attn_out,new_kv=mha(layer_norm(x,ln_1),attn,n_head,past_kv)
    x = x + attn_out  # residual connection, [n_seq, n_embd] -> [n_seq, n_embd]
#print(x,"/n")
# position-wise feed forward network
x = x + ffn(layer_norm(x, ln_2), mlp) # [n_seq, n_embd] -> [n_seq, n_embd]
return x,new_kv
def gpt2(inputs, params, n_head,past_kvs=None): # [n_seq] -> [n_seq, n_vocab]
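    """
    GPT-2 forward pass: token + positional embeddings, n_layer transformer blocks, a final layer
    norm, and a projection to the vocabulary through the tied embedding matrix (wte). When
    `past_kvs` is given, only the tokens not already covered by the cache are processed.
    """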
wte, wpe, blocks, ln_f = params['wte'], params['wpe'], params['blocks'], params['ln_f']
# token + positional embeddings
wte=torch.Tensor(wte)
wpe=torch.Tensor(wpe)
    if past_kvs is None:
        n_past = 0
    else:
        # number of positions already covered by the cache (key length of block 0, head 0)
        n_past = past_kvs[0][0][0].size(0)
    # embed only the tokens the cache does not cover yet
    x = wte[inputs[n_past:]] + wpe[torch.arange(n_past, len(inputs))]  # -> [n_new, n_embd]
    # forward pass through the n_layer transformer blocks
new_past_kvs=[]
for i,block in enumerate(blocks):
        past_kv = past_kvs[i] if past_kvs is not None else None
x ,new_kv= transformer_block(x, block, n_head=n_head,past_kv=past_kv) # [n_seq, n_embd] -> [n_seq, n_embd]
new_past_kvs.append(new_kv)
# projection to vocab
# print(x,"/n")
x = layer_norm(x, ln_f) # [n_seq, n_embd] -> [n_seq, n_embd]
return x @ wte.T,new_past_kvs # [n_seq, n_embd] -> [n_seq, n_vocab]
def apply_repetition_penalty(logits, generated_tokens, penalty=1.2):
    # Discourage recently generated tokens (CTRL-style penalty). The sign matters: dividing a
    # *negative* logit by the penalty would raise its probability and encourage repetition.
    for token in set(generated_tokens[-20:]):
        if token < len(logits):
            if logits[token] > 0:
                logits[token] = logits[token] / penalty
            else:
                logits[token] = logits[token] * penalty
    return logits
def generate(inputs, params, n_head, n_tokens_to_generate,temperature=0.8, repetition_penalty=1.2):
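    """
    Autoregressive decoding loop with a KV cache: each step feeds only the newly generated token
    through the model, applies the repetition penalty to the last-position logits, and picks the
    next token by temperature sampling (temperature > 0) or greedy argmax (temperature == 0).
    """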
from tqdm import tqdm
past_kvs = None
generated=inputs.copy()
for _ in tqdm(range(n_tokens_to_generate), "generating"): # auto-regressive decode loop
logits ,past_kvs= gpt2(generated, params, n_head,past_kvs) # model forward pass
last_logits=logits[-1]
last_logits=apply_repetition_penalty(last_logits, generated, repetition_penalty)
        if temperature > 0:
            last_logits = last_logits / temperature
            probs = softmax(last_logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1).item()
        else:
            next_id = int(torch.argmax(last_logits))
generated.append(next_id)
return generated[len(inputs):] # only return generated ids
def greedy_speculative_generate(inputs, draft_params, target_params, hparams_draft, hparams_target, n_tokens_to_generate, K):
"""
Task: Load 124M and 1558M models at the same time, use greedy sampling, and complete speculative decoding
Inputs:
inputs (list): The initial list of token IDs from the prompt.
draft_params, target_params: Model weights for the draft and target models.
hparams_draft, hparams_target: Hyperparameters for both models.
n_tokens_to_generate (int): The number of new tokens to generate.
K (int): The number of tokens the draft model speculates at each step (e.g., 4).
Returns:
list: A list of newly generated token IDs.
"""
draft_past_kvs=None
target_past_kvs=None
generated_ids = []
current_inputs = list(inputs)
while len(generated_ids) < n_tokens_to_generate:
draft_tokens=[]
draft_inputs=list(current_inputs)
        draft_current_past_kvs = draft_past_kvs
        # draft phase: the small model proposes up to K tokens greedily, reusing its KV cache
        for _ in range(K):
            if len(generated_ids) + len(draft_tokens) >= n_tokens_to_generate:
                break
            logits, draft_current_past_kvs = gpt2(draft_inputs, draft_params, hparams_draft['n_head'], past_kvs=draft_current_past_kvs)
            next_id = int(torch.argmax(logits[-1]))
            draft_tokens.append(next_id)
            draft_inputs.append(next_id)
if not draft_tokens:
            break
        # verify phase: one target forward pass scores the prompt extended by the whole draft
        target_input = current_inputs + draft_tokens
        n_past_target = 0 if target_past_kvs is None else target_past_kvs[0][0][0].size(0)
        target_logits, target_current_past_kvs = gpt2(target_input, target_params, hparams_target['n_head'], target_past_kvs)
        accepted_tokens = []
        for i, draft_token in enumerate(draft_tokens):
            # the logits at position len(current_inputs)+i-1 predict the token at position
            # len(current_inputs)+i, which is where draft_tokens[i] sits in target_input
            row = len(current_inputs) + i - 1 - n_past_target
            target_token = int(torch.argmax(target_logits[row]))
            if draft_token == target_token:
                accepted_tokens.append(draft_token)
            else:
                accepted_tokens.append(target_token)
                break
        if len(accepted_tokens) == len(draft_tokens):
            # every draft token was accepted, so both caches stay valid. Keep the target cache only
            # up to the second-to-last position so the next round still recomputes the logits that
            # verify its first draft token.
            draft_past_kvs = draft_current_past_kvs
            target_past_kvs = [[(k[:-1], v[:-1]) for (k, v) in block] for block in target_current_past_kvs]
        else:
            # a draft token was rejected: drop the draft cache and rebuild it next round
            draft_past_kvs = None
        generated_ids.extend(accepted_tokens)
        current_inputs.extend(accepted_tokens)
return generated_ids
def main(prompt: str, n_tokens_to_generate: int = 5, model_size: str = "124M", models_dir: str = "models"):
from utils import load_encoder_hparams_and_params
# load encoder, hparams, and params from the released open-ai gpt-2 files
encoder, hparams, params = load_encoder_hparams_and_params(model_size, models_dir)
# encode the input string using the BPE tokenizer
input_ids = encoder.encode(prompt)
# make sure we are not surpassing the max sequence length of our model
assert len(input_ids) + n_tokens_to_generate < hparams["n_ctx"]
# generate output ids
start = time.time()
    # greedy decoding (temperature=0, repetition_penalty=1.0 disables the penalty) reproduces the
    # reference completion "the most powerful machine on the planet" mentioned at the end of this
    # file; raise the temperature for sampled, more varied output
    output_ids = generate(input_ids, params, hparams["n_head"], n_tokens_to_generate, temperature=0.0, repetition_penalty=1.0)
end = time.time()
print(f"Time taken to generate {n_tokens_to_generate} tokens: {end - start:.2f}s")
# print("/n output_ids/n",output_ids)
# decode the ids back into a string
output_text = encoder.decode(output_ids)
return output_text
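# Example invocation (the prompt below is an assumption, the usual picoGPT demo prompt; `fire`
# exposes main's arguments as command-line flags):
#   python <this_file>.py "Alan Turing theorized that computers would one day become" --n_tokens_to_generate 8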
if __name__ == "__main__":
import fire
    fire.Fire(main)

# Question: the model currently outputs "that become become become one day become one day one day
# become one day one one day one day one day one one one one ..." but the expected output is
# "the most powerful machine on the planet". How should the code be modified to get the correct result?