Linear Regression with Eager API

This post walks through implementing linear regression with TensorFlow's eager execution API: preparing the training data, defining the model, computing the loss, choosing an optimizer, and running the training loop. A concrete example shows how gradient descent adjusts the weight and bias to minimize the gap between predictions and ground-truth values.
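Concretely, the quantity being minimized is the mean squared error below (the factor of 1/2 simplifies the gradient), and each training step applies the standard gradient-descent update with learning rate η (0.01 in the code):

J(W, b) = \frac{1}{2n} \sum_{i=1}^{n} \left( W x_i + b - y_i \right)^2

W \leftarrow W - \eta \, \frac{\partial J}{\partial W}, \qquad
b \leftarrow b - \eta \, \frac{\partial J}{\partial b}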


from __future__ import absolute_import, division, print_function
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Enable Eager API
tf.enable_eager_execution()
tfe = tf.contrib.eager
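# Note: in TF 1.x, enable_eager_execution() must be called once at program
# startup, before any TensorFlow operations are created.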

# Training Data
train_X = [3.3, 4.4, 5.5, 6.71, 6.93, 4.168, 9.779, 6.182, 7.59, 2.167,
           7.042, 10.791, 5.313, 7.997, 5.654, 9.27, 3.1]
train_Y = [1.7, 2.76, 2.09, 3.19, 1.694, 1.573, 3.366, 2.596, 2.53, 1.221,
           2.827, 3.465, 1.65, 2.904, 2.42, 2.94, 1.3]

n_samples = len(train_X)

# Parameters
learning_rate = 0.01
display_step = 100
num_steps = 1000

# Weight and Bias
W = tfe.Variable(np.random.randn())
b = tfe.Variable(np.random.randn())
# Linear regression model: y = W*x + b
def linear_regression(inputs):
    return inputs * W + b
# Mean squared error cost
def mean_square_fn(model_fn, inputs, labels):
    return tf.reduce_sum(tf.pow(model_fn(inputs) - labels, 2)) / (2 * n_samples)

# SGD optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
# Compute gradients
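# implicit_gradients() wraps mean_square_fn in a function that returns a list of
# (gradient, variable) pairs for every trainable variable the loss depends on,
# which is exactly the format apply_gradients() expects.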
grad = tfe.implicit_gradients(mean_square_fn)

# Initial cost, before optimizing
print("Initial cost={:.9f}".format(mean_square_fn(linear_regression, train_X, train_Y)),
      "W=", W.numpy(), "b=", b.numpy())



# Training
for step in range(num_steps):
    optimizer.apply_gradients(grad(linear_regression, train_X, train_Y))
    if (step + 1) % display_step == 0 or step == 0:
        print("Epoch:", '%04d' % (step + 1), "cost=",
              "{:.9f}".format(mean_square_fn(linear_regression, train_X, train_Y)),
              "W=", W.numpy(), "b=", b.numpy())


# Graphic display
plt.plot(train_X, train_Y, 'ro', label='Original data')
plt.plot(train_X, np.array(W * train_X + b), label='Fitted line')
plt.legend()
plt.show()

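For readers on TensorFlow 2.x: tf.enable_eager_execution() and tf.contrib were removed, and eager execution is on by default. Below is a minimal sketch of the same training loop using tf.GradientTape (assuming TF 2.x; the names W2, b2, and optimizer2 are illustrative, not from the original script):

import tensorflow as tf  # assumes TensorFlow 2.x, where eager mode is the default

X = tf.constant(train_X, dtype=tf.float32)
Y = tf.constant(train_Y, dtype=tf.float32)

W2 = tf.Variable(tf.random.normal([]))
b2 = tf.Variable(tf.random.normal([]))
optimizer2 = tf.keras.optimizers.SGD(learning_rate=0.01)

for step in range(1000):
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum((X * W2 + b2 - Y) ** 2) / (2 * len(train_X))
    # tape.gradient replaces tfe.implicit_gradients: it returns the gradients
    # of the loss with respect to the listed variables.
    grads = tape.gradient(loss, [W2, b2])
    optimizer2.apply_gradients(zip(grads, [W2, b2]))

The logging and plotting code carries over unchanged, with W2 and b2 substituted for W and b.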