Deploying Baichuan-M1-14B-Instruct
# Create and activate an isolated environment
conda create -n baichuan python=3.12
conda activate baichuan
# -i is an alias for --index-url, so passing both means only the last one takes effect;
# the CUDA 12 build matches the nvidia-*-cu12 packages and the cu122 flash-attn wheel below
pip install torch torchvision torchaudio -i https://pypi.douban.io/simple
pip install sentencepiece
# pinned to 4.47.0 here, later upgraded to 4.48.1 (see the environment listing below)
pip install transformers==4.47.0
# prebuilt flash-attn wheel (CUDA 12.2 / torch 2.4 / Python 3.12), downloaded in advance
pip install flash_attn-2.6.0+cu122torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl
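After installing, it is worth a quick sanity check that the CUDA build of torch is active and that the flash-attn wheel imports without ABI errors. A minimal sketch:

import torch
import flash_attn
import transformers

# torch should report a CUDA version and see at least one GPU
print(torch.__version__, torch.version.cuda, torch.cuda.is_available())
# flash_attn should import cleanly if the wheel matches the installed torch build
print(flash_attn.__version__)
print(transformers.__version__)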
The environment after installation (pip list):

accelerate 1.3.0
bitsandbytes 0.45.1
certifi 2024.12.14
charset-normalizer 3.4.1
einops 0.8.0
filelock 3.17.0
flash-attn 2.6.0
fsspec 2024.12.0
huggingface-hub 0.27.1
idna 3.10
Jinja2 3.1.5
MarkupSafe 3.0.2
mpmath 1.3.0
networkx 3.4.2
numpy 2.2.2
nvidia-cublas-cu12 12.4.5.8
nvidia-cuda-cupti-cu12 12.4.127
nvidia-cuda-nvrtc-cu12 12.4.127
nvidia-cuda-runtime-cu12 12.4.127
nvidia-cudnn-cu12 9.1.0.70
nvidia-cufft-cu12 11.2.1.3
nvidia-curand-cu12 10.3.5.147
nvidia-cusolver-cu12 11.6.1.9
nvidia-cusparse-cu12 12.3.1.170
nvidia-nccl-cu12 2.21.5
nvidia-nvjitlink-cu12 12.4.127
nvidia-nvtx-cu12 12.4.127
packaging 24.2
pillow 11.1.0
pip 24.2
psutil 6.1.1
PyYAML 6.0.2
regex 2024.11.6
requests 2.32.3
safetensors 0.5.2
sentencepiece 0.2.0
setuptools 75.1.0
sympy 1.13.1
tokenizers 0.21.0
torch 2.5.0
torchaudio 2.5.0
torchvision 0.20.0
tqdm 4.67.1
transformers 4.48.1
triton 3.1.0
typing_extensions 4.12.2
urllib3 2.3.0
wheel 0.44.0
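The script below reads the weights from a local directory ./Baichuan-M1-14B-Instruct. If they are not on disk yet, one way to fetch them is with huggingface_hub (a sketch; the repo id baichuan-inc/Baichuan-M1-14B-Instruct is an assumption, adjust it to the actual Hugging Face repo):

from huggingface_hub import snapshot_download

# Download the model repository into the local directory used by the script below.
# The repo id is assumed; change it if the model is hosted under a different name.
snapshot_download(
    repo_id="baichuan-inc/Baichuan-M1-14B-Instruct",
    local_dir="./Baichuan-M1-14B-Instruct",
)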
If GPU memory is not enough (the 14B weights alone are roughly 28 GB in fp16/bf16), load the model quantized to 8-bit with bitsandbytes:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# 1. Load the pretrained model and tokenizer
model_name = "./Baichuan-M1-14B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Full-precision load (commented out: the 14B weights need roughly 28 GB of VRAM in bf16)
# model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16)

# 8-bit quantized load via bitsandbytes; quantization_config is the current transformers API
# (passing a bare load_in_8bit=True argument is deprecated)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    trust_remote_code=True,
)
# model = model.quantize(8).cuda()  # older Baichuan2-style API, not needed with bitsandbytes
# 2. Input prompt text
prompt = "感冒了怎么弄?"  # "What should I do about a cold?"

# 3. Build the chat messages and encode them for the model
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Optional: inspect the tokenized input
print(model_inputs)
print(model_inputs["input_ids"].shape)
print(model_inputs["input_ids"].dtype)
# 4. Generate text
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Strip the prompt tokens, keeping only the newly generated part
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
# 5. Decode the generated text
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
# 6. Output the result
print("Generated text:")
print(response)
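If the 8-bit load still does not fit, 4-bit NF4 quantization roughly halves the weight memory again, at the cost of some additional quality loss. A minimal sketch of the alternative loading call, reusing the same local path and assumptions as above (only the quantization config changes):

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

# 4-bit NF4 quantization: weights stored in 4 bits, computation done in fp16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    "./Baichuan-M1-14B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

The tokenizer, chat template, and generate call from the script above can be reused unchanged with this model object.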