1. Model Download Script
from modelscope.hub.snapshot_download import snapshot_download

# Download Qwen3-8B from ModelScope into a local cache directory
model_dir = snapshot_download("Qwen/Qwen3-8B", cache_dir=r"D:\Modles")
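snapshot_download returns the local directory the files landed in. As a quick sanity check (a minimal sketch, assuming the download above completed), you can print that path and list what was fetched:

import os

# Hypothetical check: confirm the snapshot location and its contents
print(model_dir)
for name in sorted(os.listdir(model_dir)):
    print(name)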
2. Multi-Turn Chat Script
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
import time

class QwenChatbot:
    def __init__(self, model_path=r"D:\Modles\Qwen\Qwen3-8B"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path,
                                                       trust_remote_code=True)
        # Load weights in 8-bit to reduce VRAM usage
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0
        )
        self.model = AutoModelForCausalLM.from_pretrained(model_path,
                                                          device_map='auto',
                                                          quantization_config=quantization_config,
                                                          trust_remote_code=True)
        self.history = []

    def generate_response(self, user_input):
        # Render the accumulated history plus the new turn with the chat template
        messages = self.history + [{"role": "user", "content": user_input}]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        # Keep only the newly generated tokens (everything after the prompt)
        response_ids = self.model.generate(**inputs, max_new_tokens=4096)[0][len(inputs.input_ids[0]):].tolist()
        response = self.tokenizer.decode(response_ids, skip_special_tokens=True)
        # Update history
        self.history.append({"role": "user", "content": user_input})
        self.history.append({"role": "assistant", "content": response})
        return response
# Example Usage
if __name__ == "__main__":
    chatbot = QwenChatbot()

    # First input (without /think or /no_think tags, thinking mode is enabled by default)
    s_time = time.time()
    user_input_1 = "How many r's in strawberries?"
    print(f"User: {user_input_1}")
    response_1 = chatbot.generate_response(user_input_1)
    print(f"Bot: {response_1}")
    e_time = time.time()
    print(f"Generation finished in {e_time - s_time:.2f}s")
    print("----------------------")

    # Second input with /no_think
    s_time = time.time()
    user_input_2 = "Then, how many r's in blueberries? /no_think"
    print(f"User: {user_input_2}")
    response_2 = chatbot.generate_response(user_input_2)
    print(f"Bot: {response_2}")
    e_time = time.time()
    print(f"Generation finished in {e_time - s_time:.2f}s")
    print("----------------------")

    # Third input with /think
    s_time = time.time()
    user_input_3 = "Really? /think"
    print(f"User: {user_input_3}")
    response_3 = chatbot.generate_response(user_input_3)
    print(f"Bot: {response_3}")
    e_time = time.time()
    print(f"Generation finished in {e_time - s_time:.2f}s")
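With thinking mode on and max_new_tokens=4096, a single reply can take a while, and the script above only prints the answer once generation finishes. If you would rather see tokens as they are produced, generate() accepts a streamer. A minimal sketch (assuming the chatbot instance from above, single turn, no history):

from transformers import TextStreamer

# Stream decoded tokens to stdout as they are generated, hiding the echoed prompt
streamer = TextStreamer(chatbot.tokenizer, skip_prompt=True, skip_special_tokens=True)
text = chatbot.tokenizer.apply_chat_template(
    [{"role": "user", "content": "How many r's in strawberries?"}],
    tokenize=False,
    add_generation_prompt=True
)
inputs = chatbot.tokenizer(text, return_tensors="pt").to(chatbot.model.device)
chatbot.model.generate(**inputs, max_new_tokens=4096, streamer=streamer)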
Note: choose a suitable max_new_tokens (4096 here) based on your GPU, and decide whether you need quantization at all; if not, you can simply comment out the quantization block (and remove quantization_config from the from_pretrained call, or it will raise a NameError).
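Conversely, if even 8-bit weights do not fit in VRAM, 4-bit NF4 quantization is a tighter alternative. The config below is an illustrative sketch rather than this script's original setting; the bnb_4bit_* values are common choices, and it would replace the 8-bit BitsAndBytesConfig in __init__:

import torch
from transformers import BitsAndBytesConfig

# Illustrative 4-bit setup; swap in for the 8-bit config above if VRAM is tight
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # normal-float 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,  # matmuls run in fp16
    bnb_4bit_use_double_quant=True         # quantize the quantization constants too
)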