【DeepSeek-R1】DeepSeek-R1-Distill-Qwen-1.5B: Streaming and Non-Streaming Inference Calls

from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch
from threading import Thread


class DeepSeekModel:
    def __init__(self, model_path="D:\\Algorithm\\DeepSeek-R1-Distill-Qwen-1.5B\\DeepSeek-R1-Distill-Qwen-1.5B"):
        # Adapt to the hardware: pick the device based on GPU availability
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        # Load the tokenizer and model with stability-related options
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",  # let accelerate place the model; calling .to() afterwards on a dispatched model raises an error
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
            low_cpu_mem_usage=True,  # reduce CPU memory usage while loading
            attn_implementation="eager"  # disable potentially unstable attention optimizations
        )
        # Put the model in evaluation mode to avoid unnecessary gradient computation
        self.model.eval()
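The title promises both non-streaming and streaming calls, but the visible excerpt ends at the constructor. Below is a minimal sketch of what a non-streaming method of DeepSeekModel could look like; the method name chat, the chat-template prompt handling, and the sampling parameters (temperature 0.6, top_p 0.95, in line with the values commonly recommended for the R1 distills) are illustrative assumptions, not the article's actual implementation.

    def chat(self, prompt, max_new_tokens=512):
        # Hypothetical sketch: format the user message with the tokenizer's chat template
        messages = [{"role": "user", "content": prompt}]
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.model.device)
        # Non-streaming: generate the whole reply in one blocking call
        with torch.no_grad():
            output_ids = self.model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.6,
                top_p=0.95
            )
        # Decode only the newly generated tokens, dropping the prompt
        return self.tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)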
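For the streaming path, the TextIteratorStreamer and Thread imports at the top of the file point at the standard transformers pattern: run generate() on a background thread and iterate over the streamer on the caller's side. The sketch below follows that assumption; the method name chat_stream and its parameters are again illustrative.

    def chat_stream(self, prompt, max_new_tokens=512):
        # Hypothetical sketch of a streaming variant of chat()
        messages = [{"role": "user", "content": prompt}]
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.model.device)
        # The streamer turns generated tokens into an iterator of decoded text chunks
        streamer = TextIteratorStreamer(
            self.tokenizer,
            skip_prompt=True,  # don't echo the prompt back to the caller
            skip_special_tokens=True
        )
        generation_kwargs = dict(
            inputs=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            streamer=streamer
        )
        # generate() blocks until finished, so run it on a worker thread
        # and yield chunks from the streamer as they arrive
        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()
        for chunk in streamer:
            yield chunk
        thread.join()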
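A hypothetical driver showing how the two calls differ from the caller's perspective:

if __name__ == "__main__":
    model = DeepSeekModel()
    # Non-streaming: the full reply arrives at once
    print(model.chat("Hello, who are you?"))
    # Streaming: print each chunk as soon as it is generated
    for chunk in model.chat_stream("Hello, who are you?"):
        print(chunk, end="", flush=True)
    print()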