1. Install dependencies
langchain, langchain_community, langchain_core, transformers, etc.
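A typical install command might look like the following; versions are left unpinned here, so choose ones that match your environment (accelerate is needed because the code below uses device_map="auto"):
pip install langchain langchain-community langchain-core transformers accelerate torch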
2. Imports
from typing import Any, List, Optional
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms.utils import enforce_stop_tokens
device = "cuda"
3. Custom LLM class
First, you must override the two methods `_llm_type` and `_call`; this is required by LangChain. `_llm_type` returns the type (i.e. a name) of your custom model and can be any string. `_call` takes a string argument (your prompt) plus an optional list of stop words, and returns a string (the output of your custom model). Once `_call` is defined, external code can use the model through its instance, e.g. `llm.invoke(prompt)`. Note in particular that the logic inside `_call` for actually producing output from your custom model is yours to write.
Besides these two required methods, the class also needs to handle things like loading the model and tokenizer; see the code below.
class Qwen_2_5(LLM):
    max_token: int = 4096
    temperature: float = 0.8
    top_p: float = 0.9
    tokenizer: object = None
    model: object = None
    history: List[dict] = []  # running conversation, stored as chat-format message dicts

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "Qwen_2_5"

    def load_model(self, model_path=None):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype="auto",
            device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        # Custom generation logic: delegate to the chat2_5 helper defined below
        # and keep the updated message history it returns.
        response, self.history = chat2_5(self.model,
                                         self.tokenizer,
                                         prompt,
                                         history=self.history,
                                         temperature=self.temperature)
        if stop is not None:
            response = enforce_stop_tokens(response, stop)
        return response
The custom chat2_5 function, which actually produces the output:
def chat2_5(model, tokenizer, prompt, history=None, temperature=0.7):
    # Start a fresh conversation when there is no history, otherwise continue it
    if not history:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    else:
        messages = history
        messages.append({"role": "user", "content": prompt})
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512,
        do_sample=True,  # enable sampling so the temperature setting is honored
        temperature=temperature
    )
    # Keep only the newly generated tokens (drop the prompt tokens)
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Append the reply so the caller can carry the conversation forward
    messages.append({
        "role": "assistant", "content": response
    })
    history = messages
    return response, history
That covers all the steps needed to wrap a local model.
4. Testing
if __name__ == '__main__':
    qwen_llm = Qwen_2_5()
    MODEL_PATH = 'path to your local model; change this yourself'
    qwen_llm.load_model(MODEL_PATH)
    prompt = 'your own prompt text'
    response = qwen_llm.invoke(prompt)
    print(response)
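For reference, the ChatPromptTemplate imported in step 2 is not used in the test above. As a minimal sketch (the template text and the {question} variable are illustrative placeholders, not part of the original code), the wrapped model can also be dropped into a standard LangChain chain via the pipe operator:

    # Reuse the qwen_llm instance loaded in the test above
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful assistant."),
        ("user", "{question}")
    ])
    chain = prompt_template | qwen_llm
    print(chain.invoke({"question": "Give a one-sentence introduction to LangChain."}))

Because Qwen_2_5 subclasses LLM rather than a chat model, LangChain flattens the chat prompt to plain text before it reaches _call.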