Implementing OpenAI chat with Node/egg.js/LangChain.js — core code for plugin support: web search, local knowledge-base Q&A, and document upload/chunking into a vector database

Supports web search and chat against a remote knowledge base (vector database, Pinecone). A quick rant first: there are piles of Node + LangChain "local knowledge base chat" tutorials out there, most of them drowning in jargon (jargon lifted straight from the English docs via translation, which makes those tutorials painful to read). If you follow them and wire everything up with LangChain, you do end up with something that works — but it usually doesn't support conversation context, so the result feels more like artificial stupidity than artificial intelligence.

I only use LangChain here for splitting (chunking) the data.

In plain terms, the core logic of local knowledge-base Q&A is: convert the user's question into a vector (OpenAI provides an embeddings API for this).

string => vector (an array of numbers, e.g. [0.0122, 0.0399, ...])
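As a minimal sketch of that step (it uses the same `openai` v3 SDK as the code further down; reading the API key from an environment variable is my placeholder, not part of the original):

```js
const { Configuration, OpenAIApi } = require("openai");

// Turn a question string into an embedding vector
// (an array of floats — 1536 of them for text-embedding-ada-002).
async function embedQuestion(question) {
  const openai = new OpenAIApi(new Configuration({ apiKey: process.env.OPENAI_API_KEY }));
  const res = await openai.createEmbedding({
    model: "text-embedding-ada-002",
    input: question,
  });
  return res.data.data[0].embedding; // number[]
}
```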

Then query the remote vector database with a cosine-similarity search to retrieve the data that is similar enough (i.e., the relevant data). Pinecone has ready-made docs for this — you don't need to understand the internals, just configure it and call it.
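A rough sketch of that query against the same `@pinecone-database/pinecone` client used in main.js below (the index name `first-database` matches main.js; the API key and environment values are placeholders):

```js
const { PineconeClient } = require("@pinecone-database/pinecone");

// Ask Pinecone for the chunks most similar to the question vector;
// the similarity search itself happens server-side.
async function querySimilar(vector, namespace) {
  const client = new PineconeClient();
  await client.init({
    apiKey: process.env.PINECONE_API_KEY,   // placeholder
    environment: process.env.PINECONE_ENV,  // placeholder
  });
  const index = client.Index("first-database");
  const res = await index.query({
    queryRequest: { vector, topK: 10, includeMetadata: true, namespace },
  });
  return res.matches; // each match carries a score and the original text in metadata
}
```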

Then tidy those results up and inject them into the OpenAI chat context as a system message (role: 'system' is an OpenAI role parameter that marks the message as coming from the system): { role: 'system', content: `Here is the result of querying the remote vector database \n content: \n ${promptContent.text}` }
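In other words, the messages array in the request ends up looking roughly like this (`history`, `question` and `promptContent` are placeholder variables):

```js
const messages = [
  {
    // retrieved knowledge-base text goes in as a system message
    role: 'system',
    content: `Here is the result of querying the remote vector database \n content: \n ${promptContent.text}`,
  },
  ...history,                           // earlier turns, so the model keeps conversation context
  { role: 'user', content: question },  // the latest question
];
```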

The remote vector database itself works the same way: the data the user provides (PDF, TXT, etc.) is first split into chunks (because of size limits), converted into vectors, and stored in the database.
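A minimal sketch of that upload path with LangChain (it mirrors `documentReading`/`fileUpload` in main.js below; the PDF path and the `pineconeIndex` handle passed in are placeholders):

```js
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { PineconeStore } = require("langchain/vectorstores/pinecone");

// Load a PDF, split it into ~1000-character chunks, embed each chunk
// and upsert the vectors into Pinecone under the given namespace.
async function uploadPdf(pineconeIndex, namespace) {
  const docs = await new PDFLoader("./example.pdf").load(); // placeholder file
  const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });
  const chunks = await splitter.splitDocuments(docs);
  await PineconeStore.fromDocuments(
    chunks,
    new OpenAIEmbeddings({ openAIApiKey: process.env.OPENAI_API_KEY }),
    { pineconeIndex, namespace }
  );
}
```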

Web search follows the same pattern.

That is the basic flow of this kind of Q&A (LangChain's biggest contribution here is chunking the documents at upload time).

User question => convert to vector => query the vector database => inject the top few results into the conversation prompt => send the request to OpenAI
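Put together, a simplified non-streaming version of that pipeline could look like the sketch below; it reuses the `embedQuestion`/`querySimilar` helpers sketched above and skips the token-budget trimming that the real `getLocalPrompt` in main.js does:

```js
const { Configuration, OpenAIApi } = require("openai");

async function answer(question, namespace, history = []) {
  const vector = await embedQuestion(question);           // question -> vector
  const matches = await querySimilar(vector, namespace);  // vector -> similar chunks
  const context = matches.map(m => m.metadata.text).join("\n");

  const openai = new OpenAIApi(new Configuration({ apiKey: process.env.OPENAI_API_KEY }));
  const res = await openai.createChatCompletion({
    model: "gpt-3.5-turbo",
    messages: [
      { role: "system", content: `Here is the result of querying the remote vector database \n content: \n ${context}` },
      ...history,
      { role: "user", content: question },
    ],
  });
  return res.data.choices[0].message.content;
}
```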

Caller code (invokes the service in main.js):

 

```js
// askQuestionData holds the official OpenAI chat-completion request parameters; see the OpenAI docs for details.
// Register the plugins (web search / local knowledge-base Q&A) as functions for OpenAI to call;
// the names map to the corresponding methods in main.js.
askQuestionData['functions'] = [
  {
    "name": "getNetworkPrompt",
    "description": "This is your Search engine. Calling this function will connect to the Internet to search and return relevant data",
    "parameters": {
      "type": "object",
      "properties": {
        "q": {
          "type": "string",
          "description": "User raised questions"
        }
      },
      "required": ["q"]
    }
  },
  {
    "name": "getLocalPrompt",
    "description": "Query the remote vector library and obtain relevant content based on user questions",
    "parameters": {
      "type": "object",
      "properties": {
        "content": {
          "type": "string",
          "description": "User raised questions"
        },
        "namespace": {
          "type": "string",
          "description": "The name of the user's query database"
        }
      },
      "required": ["content", "namespace"]
    }
  }
];

service.openai.main.askQuestion(askQuestionData, {
  callbackFn: (res) => {
    if (res.catch) {
      // network error
      console.log(res.catch, 'network error');
    } else if (res.error) {
      // the API returned an error payload
      data.status = 'success';
      data.content = res.error;
    } else {
      const content = res.choices[0].delta.content !== undefined ? res.choices[0].delta.content : '';
      console.log(content, 'streamed response chunk');
    }
  },
  readStreamCallback: (readStream) => {
    console.log(readStream, 'readStream, can be used to stop/destroy the stream');
  }
});
```

main.js

 

```js
const Service = require('egg').Service;
const { Configuration, OpenAIApi } = require("openai");
const { PineconeClient } = require("@pinecone-database/pinecone");
const axios = require("axios");
const { htmlToText } = require('html-to-text');
const { encodingForModel } = require("js-tiktoken");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { EPubLoader } = require("langchain/document_loaders/fs/epub");
const { PineconeStore } = require("langchain/vectorstores/pinecone");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
const { TextLoader } = require("langchain/document_loaders/fs/text");
const { DocxLoader } = require("langchain/document_loaders/fs/docx");
const fs = require('fs');

class OpenaiService extends Service {
  questionConfiguration = {
    stream: true,
    model: "gpt-3.5-turbo"
  }

  askQuestionCurlParameter = {
    headers: {
      "Authorization": `Bearer ${this.ctx.app.config.openai.apiKey}`, // openai apiKey
      'Content-Type': 'application/json',
    },
    timeout: 10000,   // timeout in ms
    dataType: 'json', // parse response as json
    streaming: true,  // let ctx.curl return a readable stream
    method: 'post',
  }

  trySplicing(data) {
    try {
      return JSON.parse(data)
    } catch (error) {
      console.log(data, 'failed to splice broken chunk')
      return null
    }
  }

  // Parse the SSE stream: split on "data:", JSON-parse each chunk, and re-join chunks
  // that were cut in half across network packets (kept in BrokenData).
  dataProcessing(dataStream, BrokenData = []) {
    return String(dataStream).split("data:").filter((e) => e).flat(Infinity).map(item => {
      try {
        return JSON.parse(item)
      } catch (error) {
        if (item.split(/[\t\r\f\n\s]*/g).join('') == '[DONE]') {
          return false
        } else {
          let result = null;
          if (BrokenData.length >= 1) {
            result = this.trySplicing(`${BrokenData[BrokenData.length - 1]}${item}`)
          }
          if (result) {
            BrokenData.pop();
            return result;
          } else {
            BrokenData.push(item)
            return false
          }
        }
      }
    }).filter((e) => e)
  }

  // Convert between a query string and a parameter object.
  routingParameterGeneration(parameter) {
    if (parameter == null || parameter == undefined) return;
    const parameterType = typeof parameter;
    if (parameterType == 'string') {
      let obj = {};
      let splitParameter = parameter.split('?');
      if (splitParameter.length <= 1) {
        console.warn("no query parameters in path");
        return
      }
      splitParameter[1].split('&').forEach(item => {
        if (item.indexOf('=') !== -1) {
          let splitItem = item.split('=')
          obj[splitItem[0]] = splitItem[1]
        } else {
          console.warn("error");
        }
      })
      return obj
    } else if (parameterType == 'object') {
      let stringParameter = '?'
      let parameterEntries = Object.entries(parameter);
      parameterEntries.forEach((key, index) => {
        let itemString = `${key[0]}=${key[1]}${parameterEntries.length - 1 == index ? '' : '&'}`;
        stringParameter += itemString
      })
      return stringParameter
    }
  }

  /**
   * @description Fetch web search results
   * @param {string} BingSearchParameter - query string for the search API (Bing/Google)
   * @returns {Promise} - search results
   */
  async findNetworkdata(BingSearchParameter) {
    // console.log(BingSearchParameter, 'BingSearchParameter');
    return new Promise((resolve, reject) => {
      axios.get(`${this.ctx.app.config}${BingSearchParameter}`).then(res => resolve(res)).catch(err => reject(err))
    })
  }

  /**
   * @description Parse HTML into plain text
   * @param {Object|Array} html - html data
   * @returns {Array} - extracted text
   */
  parsingHTML(html) {
    if (!html) throw new Error('missing html parameter');
    const htmlValues = Array.isArray(html) ? html : [html];
    return htmlValues.map(item => htmlToText(item));
  }

  // Find similar data in the remote vector database
  findSimilarData(pineconeParameter) {
    const { apiKey, environment } = this.ctx.app.config.pinecone;
    return new Promise((resolve, reject) => {
      // instantiate the Pinecone client
      const client = new PineconeClient();
      // initialize it (index host: https://first-database-a81708f.svc.us-east-1-aws.pinecone.io)
      client.init({
        apiKey,
        environment,
      }).then(() => {
        const pineconeIndex = client.Index('first-database');
        const queryRequest = Object.assign({
          vector: [],
          topK: 10,
          includeValues: true,
          includeMetadata: true,
          namespace: "",
        }, pineconeParameter);
        pineconeIndex.query({ queryRequest }).then(res => { resolve(res) }).catch(err => { reject(err) })
      })
    })
  }

  // Convert input text into an embedding vector
  getEmbeddings(input, model = 'text-embedding-ada-002') {
    return new Promise((resolve, reject) => {
      const configuration = new Configuration({
        apiKey: this.ctx.app.config.openai.apiKey,
        basePath: this.ctx.app.config.openai.basePath
      });
      const openai = new OpenAIApi(configuration);
      openai.createEmbedding({ model, input }).then(res => { resolve(res) }).catch(err => { reject(err) })
    })
  }

  /**
   * @description Build the web-search system prompt
   * @param {Object} parameter - search parameters
   */
  async getNetworkPrompt(parameter = {}) {
    if (!parameter.q) return;
    let webSearchParameter = this.routingParameterGeneration(
      Object.assign({ ...this.ctx.app.config.GoogleSearch }, parameter, { q: encodeURI(parameter.q) })
    );
    let searchResults = await this.findNetworkdata(webSearchParameter);
    // fetch and summarize the relevant pages
    let webDatas = await this.parsingWebPageContent(searchResults.data.items, { question: parameter.q, top: 1 });
    return {
      role: 'system',
      content: ` You can refer to the following online search results for ${decodeURIComponent(parameter.q)} to improve your answer. If it is not helpful to you, please ignore this prompt. Network search data \n ${webDatas.map((item, index) => ` ${index + 1} \n content:${item.value} \n Network source:${item.link} \n `).join('\n')}`
    }
  }

  // Ask the model which of the search results is most relevant to the question
  async webCorrelationAnalysis({ question, websites }) {
    websites = websites.map(item => {
      let { title, link, snippet } = item;
      return { title, link, snippet }
    });
    websites.length = 5;
    const configuration = new Configuration({
      apiKey: this.ctx.app.config.openai.apiKey,
      basePath: this.ctx.app.config.openai.basePath
    });
    const openai = new OpenAIApi(configuration);
    let chatCompletion = [];
    await openai.createChatCompletion({
      model: "gpt-3.5-turbo-0613",
      messages: [
        { role: "system", content: `Query a question related to the user's question in the provided JSON list and pass the JSON as a parameter to call the parseWebPageContent method` },
        { role: "system", content: `JSON data: \n ${JSON.stringify(websites)}` },
        { role: "user", content: question }
      ],
      functions: [{
        "name": "parseWebPageContent",
        "description": "This is a method of obtaining web page content by parsing JSON data",
        "parameters": {
          "type": "object",
          "properties": {
            "link": { "type": "string", "description": `This is the 'link' field in JSON data` },
            "title": { "type": "string", "description": `This is the 'title' field in JSON data` },
            "snippet": { "type": "string", "description": `This is the 'snippet' field in JSON data` }
          },
          "required": ["link", "title", "snippet"],
        }
      }]
    }).then(res => {
      if (res.data.choices[0].message?.function_call) {
        let { arguments: parameters } = res.data.choices[0].message?.function_call;
        chatCompletion.push(JSON.parse(parameters));
      }
    }).catch(err => { console.log(err) });
    return chatCompletion;
  }

  /**
   * @description Fetch page content from the result URLs and generate a summary
   * @param {Array} websites - search result items (link/title/snippet)
   * @param {Number} top - how many results to keep
   */
  async parsingWebPageContent(websites, { question, top = 1 }) {
    websites.forEach(item => console.log(item.snippet)); // debug: log the snippets
    if (websites.length > top) websites.length = top;
    let websiteValues = await Promise.all(websites.map(async item => {
      let { link } = item;
      let html;
      await axios.get(link).then(res => { html = res }).catch(err => { html = null; console.log(err) });
      let value = '';
      if (html) {
        value = await this.generateSummary(question, this.parsingHTML(html.data));
      }
      return { value, link }
    }));
    return websiteValues
  }

  /**
   * @description Ask OpenAI to summarize the fetched page content for the question
   */
  async generateSummary(question, value) {
    let enc = encodingForModel('gpt-3.5-turbo');
    let valueTokens = enc.encode(value.join(","));
    // keep the page content within the remaining token budget
    let residueTokensLength = 3096 - enc.encode(question).length;
    if (valueTokens.length > residueTokensLength) valueTokens.length = residueTokensLength;
    const configuration = new Configuration({
      apiKey: this.ctx.app.config.openai.apiKey,
      basePath: this.ctx.app.config.openai.basePath
    });
    const openai = new OpenAIApi(configuration);
    let chatCompletion;
    await openai.createChatCompletion({
      model: "gpt-3.5-turbo",
      messages: [
        { role: "system", content: `Summarize the provided network search content based on user questions` },
        { role: "system", content: `Network search results: \n ${enc.decode(valueTokens)}` },
        { role: "user", content: question }
      ],
    }).then(res => {
      chatCompletion = res.data.choices[0].message.content
    }).catch(err => {
      console.log(err)
      chatCompletion = '';
    });
    return chatCompletion;
  }

  // Accumulate the streamed function_call fragments (name + arguments) into one object
  functionCallSplicing(FunctionConfiguration, item) {
    let { function_call = {} } = item.choices[0].delta;
    return Object.assign(FunctionConfiguration, function_call, {
      arguments: FunctionConfiguration.arguments + (function_call?.arguments || '')
    });
  }

  /**
   * @description Send a chat-completion request to OpenAI
   * @param {Object} questionConfiguration - OpenAI request parameters
   * @param {Object} callBackConfiguration - callback object
   * @param {Function} callBackConfiguration.callbackFn - called with each parsed response chunk
   * @param {Function} callBackConfiguration.readStreamCallback - receives the readable stream (when questionConfiguration.stream is true)
   */
  async askQuestion(questionConfiguration = this.questionConfiguration, callBackConfiguration) {
    const data = Object.assign({}, this.questionConfiguration, questionConfiguration);
    const askQuestionCurlParameter = Object.assign({}, this.askQuestionCurlParameter, { data });
    this.ctx.curl(`${this.ctx.app.config.openai.basePath}/chat/completions`, askQuestionCurlParameter).then(res => {
      let BrokenData = [];
      let FunctionConfiguration = { is: false, name: "", arguments: "" };
      if (askQuestionCurlParameter.data.stream) {
        if (callBackConfiguration.hasOwnProperty('readStreamCallback')) callBackConfiguration.readStreamCallback(res);
        res.res.on('data', dataStream => {
          let chunks = this.dataProcessing(dataStream, BrokenData);
          // if the model decided to call a function, collect the call instead of forwarding chunks
          if (chunks.length && chunks[0].choices[0].delta.hasOwnProperty('function_call')) FunctionConfiguration.is = true;
          chunks.forEach(item => FunctionConfiguration.is
            ? this.functionCallSplicing(FunctionConfiguration, item)
            : callBackConfiguration.callbackFn(item));
        });
        res.res.on('close', () => {
          if (FunctionConfiguration.is) this.functionCall(FunctionConfiguration, { messages: questionConfiguration.messages, callBackConfiguration });
        })
      } else {
        callBackConfiguration.callbackFn(res.data)
      }
    }).catch(error => {
      callBackConfiguration.callbackFn({ catch: error })
    })
  }

  // Build the local knowledge-base system prompt: embed the question, query Pinecone,
  // then concatenate matched chunks until the token budget is used up.
  getLocalPrompt({ content, namespace, pineconeParameter = {} }) {
    let enc = encodingForModel('gpt-3.5-turbo');
    let contentLength = enc.encode(content).length;
    return new Promise((resolve, reject) => {
      this.getEmbeddings(content).then(res => {
        let vector = res.data.data[0].embedding;
        this.findSimilarData({ ...pineconeParameter, vector, namespace }).then(res => {
          let promptContent = res.matches.reduce((total, currentValue, currentIndex, arr) => {
            if (currentIndex == 1) total = { text: total.metadata.text, textLength: enc.encode(total.metadata.text).length };
            let calLength = 3596 - contentLength - total.textLength;
            let remainingLength = calLength > 0 ? calLength : 0;
            if (!remainingLength) return total;
            let currentValueCodes = enc.encode(currentValue.metadata.text);
            currentValueCodes.length = remainingLength;
            let text = total.text + enc.decode(currentValueCodes);
            let textLength = total.textLength + currentValueCodes.length;
            return { text, textLength };
          });
          resolve({
            role: 'system',
            content: `Here is the result of querying the remote vector database \n content: \n ${promptContent.text} `
          })
        }).catch(err => { reject(err) })
      })
    })
  }

  // Read the uploaded file and split it into ~1000-character chunks
  async documentReading() {
    let { filepath, filename } = this.ctx.request.files[0];
    let mimeType = filepath.split('.')[1];
    console.log('file type', mimeType)
    // text splitter
    const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });
    // pick a loader by file type
    const loader = this.documentClassification(filepath, mimeType);
    console.log('reading document')
    const docs = await loader.load();
    // split into chunks
    const splitterDocs = await splitter.splitDocuments(docs);
    console.log('document split complete');
    return { splitterDocs, useTokens: splitterDocs.length * 1000 }
  }

  async findNamespace() {
    let namespaces = await this.ctx.model.Namespace.findAll({ where: { user_id: this.ctx.userinfo.id } });
    return namespaces
  }

  async createdOrUpdateNamespace() {
    let result = null;
    await this.ctx.model.Namespace.findOrCreate({
      where: { name: this.ctx.request.body.namespace, user_id: this.ctx.userinfo.id },
      defaults: {
        name: this.ctx.request.body.namespace,
        user_id: this.ctx.userinfo.id,
        upload_status: 'Ready',
        details: this.ctx.request.body.describe,
        title: this.ctx.request.body.title || ''
      }
    }).then(async (res) => {
      let namespace = res[0];
      if (namespace.upload_status == 'Loading' || namespace.upload_status == 'Adding') this.ctx.throw(500, 'This namespace is currently in use');
      result = await namespace.update({ upload_status: namespace.upload_status == 'Ready' ? 'Loading' : 'Adding' });
    });
    return result
  }

  // Embed the chunks and upsert them into the Pinecone index under the user's namespace
  async fileUpload({ splitterDocs, useTokens }) {
    let upload_status;
    const { apiKey, environment } = this.ctx.app.config.pinecone;
    const client = new PineconeClient();
    await client.init({ apiKey, environment });
    const pineconeIndex = client.Index('first-database');
    let namespace = `${this.ctx.userinfo.id}-${this.ctx.request.body.namespace}`;
    await PineconeStore.fromDocuments(
      splitterDocs,
      new OpenAIEmbeddings({ openAIApiKey: this.ctx.app.config.openai.apiKey }, { basePath: this.ctx.app.config.openai.basePath }),
      { pineconeIndex, namespace }
    ).then(res => {
      console.log('upload succeeded');
      upload_status = 'Success'
    }).catch(err => {
      console.log(err, 'err');
      upload_status = 'Fail'
    }).finally(() => {
      // remove the temporary upload file
      let { filepath } = this.ctx.request.files[0];
      fs.unlink(filepath, (err) => { console.log(err, 'File deleted!'); });
    });
    return upload_status
  }

  // Pick the right LangChain document loader for the file type
  documentClassification(filepath, mimeType) {
    if (!filepath) throw new Error(`Invalid file path: ${filepath}`)
    let loader = null;
    switch (mimeType) {
      case 'pdf':
        loader = new PDFLoader(filepath)
        break;
      case 'epub':
        loader = new EPubLoader(filepath)
        break;
      case 'txt':
        loader = new TextLoader(filepath)
        break;
      case 'docx':
        loader = new DocxLoader(filepath)
        break;
      default:
        break;
    }
    if (!loader) throw new Error(`Unsupported file type: ${mimeType}`)
    return loader;
  }

  /**
   * @param {Object} FunctionConfiguration - accumulated function_call object (name + arguments)
   * @param {*} askQuestionParameter - askQuestion parameters, used to continue the conversation once the function call has completed
   */
  functionCall(FunctionConfiguration, askQuestionParameter) {
    console.log(FunctionConfiguration, 'FunctionConfiguration')
    let { messages, callBackConfiguration } = askQuestionParameter;
    // call the local method the model asked for, inject its result as a system prompt,
    // then re-ask the original question
    this[FunctionConfiguration.name](JSON.parse(FunctionConfiguration.arguments)).then(res => {
      this.askQuestion({ messages: [res, messages[messages.length - 1]], model: 'gpt-3.5-turbo-0613' }, callBackConfiguration)
    })
  }
}

module.exports = OpenaiService
```

Author: 不太聪明的小曹
Link: https://juejin.cn/post/7270871863162208312
Source: 稀土掘金 (Juejin)
Copyright belongs to the author. For commercial reprints, please contact the author for authorization; for non-commercial reprints, please credit the source.
