提交 391dc1d3 作者: hzg0601

debug for fastchat-openai-llm

...@@ -226,6 +226,10 @@ Web UI 可以实现如下功能: ...@@ -226,6 +226,10 @@ Web UI 可以实现如下功能:
- [x] [THUDM/chatglm-6b-int4-qe](https://huggingface.co/THUDM/chatglm-6b-int4-qe) - [x] [THUDM/chatglm-6b-int4-qe](https://huggingface.co/THUDM/chatglm-6b-int4-qe)
- [x] [ClueAI/ChatYuan-large-v2](https://huggingface.co/ClueAI/ChatYuan-large-v2) - [x] [ClueAI/ChatYuan-large-v2](https://huggingface.co/ClueAI/ChatYuan-large-v2)
- [x] [fnlp/moss-moon-003-sft](https://huggingface.co/fnlp/moss-moon-003-sft) - [x] [fnlp/moss-moon-003-sft](https://huggingface.co/fnlp/moss-moon-003-sft)
- [x] [bigscience/bloomz-7b1](https://huggingface.co/bigscience/bloomz-7b1)
- [x] [bigscience/bloom-3b](https://huggingface.co/bigscience/bloom-3b)
- [x] [baichuan-inc/baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
- [x] [lmsys/vicuna-13b-delta-v1.1](https://huggingface.co/lmsys/vicuna-13b-delta-v1.1)
- [x] 支持通过调用 [fastchat](https://github.com/lm-sys/FastChat) api 调用 llm - [x] 支持通过调用 [fastchat](https://github.com/lm-sys/FastChat) api 调用 llm
- [x] 增加更多 Embedding 模型支持 - [x] 增加更多 Embedding 模型支持
- [x] [nghuyong/ernie-3.0-nano-zh](https://huggingface.co/nghuyong/ernie-3.0-nano-zh) - [x] [nghuyong/ernie-3.0-nano-zh](https://huggingface.co/nghuyong/ernie-3.0-nano-zh)
...@@ -251,7 +255,7 @@ Web UI 可以实现如下功能: ...@@ -251,7 +255,7 @@ Web UI 可以实现如下功能:
- [x] VUE 前端 - [x] VUE 前端
## 项目交流群 ## 项目交流群
<img src="img/qr_code_42.jpg" alt="二维码" width="300" height="300" /> <img src="img/qr_code_44.jpg" alt="二维码" width="300" height="300" />
🎉 langchain-ChatGLM 项目微信交流群,如果你也对本项目感兴趣,欢迎加入群聊参与讨论交流。 🎉 langchain-ChatGLM 项目微信交流群,如果你也对本项目感兴趣,欢迎加入群聊参与讨论交流。
#encoding:utf-8
import argparse import argparse
import json import json
import os import os
......
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from typing import Any, List
class MyEmbeddings(HuggingFaceEmbeddings):
def __init__(self, **kwargs: Any):
super().__init__(**kwargs)
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Compute doc embeddings using a HuggingFace transformer model.
Args:
texts: The list of texts to embed.
Returns:
List of embeddings, one for each text.
"""
texts = list(map(lambda x: x.replace("\n", " "), texts))
embeddings = self.client.encode(texts, normalize_embeddings=True)
return embeddings.tolist()
def embed_query(self, text: str) -> List[float]:
"""Compute query embeddings using a HuggingFace transformer model.
Args:
text: The text to embed.
Returns:
Embeddings for the text.
"""
text = text.replace("\n", " ")
embedding = self.client.encode(text, normalize_embeddings=True)
return embedding.tolist()
from langchain.vectorstores import FAISS
from typing import Any, Callable, List, Optional, Tuple, Dict
from langchain.docstore.document import Document
from langchain.docstore.base import Docstore
from langchain.vectorstores.utils import maximal_marginal_relevance
from langchain.embeddings.base import Embeddings
import uuid
from langchain.docstore.in_memory import InMemoryDocstore
import numpy as np
def dependable_faiss_import() -> Any:
"""Import faiss if available, otherwise raise error."""
try:
import faiss
except ImportError:
raise ValueError(
"Could not import faiss python package. "
"Please install it with `pip install faiss` "
"or `pip install faiss-cpu` (depending on Python version)."
)
return faiss
class FAISSVS(FAISS):
def __init__(self,
embedding_function: Callable[..., Any],
index: Any,
docstore: Docstore,
index_to_docstore_id: Dict[int, str]):
super().__init__(embedding_function, index, docstore, index_to_docstore_id)
def max_marginal_relevance_search_by_vector(
self, embedding: List[float], k: int = 4, fetch_k: int = 20, **kwargs: Any
) -> List[Tuple[Document, float]]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
Returns:
List of Documents with scores selected by maximal marginal relevance.
"""
scores, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
# -1 happens when not enough docs are returned.
embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
mmr_selected = maximal_marginal_relevance(
np.array([embedding], dtype=np.float32), embeddings, k=k
)
selected_indices = [indices[0][i] for i in mmr_selected]
selected_scores = [scores[0][i] for i in mmr_selected]
docs = []
for i, score in zip(selected_indices, selected_scores):
if i == -1:
# This happens when not enough docs are returned.
continue
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append((doc, score))
return docs
def max_marginal_relevance_search(
self,
query: str,
k: int = 4,
fetch_k: int = 20,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
Returns:
List of Documents with scores selected by maximal marginal relevance.
"""
embedding = self.embedding_function(query)
docs = self.max_marginal_relevance_search_by_vector(embedding, k, fetch_k)
return docs
@classmethod
def __from(
cls,
texts: List[str],
embeddings: List[List[float]],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> FAISS:
faiss = dependable_faiss_import()
index = faiss.IndexFlatIP(len(embeddings[0]))
index.add(np.array(embeddings, dtype=np.float32))
# # my code, for speeding up search
# quantizer = faiss.IndexFlatL2(len(embeddings[0]))
# index = faiss.IndexIVFFlat(quantizer, len(embeddings[0]), 100)
# index.train(np.array(embeddings, dtype=np.float32))
# index.add(np.array(embeddings, dtype=np.float32))
documents = []
for i, text in enumerate(texts):
metadata = metadatas[i] if metadatas else {}
documents.append(Document(page_content=text, metadata=metadata))
index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))}
docstore = InMemoryDocstore(
{index_to_id[i]: doc for i, doc in enumerate(documents)}
)
return cls(embedding.embed_query, index, docstore, index_to_id)
...@@ -246,8 +246,8 @@ LLM_HISTORY_LEN = 3 ...@@ -246,8 +246,8 @@ LLM_HISTORY_LEN = 3
# 知识库检索时返回的匹配内容条数 # 知识库检索时返回的匹配内容条数
VECTOR_SEARCH_TOP_K = 5 VECTOR_SEARCH_TOP_K = 5
# 知识检索内容相关度 Score, 数值范围约为0-1100,如果为0,则不生效,经测试设置为小于500时,匹配结果更精准 # 知识检索内容相关度 Score, 数值范围约为0-1100,如果为0,则不生效,建议设置为500左右,经测试设置为小于500时,匹配结果更精准
VECTOR_SEARCH_SCORE_THRESHOLD = 390 VECTOR_SEARCH_SCORE_THRESHOLD = 500
NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data") NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")
......
...@@ -6,6 +6,7 @@ from queue import Queue ...@@ -6,6 +6,7 @@ from queue import Queue
from threading import Thread from threading import Thread
from langchain.callbacks.manager import CallbackManagerForChainRun from langchain.callbacks.manager import CallbackManagerForChainRun
from models.loader import LoaderCheckPoint from models.loader import LoaderCheckPoint
from pydantic import BaseModel
import torch import torch
import transformers import transformers
...@@ -23,13 +24,12 @@ class ListenerToken: ...@@ -23,13 +24,12 @@ class ListenerToken:
self._scores = _scores self._scores = _scores
class AnswerResult: class AnswerResult(BaseModel):
""" """
消息实体 消息实体
""" """
history: List[List[str]] = [] history: List[List[str]] = []
llm_output: Optional[dict] = None llm_output: Optional[dict] = None
listenerToken: ListenerToken = None
class AnswerResultStream: class AnswerResultStream:
...@@ -167,8 +167,6 @@ class BaseAnswer(ABC): ...@@ -167,8 +167,6 @@ class BaseAnswer(ABC):
with generate_with_streaming(inputs=inputs, run_manager=run_manager) as generator: with generate_with_streaming(inputs=inputs, run_manager=run_manager) as generator:
for answerResult in generator: for answerResult in generator:
if answerResult.listenerToken:
output = answerResult.listenerToken.input_ids
yield answerResult yield answerResult
@abstractmethod @abstractmethod
......
...@@ -94,8 +94,6 @@ class ChatGLMLLMChain(BaseAnswer, Chain, ABC): ...@@ -94,8 +94,6 @@ class ChatGLMLLMChain(BaseAnswer, Chain, ABC):
answer_result = AnswerResult() answer_result = AnswerResult()
answer_result.history = history answer_result.history = history
answer_result.llm_output = {"answer": stream_resp} answer_result.llm_output = {"answer": stream_resp}
if listenerQueue.listenerQueue.__len__() > 0:
answer_result.listenerToken = listenerQueue.listenerQueue.pop()
generate_with_callback(answer_result) generate_with_callback(answer_result)
self.checkPoint.clear_torch_cache() self.checkPoint.clear_torch_cache()
else: else:
...@@ -114,8 +112,6 @@ class ChatGLMLLMChain(BaseAnswer, Chain, ABC): ...@@ -114,8 +112,6 @@ class ChatGLMLLMChain(BaseAnswer, Chain, ABC):
answer_result = AnswerResult() answer_result = AnswerResult()
answer_result.history = history answer_result.history = history
answer_result.llm_output = {"answer": response} answer_result.llm_output = {"answer": response}
if listenerQueue.listenerQueue.__len__() > 0:
answer_result.listenerToken = listenerQueue.listenerQueue.pop()
generate_with_callback(answer_result) generate_with_callback(answer_result)
from abc import ABC from abc import ABC
from langchain.chains.base import Chain from langchain.chains.base import Chain
from typing import Any, Dict, List, Optional, Generator, Collection from typing import (
Any, Dict, List, Optional, Generator, Collection, Set,
Callable,
Tuple,
Union)
from models.loader import LoaderCheckPoint from models.loader import LoaderCheckPoint
from langchain.callbacks.manager import CallbackManagerForChainRun from langchain.callbacks.manager import CallbackManagerForChainRun
from models.base import (BaseAnswer, from models.base import (BaseAnswer,
...@@ -8,9 +13,26 @@ from models.base import (BaseAnswer, ...@@ -8,9 +13,26 @@ from models.base import (BaseAnswer,
AnswerResult, AnswerResult,
AnswerResultStream, AnswerResultStream,
AnswerResultQueueSentinelTokenListenerQueue) AnswerResultQueueSentinelTokenListenerQueue)
from tenacity import (
before_sleep_log,
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
from pydantic import Extra, Field, root_validator
from openai import (
ChatCompletion
)
import openai
import logging
import torch import torch
import transformers import transformers
logger = logging.getLogger(__name__)
def _build_message_template() -> Dict[str, str]: def _build_message_template() -> Dict[str, str]:
""" """
...@@ -25,14 +47,25 @@ def _build_message_template() -> Dict[str, str]: ...@@ -25,14 +47,25 @@ def _build_message_template() -> Dict[str, str]:
# 将历史对话数组转换为文本格式 # 将历史对话数组转换为文本格式
def build_message_list(query, history: List[List[str]]) -> Collection[Dict[str, str]]: def build_message_list(query, history: List[List[str]]) -> Collection[Dict[str, str]]:
build_messages: Collection[Dict[str, str]] = [] build_messages: Collection[Dict[str, str]] = []
for i, (old_query, response) in enumerate(history):
user_build_message = _build_message_template()
user_build_message['role'] = 'user'
user_build_message['content'] = old_query
system_build_message = _build_message_template() system_build_message = _build_message_template()
system_build_message['role'] = 'system' system_build_message['role'] = 'system'
system_build_message['content'] = response system_build_message['content'] = "You are a helpful assistant."
build_messages.append(system_build_message)
if history:
for i, (user, assistant) in enumerate(history):
if user:
user_build_message = _build_message_template()
user_build_message['role'] = 'user'
user_build_message['content'] = user
build_messages.append(user_build_message) build_messages.append(user_build_message)
if not assistant:
raise RuntimeError("历史数据结构不正确")
system_build_message = _build_message_template()
system_build_message['role'] = 'assistant'
system_build_message['content'] = assistant
build_messages.append(system_build_message) build_messages.append(system_build_message)
user_build_message = _build_message_template() user_build_message = _build_message_template()
...@@ -43,6 +76,9 @@ def build_message_list(query, history: List[List[str]]) -> Collection[Dict[str, ...@@ -43,6 +76,9 @@ def build_message_list(query, history: List[List[str]]) -> Collection[Dict[str,
class FastChatOpenAILLMChain(RemoteRpcModel, Chain, ABC): class FastChatOpenAILLMChain(RemoteRpcModel, Chain, ABC):
client: Any
"""Timeout for requests to OpenAI completion API. Default is 600 seconds."""
max_retries: int = 6
api_base_url: str = "http://localhost:8000/v1" api_base_url: str = "http://localhost:8000/v1"
model_name: str = "chatglm-6b" model_name: str = "chatglm-6b"
max_token: int = 10000 max_token: int = 10000
...@@ -108,6 +144,35 @@ class FastChatOpenAILLMChain(RemoteRpcModel, Chain, ABC): ...@@ -108,6 +144,35 @@ class FastChatOpenAILLMChain(RemoteRpcModel, Chain, ABC):
def call_model_name(self, model_name): def call_model_name(self, model_name):
self.model_name = model_name self.model_name = model_name
def _create_retry_decorator(self) -> Callable[[Any], Any]:
min_seconds = 1
max_seconds = 60
# Wait 2^x * 1 second between each retry starting with
# 4 seconds, then up to 10 seconds, then 10 seconds afterwards
return retry(
reraise=True,
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds),
retry=(
retry_if_exception_type(openai.error.Timeout)
| retry_if_exception_type(openai.error.APIError)
| retry_if_exception_type(openai.error.APIConnectionError)
| retry_if_exception_type(openai.error.RateLimitError)
| retry_if_exception_type(openai.error.ServiceUnavailableError)
),
before_sleep=before_sleep_log(logger, logging.WARNING),
)
def completion_with_retry(self, **kwargs: Any) -> Any:
"""Use tenacity to retry the completion call."""
retry_decorator = self._create_retry_decorator()
@retry_decorator
def _completion_with_retry(**kwargs: Any) -> Any:
return self.client.create(**kwargs)
return _completion_with_retry(**kwargs)
def _call( def _call(
self, self,
inputs: Dict[str, Any], inputs: Dict[str, Any],
...@@ -121,32 +186,74 @@ class FastChatOpenAILLMChain(RemoteRpcModel, Chain, ABC): ...@@ -121,32 +186,74 @@ class FastChatOpenAILLMChain(RemoteRpcModel, Chain, ABC):
run_manager: Optional[CallbackManagerForChainRun] = None, run_manager: Optional[CallbackManagerForChainRun] = None,
generate_with_callback: AnswerResultStream = None) -> None: generate_with_callback: AnswerResultStream = None) -> None:
history = inputs[self.history_key] history = inputs.get(self.history_key, [])
streaming = inputs[self.streaming_key] streaming = inputs.get(self.streaming_key, False)
prompt = inputs[self.prompt_key] prompt = inputs[self.prompt_key]
stop = inputs.get("stop", "stop")
print(f"__call:{prompt}") print(f"__call:{prompt}")
try: try:
import openai
# Not support yet # Not support yet
# openai.api_key = "EMPTY" # openai.api_key = "EMPTY"
openai.api_key = self.api_key openai.api_key = self.api_key
openai.api_base = self.api_base_url openai.api_base = self.api_base_url
except ImportError: self.client = openai.ChatCompletion
except AttributeError:
raise ValueError( raise ValueError(
"Could not import openai python package. " "`openai` has no `ChatCompletion` attribute, this is likely "
"Please install it with `pip install openai`." "due to an old version of the openai package. Try upgrading it "
"with `pip install --upgrade openai`."
) )
# create a chat completion msg = build_message_list(prompt, history=history)
completion = openai.ChatCompletion.create(
model=self.model_name,
messages=build_message_list(prompt,history=history)
)
print(f"response:{completion.choices[0].message.content}")
print(f"+++++++++++++++++++++++++++++++++++")
history += [[prompt, completion.choices[0].message.content]] if streaming:
params = {"stream": streaming,
"model": self.model_name,
"stop": stop}
out_str = ""
for stream_resp in self.completion_with_retry(
messages=msg,
**params
):
role = stream_resp["choices"][0]["delta"].get("role", "")
token = stream_resp["choices"][0]["delta"].get("content", "")
out_str += token
history[-1] = [prompt, out_str]
answer_result = AnswerResult() answer_result = AnswerResult()
answer_result.history = history answer_result.history = history
answer_result.llm_output = {"answer": completion.choices[0].message.content} answer_result.llm_output = {"answer": out_str}
generate_with_callback(answer_result) generate_with_callback(answer_result)
else:
params = {"stream": streaming,
"model": self.model_name,
"stop": stop}
response = self.completion_with_retry(
messages=msg,
**params
)
role = response["choices"][0]["message"].get("role", "")
content = response["choices"][0]["message"].get("content", "")
history += [[prompt, content]]
answer_result = AnswerResult()
answer_result.history = history
answer_result.llm_output = {"answer": content}
generate_with_callback(answer_result)
if __name__ == "__main__":
chain = FastChatOpenAILLMChain()
chain.set_api_key("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")
# chain.set_api_base_url("https://api.openai.com/v1")
# chain.call_model_name("gpt-3.5-turbo")
answer_result_stream_result = chain({"streaming": True,
"prompt": "你好",
"history": []
})
for answer_result in answer_result_stream_result['answer_result_stream']:
resp = answer_result.llm_output["answer"]
print(resp)
...@@ -186,7 +186,5 @@ class LLamaLLMChain(BaseAnswer, Chain, ABC): ...@@ -186,7 +186,5 @@ class LLamaLLMChain(BaseAnswer, Chain, ABC):
answer_result = AnswerResult() answer_result = AnswerResult()
history += [[prompt, reply]] history += [[prompt, reply]]
answer_result.history = history answer_result.history = history
if listenerQueue.listenerQueue.__len__() > 0:
answer_result.listenerToken = listenerQueue.listenerQueue.pop()
answer_result.llm_output = {"answer": reply} answer_result.llm_output = {"answer": reply}
generate_with_callback(answer_result) generate_with_callback(answer_result)
...@@ -11,7 +11,7 @@ beautifulsoup4 ...@@ -11,7 +11,7 @@ beautifulsoup4
icetk icetk
cpm_kernels cpm_kernels
faiss-cpu faiss-cpu
gradio==3.28.3 gradio==3.37.0
fastapi~=0.95.0 fastapi~=0.95.0
uvicorn~=0.21.1 uvicorn~=0.21.1
pypinyin~=0.48.0 pypinyin~=0.48.0
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论