Commit ba336440 by hzg0601

Merge branch 'dev' of github.com:imClumsyPanda/langchain-ChatGLM into dev

pull for 2023-6-15
@@ -167,6 +167,7 @@ log/*
 vector_store/*
 content/*
 api_content/*
+knowledge_base/*
 llm/*
 embedding/*
...
@@ -229,6 +229,7 @@ Web UI supports the following features:
 - [x] VUE front end

 ## Project discussion group
-![QR code](img/qr_code_30.jpg)
+<img src="img/qr_code_32.jpg" alt="QR code" width="300" height="300" />

-🎉 langchain-ChatGLM project discussion group. If you are interested in this project, you are welcome to join the group chat and take part in the discussion.
+🎉 langchain-ChatGLM project WeChat discussion group. If you are interested in this project, you are welcome to join the group chat and take part in the discussion.
@@ -187,8 +187,9 @@ class LocalDocQA:
                 torch_gc()
             else:
                 if not vs_path:
-                    vs_path = os.path.join(VS_ROOT_PATH,
-                                           f"""{"".join(lazy_pinyin(os.path.splitext(file)[0]))}_FAISS_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}""")
+                    vs_path = os.path.join(KB_ROOT_PATH,
+                                           f"""{"".join(lazy_pinyin(os.path.splitext(file)[0]))}_FAISS_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}""",
+                                           "vector_store")
                 vector_store = MyFAISS.from_documents(docs, self.embeddings)  # docs is a list of Documents
                 torch_gc()
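With this change, an ad-hoc knowledge base built from a single file gets its own directory under KB_ROOT_PATH, with the FAISS index in a vector_store subdirectory, instead of a flat folder under the old VS_ROOT_PATH. A minimal sketch of the path the patched line now produces (the root and file name here are stand-ins):

import datetime
import os
from pypinyin import lazy_pinyin

KB_ROOT_PATH = "knowledge_base"  # stand-in for the configured root
file = "上市公司报告.pdf"        # hypothetical uploaded file

# Same construction as the patched code:
# knowledge_base/<pinyin-name>_FAISS_<timestamp>/vector_store
vs_path = os.path.join(KB_ROOT_PATH,
                       f"""{"".join(lazy_pinyin(os.path.splitext(file)[0]))}_FAISS_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}""",
                       "vector_store")
print(vs_path)  # e.g. knowledge_base/shangshigongsibaogao_FAISS_20230615_120000/vector_store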
@@ -283,6 +284,31 @@ class LocalDocQA:
                             "source_documents": result_docs}
                 yield response, history

+    def delete_file_from_vector_store(self,
+                                      filepath: str or List[str],
+                                      vs_path):
+        vector_store = load_vector_store(vs_path, self.embeddings)
+        status = vector_store.delete_doc(filepath)
+        return status
+
+    def update_file_from_vector_store(self,
+                                      filepath: str or List[str],
+                                      vs_path,
+                                      docs: List[Document],):
+        vector_store = load_vector_store(vs_path, self.embeddings)
+        status = vector_store.update_doc(filepath, docs)
+        return status
+
+    def list_file_from_vector_store(self,
+                                    vs_path,
+                                    fullpath=False):
+        vector_store = load_vector_store(vs_path, self.embeddings)
+        docs = vector_store.list_docs()
+        if fullpath:
+            return docs
+        else:
+            return [os.path.split(doc)[-1] for doc in docs]
+
 if __name__ == "__main__":
     # Initialize messages
...
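These three wrappers give LocalDocQA a file-management API over a loaded vector store: delete and update dispatch to the new MyFAISS.delete_doc/update_doc further down in this commit, and list strips directory prefixes unless fullpath is set. A hedged usage sketch (the knowledge base path and file name are hypothetical, and the init_cfg arguments are elided):

from chains.local_doc_qa import LocalDocQA

local_doc_qa = LocalDocQA()
local_doc_qa.init_cfg()  # hypothetical: load the default LLM and embedding models

vs_path = "knowledge_base/samples/vector_store"  # hypothetical store

# File names only vs. full stored source paths
print(local_doc_qa.list_file_from_vector_store(vs_path))
print(local_doc_qa.list_file_from_vector_store(vs_path, fullpath=True))

# delete_doc matches on Document.metadata["source"], so pass the stored path
status = local_doc_qa.delete_file_from_vector_store(
    "knowledge_base/samples/content/test.pdf", vs_path)
print(status)  # "docs delete success" or "docs delete fail"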
@@ -64,7 +64,7 @@ def start_api(ip, port):
 # then initialized in cli.py
 @start.command(name="cli", context_settings=dict(help_option_names=['-h', '--help']))
-def start_cli(info):
+def start_cli():
     print("通过cli.py调用cli_demo...")
     from models import shared
...
@@ -79,9 +79,7 @@ def start_cli(info):
 # so it is recommended not to start the webui with the command above; comment out the statement below
 @start.command(name="webui", context_settings=dict(help_option_names=['-h', '--help']))
-@click.option('-i', '--info', default="start client", show_default=True, type=str)
-def start_webui(info):
-    print(info)
+def start_webui():
     import webui
...
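The --info option was dead weight: Click injected it only for the handler to print it, and the stray info parameter on start_cli, which had no matching @click.option, would make Click call the function with no arguments and fail with a TypeError. With the option gone, both handlers take no arguments. A minimal standalone sketch of the option-less pattern:

import click

@click.group()
def start():
    """Launcher group, mirroring the structure of the project's cli.py."""
    pass

# An option-less command: Click passes no arguments, so the function takes none.
@start.command(name="webui", context_settings=dict(help_option_names=['-h', '--help']))
def start_webui():
    click.echo("starting webui...")

if __name__ == "__main__":
    start()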
@@ -74,7 +74,7 @@ llm_model_dict = {
     "vicuna-13b-hf": {
         "name": "vicuna-13b-hf",
         "pretrained_model_name": "vicuna-13b-hf",
-        "local_model_path": "/media/checkpoint/vicuna-13b-hf",
+        "local_model_path": None,
         "provides": "LLamaLLM"
     },
@@ -119,10 +119,8 @@ USE_PTUNING_V2 = False
 # LLM running device
 LLM_DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

-VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vector_store")
-UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content")
+# Default knowledge base storage path
+KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base")

 # Context-based prompt template; be sure to keep "{question}" and "{context}"
 PROMPT_TEMPLATE = """已知信息:
@@ -139,10 +137,10 @@ SENTENCE_SIZE = 100
 # length of a single context chunk after matching
 CHUNK_SIZE = 250

-# LLM input history length
+# length of the chat history passed to the LLM
 LLM_HISTORY_LEN = 3

-# return top-k text chunk from vector store
+# number of matched chunks returned by knowledge base retrieval
 VECTOR_SEARCH_TOP_K = 5

 # Relevance score for knowledge retrieval; values range roughly from 0 to 1100. 0 disables the filter; in testing, values below 500 produced more precise matches
...
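The score threshold acts as an upper bound on the FAISS L2 distance score: 0 disables the filter, and any enabled threshold drops hits whose score exceeds it (lower scores mean closer matches). This mirrors the guard used in MyFAISS later in this commit; a sketch of the filter in isolation:

def keep_hit(score: float, score_threshold: float) -> bool:
    # Mirrors `0 < self.score_threshold < scores[0][j]` in MyFAISS:
    # a hit is discarded when the threshold is enabled (> 0) and the
    # distance-style score exceeds it.
    return not (0 < score_threshold < score)

assert keep_hit(300, 500)       # kept: closer than the threshold
assert not keep_hit(800, 500)   # dropped: too far away
assert keep_hit(800, 0)         # threshold disabled, everything passes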
@@ -33,7 +33,9 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader):

 if __name__ == "__main__":
-    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.jpg")
+    import sys
+    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base", "samples", "content", "test.jpg")
     loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
     docs = loader.load()
     for doc in docs:
...
@@ -49,7 +49,9 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):

 if __name__ == "__main__":
-    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.pdf")
+    import sys
+    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base", "samples", "content", "test.pdf")
     loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
     docs = loader.load()
     for doc in docs:
...
@@ -98,9 +98,10 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
         """
         formatted_history = ''
         history = history[-self.history_len:] if self.history_len > 0 else []
-        for i, (old_query, response) in enumerate(history):
-            formatted_history += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
-        formatted_history += "[Round {}]\n问:{}\n答:".format(len(history), query)
+        if len(history) > 0:
+            for i, (old_query, response) in enumerate(history):
+                formatted_history += "### Human:{}\n### Assistant:{}\n".format(old_query, response)
+        formatted_history += "### Human:{}\n### Assistant:".format(query)
         return formatted_history

     def prepare_inputs_for_generation(self,
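This switches the prompt from ChatGLM-style numbered rounds to the "### Human / ### Assistant" convention that Vicuna-style LLaMA checkpoints expect, which also matches the stop string '\n###' passed to _call further down. A quick illustration of the string the method now builds:

history = [("你好", "你好!有什么可以帮您?")]
query = "介绍一下这个项目"

formatted = ""
for old_query, response in history:
    formatted += "### Human:{}\n### Assistant:{}\n".format(old_query, response)
formatted += "### Human:{}\n### Assistant:".format(query)
print(formatted)
# ### Human:你好
# ### Assistant:你好!有什么可以帮您?
# ### Human:介绍一下这个项目
# ### Assistant: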
@@ -140,12 +141,13 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
                       "max_new_tokens": self.max_new_tokens,
                       "num_beams": self.num_beams,
                       "top_p": self.top_p,
+                      "do_sample": True,
                       "top_k": self.top_k,
                       "repetition_penalty": self.repetition_penalty,
                       "encoder_repetition_penalty": self.encoder_repetition_penalty,
                       "min_length": self.min_length,
                       "temperature": self.temperature,
-                      "eos_token_id": self.eos_token_id,
+                      "eos_token_id": self.checkPoint.tokenizer.eos_token_id,
                       "logits_processor": self.logits_processor}

 # Vector conversion
@@ -178,6 +180,6 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
         response = self._call(prompt=softprompt, stop=['\n###'])
         answer_result = AnswerResult()
-        answer_result.history = history + [[None, response]]
+        answer_result.history = history + [[prompt, response]]
         answer_result.llm_output = {"answer": response}
         yield answer_result
@@ -75,8 +75,8 @@ class MOSSLLM(BaseAnswer, LLM, ABC):
                 repetition_penalty=1.02,
                 num_return_sequences=1,
                 eos_token_id=106068,
-                pad_token_id=self.tokenizer.pad_token_id)
-        response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+                pad_token_id=self.checkPoint.tokenizer.pad_token_id)
+        response = self.checkPoint.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
         self.checkPoint.clear_torch_cache()
         history += [[prompt, response]]
         answer_result = AnswerResult()
...
 from langchain.vectorstores import FAISS
 from langchain.vectorstores.base import VectorStore
 from langchain.vectorstores.faiss import dependable_faiss_import
-from typing import Any, Callable, List, Tuple, Dict
+from typing import Any, Callable, List, Dict
 from langchain.docstore.base import Docstore
 from langchain.docstore.document import Document
 import numpy as np
+import copy

 class MyFAISS(FAISS, VectorStore):
@@ -46,6 +47,7 @@ class MyFAISS(FAISS, VectorStore):
         docs = []
         id_set = set()
         store_len = len(self.index_to_docstore_id)
+        rearrange_id_list = False
         for j, i in enumerate(indices[0]):
             if i == -1 or 0 < self.score_threshold < scores[0][j]:
                 # This happens when not enough docs are returned.
@@ -53,11 +55,13 @@ class MyFAISS(FAISS, VectorStore):
             _id = self.index_to_docstore_id[i]
             doc = self.docstore.search(_id)
             if (not self.chunk_conent) or ("context_expand" in doc.metadata and not doc.metadata["context_expand"]):
+                # If the matched text does not need expanded context, run the code below
                 if not isinstance(doc, Document):
                     raise ValueError(f"Could not find document for id {_id}, got {doc}")
                 doc.metadata["score"] = int(scores[0][j])
                 docs.append(doc)
                 continue
+
             id_set.add(i)
             docs_len = len(doc.page_content)
             for k in range(1, max(i, store_len - i)):
@@ -72,15 +76,17 @@ class MyFAISS(FAISS, VectorStore):
                     if l not in id_set and 0 <= l < len(self.index_to_docstore_id):
                         _id0 = self.index_to_docstore_id[l]
                         doc0 = self.docstore.search(_id0)
-                        if docs_len + len(doc0.page_content) > self.chunk_size or doc0.metadata["source"] != doc.metadata["source"]:
+                        if docs_len + len(doc0.page_content) > self.chunk_size or doc0.metadata["source"] != \
+                                doc.metadata["source"]:
                             break_flag = True
                             break
                         elif doc0.metadata["source"] == doc.metadata["source"]:
                             docs_len += len(doc0.page_content)
                             id_set.add(l)
+                            rearrange_id_list = True
                 if break_flag:
                     break
-        if (not self.chunk_conent) or ("add_context" in doc.metadata and not doc.metadata["add_context"]):
+        if (not self.chunk_conent) or (not rearrange_id_list):
             return docs
         if len(id_set) == 0 and self.score_threshold > 0:
             return []
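The expansion loop absorbs neighbouring chunks into a hit while they share the same source file and the accumulated length stays within chunk_size; rearrange_id_list records whether any merge actually happened, so the early return now fires when nothing was expanded rather than consulting per-document metadata. A toy walk-through of the merge rule (simplified to a fixed visiting order; the real loop alternates above and below the hit):

# Hypothetical store: index position -> (source file, chunk length)
store = {10: ("a.md", 120), 11: ("a.md", 100), 12: ("a.md", 90), 13: ("b.md", 80)}
chunk_size = 250

hit = 11                        # the matched chunk
merged, total = {hit}, store[hit][1]
for neighbour in (10, 12, 13):  # walk outward from the hit
    source, length = store[neighbour]
    if total + length > chunk_size or source != store[hit][0]:
        break                   # stop at the size limit or a source boundary
    merged.add(neighbour)
    total += length

print(sorted(merged), total)    # [10, 11] 220 -- chunk 12 would push the total past 250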
@@ -90,7 +96,8 @@ class MyFAISS(FAISS, VectorStore):
         for id in id_seq:
             if id == id_seq[0]:
                 _id = self.index_to_docstore_id[id]
-                doc = self.docstore.search(_id)
+                # doc = self.docstore.search(_id)
+                doc = copy.deepcopy(self.docstore.search(_id))
             else:
                 _id0 = self.index_to_docstore_id[id]
                 doc0 = self.docstore.search(_id0)
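docstore.search returns the very Document object held in the docstore, and the merge step below concatenates neighbouring chunks onto doc.page_content, so without a copy each query would permanently mutate the stored document. The deepcopy breaks that aliasing; a minimal demonstration of the bug it prevents:

import copy
from dataclasses import dataclass

@dataclass
class Doc:
    page_content: str

store = {"id0": Doc("chunk A")}   # stands in for the docstore

aliased = store["id0"]            # what the old code effectively held
aliased.page_content += " chunk B"
print(store["id0"].page_content)  # "chunk A chunk B" -- the store is corrupted

store = {"id0": Doc("chunk A")}
safe = copy.deepcopy(store["id0"])  # what the patched code does
safe.page_content += " chunk B"
print(store["id0"].page_content)    # "chunk A" -- the store is untouched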
@@ -101,3 +108,33 @@ class MyFAISS(FAISS, VectorStore):
                 doc.metadata["score"] = int(doc_score)
                 docs.append(doc)
         return docs
+
+    def delete_doc(self, source: str or List[str]):
+        try:
+            if isinstance(source, str):
+                ids = [k for k, v in self.docstore._dict.items() if v.metadata["source"] == source]
+            else:
+                ids = [k for k, v in self.docstore._dict.items() if v.metadata["source"] in source]
+            if len(ids) == 0:
+                return f"docs delete fail"
+            else:
+                for id in ids:
+                    index = list(self.index_to_docstore_id.keys())[list(self.index_to_docstore_id.values()).index(id)]
+                    self.index_to_docstore_id.pop(index)
+                    self.docstore._dict.pop(id)
+                return f"docs delete success"
+        except Exception as e:
+            print(e)
+            return f"docs delete fail"
+
+    def update_doc(self, source, new_docs):
+        try:
+            delete_len = self.delete_doc(source)
+            ls = self.add_documents(new_docs)
+            return f"docs update success"
+        except Exception as e:
+            print(e)
+            return f"docs update fail"
+
+    def list_docs(self):
+        return list(set(v.metadata["source"] for v in self.docstore._dict.values()))
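delete_doc finds matching entries by Document.metadata["source"], then removes each one from both the docstore and the index_to_docstore_id mapping via a value-to-key reverse lookup. Note that, as written, the vectors themselves stay in the FAISS index and the surviving index positions are not renumbered, so the index and the mapping can drift apart until the store is rebuilt. The reverse lookup itself is the standard list-based idiom:

# Reverse lookup: find the FAISS index position that maps to a docstore id.
index_to_docstore_id = {0: "id-a", 1: "id-b", 2: "id-c"}
target = "id-b"

keys = list(index_to_docstore_id.keys())
values = list(index_to_docstore_id.values())
position = keys[values.index(target)]  # raises ValueError if target is absent
print(position)  # 1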
@@ -20,9 +20,9 @@ nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path

 def get_vs_list():
     lst_default = ["新建知识库"]
-    if not os.path.exists(VS_ROOT_PATH):
+    if not os.path.exists(KB_ROOT_PATH):
         return lst_default
-    lst = os.listdir(VS_ROOT_PATH)
+    lst = os.listdir(KB_ROOT_PATH)
     if not lst:
         return lst_default
     lst.sort()
@@ -144,18 +144,18 @@ def init_model(llm_model: str = 'chat-glm-6b', embedding_model: str = 'text2vec'):

 def get_vector_store(vs_id, files, sentence_size, history, one_conent, one_content_segmentation):
-    vs_path = os.path.join(VS_ROOT_PATH, vs_id)
+    vs_path = os.path.join(KB_ROOT_PATH, vs_id, "vector_store")
     filelist = []
-    if not os.path.exists(os.path.join(UPLOAD_ROOT_PATH, vs_id)):
-        os.makedirs(os.path.join(UPLOAD_ROOT_PATH, vs_id))
+    if not os.path.exists(os.path.join(KB_ROOT_PATH, vs_id, "content")):
+        os.makedirs(os.path.join(KB_ROOT_PATH, vs_id, "content"))
     if local_doc_qa.llm and local_doc_qa.embeddings:
         if isinstance(files, list):
             for file in files:
                 filename = os.path.split(file.name)[-1]
                 shutil.move(file.name, os.path.join(
-                    UPLOAD_ROOT_PATH, vs_id, filename))
+                    KB_ROOT_PATH, vs_id, "content", filename))
                 filelist.append(os.path.join(
-                    UPLOAD_ROOT_PATH, vs_id, filename))
+                    KB_ROOT_PATH, vs_id, "content", filename))
             vs_path, loaded_files = local_doc_qa.init_knowledge_vector_store(
                 filelist, vs_path, sentence_size)
         else:
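Across these call sites, the two old parallel roots, vector_store/ for indexes and content/ for uploads, collapse into a single per-knowledge-base directory under knowledge_base/. A sketch of the resulting layout (the knowledge base name is a stand-in):

import os

KB_ROOT_PATH = "knowledge_base"
vs_id = "samples"  # hypothetical knowledge base name

content_dir = os.path.join(KB_ROOT_PATH, vs_id, "content")   # uploaded source files
vs_path = os.path.join(KB_ROOT_PATH, vs_id, "vector_store")  # FAISS index built from them
os.makedirs(content_dir, exist_ok=True)

# knowledge_base/
# └── samples/
#     ├── content/       <- raw documents
#     └── vector_store/  <- index built from them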
@@ -516,7 +516,7 @@ with st.form('my_form', clear_on_submit=True):
             last_response = output_messages()
             for history, _ in answer(q,
                                      vs_path=os.path.join(
-                                          VS_ROOT_PATH, vs_path),
+                                          KB_ROOT_PATH, vs_path, "vector_store"),
                                      history=[],
                                      mode=mode,
                                      score_threshold=score_threshold,
...