Commit ba336440 by hzg0601

Merge branch 'dev' of github.com:imClumsyPanda/langchain-ChatGLM into dev

pull for 2023-6-15
......@@ -167,6 +167,7 @@ log/*
vector_store/*
content/*
api_content/*
knowledge_base/*
llm/*
embedding/*
......
......@@ -229,6 +229,7 @@ Web UI 可以实现如下功能:
- [x] VUE 前端
## 项目交流群
![二维码](img/qr_code_30.jpg)
<img src="img/qr_code_32.jpg" alt="二维码" width="300" height="300" />
🎉 langchain-ChatGLM 项目交流群,如果你也对本项目感兴趣,欢迎加入群聊参与讨论交流。
🎉 langchain-ChatGLM 项目微信交流群,如果你也对本项目感兴趣,欢迎加入群聊参与讨论交流。
......@@ -187,8 +187,9 @@ class LocalDocQA:
torch_gc()
else:
if not vs_path:
vs_path = os.path.join(VS_ROOT_PATH,
f"""{"".join(lazy_pinyin(os.path.splitext(file)[0]))}_FAISS_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}""")
vs_path = os.path.join(KB_ROOT_PATH,
f"""{"".join(lazy_pinyin(os.path.splitext(file)[0]))}_FAISS_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}""",
"vector_store")
vector_store = MyFAISS.from_documents(docs, self.embeddings) # docs 为Document列表
torch_gc()
......@@ -283,6 +284,31 @@ class LocalDocQA:
"source_documents": result_docs}
yield response, history
def delete_file_from_vector_store(self,
filepath: str or List[str],
vs_path):
vector_store = load_vector_store(vs_path, self.embeddings)
status = vector_store.delete_doc(filepath)
return status
def update_file_from_vector_store(self,
filepath: str or List[str],
vs_path,
docs: List[Document],):
vector_store = load_vector_store(vs_path, self.embeddings)
status = vector_store.update_doc(filepath, docs)
return status
def list_file_from_vector_store(self,
vs_path,
fullpath=False):
vector_store = load_vector_store(vs_path, self.embeddings)
docs = vector_store.list_docs()
if fullpath:
return docs
else:
return [os.path.split(doc)[-1] for doc in docs]
if __name__ == "__main__":
# 初始化消息
......
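Note: the three methods added above are thin wrappers; they load the FAISS store at `vs_path` and delegate to the new `MyFAISS.delete_doc` / `update_doc` / `list_docs` helpers introduced later in this commit. A minimal usage sketch, assuming this repository's module layout (the `samples` knowledge base name and the argument-less `init_cfg()` call are illustrative assumptions, not part of the commit):

```python
import os

from configs.model_config import KB_ROOT_PATH
from chains.local_doc_qa import LocalDocQA

local_doc_qa = LocalDocQA()
local_doc_qa.init_cfg()  # loads embedding/LLM models; real arguments depend on your config

# Per-knowledge-base paths under the new KB_ROOT_PATH layout (illustrative)
vs_path = os.path.join(KB_ROOT_PATH, "samples", "vector_store")
content_path = os.path.join(KB_ROOT_PATH, "samples", "content")

# Source files currently indexed in this knowledge base (basenames by default)
print(local_doc_qa.list_file_from_vector_store(vs_path))

# Remove every chunk that came from one uploaded file
print(local_doc_qa.delete_file_from_vector_store(
    os.path.join(content_path, "test.pdf"), vs_path))  # "docs delete success" on success
```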
......@@ -64,7 +64,7 @@ def start_api(ip, port):
# 然后在cli.py里初始化
@start.command(name="cli", context_settings=dict(help_option_names=['-h', '--help']))
def start_cli(info):
def start_cli():
print("通过cli.py调用cli_demo...")
from models import shared
......@@ -79,9 +79,7 @@ def start_cli(info):
# 故建议不要通过以上命令启动webui,将下述语句注释掉
@start.command(name="webui", context_settings=dict(help_option_names=['-h', '--help']))
@click.option('-i', '--info', default="start client", show_default=True, type=str)
def start_webui(info):
print(info)
def start_webui():
import webui
......
......@@ -74,7 +74,7 @@ llm_model_dict = {
"vicuna-13b-hf": {
"name": "vicuna-13b-hf",
"pretrained_model_name": "vicuna-13b-hf",
"local_model_path": "/media/checkpoint/vicuna-13b-hf",
"local_model_path": None,
"provides": "LLamaLLM"
},
......@@ -119,10 +119,8 @@ USE_PTUNING_V2 = False
# LLM running device
LLM_DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vector_store")
UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content")
# 知识库默认存储路径
KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base")
# 基于上下文的prompt模版,请务必保留"{question}"和"{context}"
PROMPT_TEMPLATE = """已知信息:
......@@ -139,10 +137,10 @@ SENTENCE_SIZE = 100
# 匹配后单段上下文长度
CHUNK_SIZE = 250
# LLM input history length
# 传入LLM的历史记录长度
LLM_HISTORY_LEN = 3
# return top-k text chunk from vector store
# 知识库检索时返回的匹配内容条数
VECTOR_SEARCH_TOP_K = 5
# 知识检索内容相关度 Score, 数值范围约为0-1100,如果为0,则不生效,经测试设置为小于500时,匹配结果更精准
......
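Note: the configuration change above collapses the old top-level `vector_store/` and `content/` directories (`VS_ROOT_PATH`, `UPLOAD_ROOT_PATH`) into a single `KB_ROOT_PATH`, with one sub-directory per knowledge base. A sketch of the resulting path scheme, matching the joins used in the webui changes below (the `samples` name is just an example):

```python
import os
from configs.model_config import KB_ROOT_PATH

kb_id = "samples"
content_path = os.path.join(KB_ROOT_PATH, kb_id, "content")   # uploaded source files
vs_path = os.path.join(KB_ROOT_PATH, kb_id, "vector_store")   # FAISS index for this KB
# i.e. knowledge_base/<kb_id>/content/ and knowledge_base/<kb_id>/vector_store/
# replace the old top-level content/<kb_id>/ and vector_store/<kb_id>/ directories.
```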
......@@ -33,7 +33,9 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader):
if __name__ == "__main__":
filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.jpg")
import sys
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base", "samples", "content", "test.jpg")
loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
docs = loader.load()
for doc in docs:
......
......@@ -49,7 +49,9 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
if __name__ == "__main__":
filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.pdf")
import sys
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base", "samples", "content", "test.pdf")
loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
docs = loader.load()
for doc in docs:
......
......@@ -98,9 +98,10 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
"""
formatted_history = ''
history = history[-self.history_len:] if self.history_len > 0 else []
for i, (old_query, response) in enumerate(history):
formatted_history += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
formatted_history += "[Round {}]\n问:{}\n答:".format(len(history), query)
if len(history) > 0:
for i, (old_query, response) in enumerate(history):
formatted_history += "### Human:{}\n### Assistant:{}\n".format(old_query, response)
formatted_history += "### Human:{}\n### Assistant:".format(query)
return formatted_history
def prepare_inputs_for_generation(self,
......@@ -140,12 +141,13 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
"max_new_tokens": self.max_new_tokens,
"num_beams": self.num_beams,
"top_p": self.top_p,
"do_sample": True,
"top_k": self.top_k,
"repetition_penalty": self.repetition_penalty,
"encoder_repetition_penalty": self.encoder_repetition_penalty,
"min_length": self.min_length,
"temperature": self.temperature,
"eos_token_id": self.eos_token_id,
"eos_token_id": self.checkPoint.tokenizer.eos_token_id,
"logits_processor": self.logits_processor}
# 向量转换
......@@ -178,6 +180,6 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
response = self._call(prompt=softprompt, stop=['\n###'])
answer_result = AnswerResult()
answer_result.history = history + [[None, response]]
answer_result.history = history + [[prompt, response]]
answer_result.llm_output = {"answer": response}
yield answer_result
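Note: two behaviour changes in `LLamaLLM` above: conversation history is now rendered in an Alpaca/Vicuna-style `### Human:` / `### Assistant:` layout instead of the ChatGLM-style `[Round i]\n问:…\n答:…` format, and generation passes `do_sample=True` together with the tokenizer's actual `eos_token_id`. A rough sketch of the prompt the new formatting produces (the example turns are invented):

```python
history = [["Hello", "Hi, how can I help you?"]]
query = "Tell me about LangChain"

formatted_history = ""
for old_query, response in history:
    formatted_history += "### Human:{}\n### Assistant:{}\n".format(old_query, response)
formatted_history += "### Human:{}\n### Assistant:".format(query)

# -> "### Human:Hello\n### Assistant:Hi, how can I help you?\n"
#    "### Human:Tell me about LangChain\n### Assistant:"
```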
......@@ -75,8 +75,8 @@ class MOSSLLM(BaseAnswer, LLM, ABC):
repetition_penalty=1.02,
num_return_sequences=1,
eos_token_id=106068,
pad_token_id=self.tokenizer.pad_token_id)
response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
pad_token_id=self.checkPoint.tokenizer.pad_token_id)
response = self.checkPoint.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
self.checkPoint.clear_torch_cache()
history += [[prompt, response]]
answer_result = AnswerResult()
......
from langchain.vectorstores import FAISS
from langchain.vectorstores.base import VectorStore
from langchain.vectorstores.faiss import dependable_faiss_import
from typing import Any, Callable, List, Tuple, Dict
from typing import Any, Callable, List, Dict
from langchain.docstore.base import Docstore
from langchain.docstore.document import Document
import numpy as np
import copy
class MyFAISS(FAISS, VectorStore):
......@@ -46,6 +47,7 @@ class MyFAISS(FAISS, VectorStore):
docs = []
id_set = set()
store_len = len(self.index_to_docstore_id)
rearrange_id_list = False
for j, i in enumerate(indices[0]):
if i == -1 or 0 < self.score_threshold < scores[0][j]:
# This happens when not enough docs are returned.
......@@ -53,11 +55,13 @@ class MyFAISS(FAISS, VectorStore):
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if (not self.chunk_conent) or ("context_expand" in doc.metadata and not doc.metadata["context_expand"]):
# 匹配出的文本如果不需要扩展上下文则执行如下代码
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
doc.metadata["score"] = int(scores[0][j])
docs.append(doc)
continue
id_set.add(i)
docs_len = len(doc.page_content)
for k in range(1, max(i, store_len - i)):
......@@ -72,15 +76,17 @@ class MyFAISS(FAISS, VectorStore):
if l not in id_set and 0 <= l < len(self.index_to_docstore_id):
_id0 = self.index_to_docstore_id[l]
doc0 = self.docstore.search(_id0)
if docs_len + len(doc0.page_content) > self.chunk_size or doc0.metadata["source"] != doc.metadata["source"]:
if docs_len + len(doc0.page_content) > self.chunk_size or doc0.metadata["source"] != \
doc.metadata["source"]:
break_flag = True
break
elif doc0.metadata["source"] == doc.metadata["source"]:
docs_len += len(doc0.page_content)
id_set.add(l)
rearrange_id_list = True
if break_flag:
break
if (not self.chunk_conent) or ("add_context" in doc.metadata and not doc.metadata["add_context"]):
if (not self.chunk_conent) or (not rearrange_id_list):
return docs
if len(id_set) == 0 and self.score_threshold > 0:
return []
......@@ -90,7 +96,8 @@ class MyFAISS(FAISS, VectorStore):
for id in id_seq:
if id == id_seq[0]:
_id = self.index_to_docstore_id[id]
doc = self.docstore.search(_id)
# doc = self.docstore.search(_id)
doc = copy.deepcopy(self.docstore.search(_id))
else:
_id0 = self.index_to_docstore_id[id]
doc0 = self.docstore.search(_id0)
......@@ -101,3 +108,33 @@ class MyFAISS(FAISS, VectorStore):
doc.metadata["score"] = int(doc_score)
docs.append(doc)
return docs
def delete_doc(self, source: str or List[str]):
try:
if isinstance(source, str):
ids = [k for k, v in self.docstore._dict.items() if v.metadata["source"] == source]
else:
ids = [k for k, v in self.docstore._dict.items() if v.metadata["source"] in source]
if len(ids) == 0:
return f"docs delete fail"
else:
for id in ids:
index = list(self.index_to_docstore_id.keys())[list(self.index_to_docstore_id.values()).index(id)]
self.index_to_docstore_id.pop(index)
self.docstore._dict.pop(id)
return f"docs delete success"
except Exception as e:
print(e)
return f"docs delete fail"
def update_doc(self, source, new_docs):
try:
delete_len = self.delete_doc(source)
ls = self.add_documents(new_docs)
return f"docs update success"
except Exception as e:
print(e)
return f"docs update fail"
def list_docs(self):
return list(set(v.metadata["source"] for v in self.docstore._dict.values()))
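Note: these `MyFAISS` helpers operate on document metadata: `delete_doc` drops every chunk whose `metadata["source"]` matches the given path(s), `update_doc` deletes and then re-adds via `add_documents`, and `list_docs` returns the unique source paths. A short usage sketch, with imports and paths assumed from the rest of this commit rather than verified:

```python
# Hedged sketch; load_vector_store and the embedding model mirror how chains/local_doc_qa.py
# uses MyFAISS in this commit, but the module paths and file names here are illustrative.
import os
from langchain.embeddings import HuggingFaceEmbeddings
from chains.local_doc_qa import load_vector_store
from configs.model_config import KB_ROOT_PATH

embeddings = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
vs_path = os.path.join(KB_ROOT_PATH, "samples", "vector_store")
filepath = os.path.join(KB_ROOT_PATH, "samples", "content", "test.pdf")

vector_store = load_vector_store(vs_path, embeddings)
print(vector_store.list_docs())           # unique metadata["source"] paths in the docstore
print(vector_store.delete_doc(filepath))  # "docs delete success" / "docs delete fail"
```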
......@@ -20,9 +20,9 @@ nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
def get_vs_list():
lst_default = ["新建知识库"]
if not os.path.exists(VS_ROOT_PATH):
if not os.path.exists(KB_ROOT_PATH):
return lst_default
lst = os.listdir(VS_ROOT_PATH)
lst = os.listdir(KB_ROOT_PATH)
if not lst:
return lst_default
lst.sort()
......@@ -144,18 +144,18 @@ def init_model(llm_model: str = 'chat-glm-6b', embedding_model: str = 'text2vec'
def get_vector_store(vs_id, files, sentence_size, history, one_conent, one_content_segmentation):
vs_path = os.path.join(VS_ROOT_PATH, vs_id)
vs_path = os.path.join(KB_ROOT_PATH, vs_id, "vector_store")
filelist = []
if not os.path.exists(os.path.join(UPLOAD_ROOT_PATH, vs_id)):
os.makedirs(os.path.join(UPLOAD_ROOT_PATH, vs_id))
if not os.path.exists(os.path.join(KB_ROOT_PATH, vs_id, "content")):
os.makedirs(os.path.join(KB_ROOT_PATH, vs_id, "content"))
if local_doc_qa.llm and local_doc_qa.embeddings:
if isinstance(files, list):
for file in files:
filename = os.path.split(file.name)[-1]
shutil.move(file.name, os.path.join(
UPLOAD_ROOT_PATH, vs_id, filename))
KB_ROOT_PATH, vs_id, "content", filename))
filelist.append(os.path.join(
UPLOAD_ROOT_PATH, vs_id, filename))
KB_ROOT_PATH, vs_id, "content", filename))
vs_path, loaded_files = local_doc_qa.init_knowledge_vector_store(
filelist, vs_path, sentence_size)
else:
......@@ -516,7 +516,7 @@ with st.form('my_form', clear_on_submit=True):
last_response = output_messages()
for history, _ in answer(q,
vs_path=os.path.join(
VS_ROOT_PATH, vs_path),
KB_ROOT_PATH, vs_path, "vector_store"),
history=[],
mode=mode,
score_threshold=score_threshold,
......