Commit ba336440 by hzg0601

Merge branch 'dev' of github.com:imClumsyPanda/langchain-ChatGLM into dev

pull for 2023-6-15
@@ -167,6 +167,7 @@ log/*
 vector_store/*
 content/*
 api_content/*
+knowledge_base/*
 llm/*
 embedding/*
...
@@ -229,6 +229,7 @@ Web UI supports the following features:
 - [x] VUE front end

 ## Project discussion group
-![QR code](img/qr_code_30.jpg)
+<img src="img/qr_code_32.jpg" alt="QR code" width="300" height="300" />

-🎉 langchain-ChatGLM project discussion group. If you are interested in this project, you are welcome to join the group chat and take part in the discussion.
+🎉 langchain-ChatGLM project WeChat discussion group. If you are interested in this project, you are welcome to join the group chat and take part in the discussion.
@@ -187,8 +187,9 @@ class LocalDocQA:
                 torch_gc()
             else:
                 if not vs_path:
-                    vs_path = os.path.join(VS_ROOT_PATH,
-                                           f"""{"".join(lazy_pinyin(os.path.splitext(file)[0]))}_FAISS_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}""")
+                    vs_path = os.path.join(KB_ROOT_PATH,
+                                           f"""{"".join(lazy_pinyin(os.path.splitext(file)[0]))}_FAISS_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}""",
+                                           "vector_store")
                 vector_store = MyFAISS.from_documents(docs, self.embeddings)  # docs is a list of Documents
                 torch_gc()
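With this change, an ad-hoc knowledge base built from a single file gets its own directory under KB_ROOT_PATH, with the FAISS index in a vector_store subdirectory, instead of a flat folder under the old VS_ROOT_PATH. A minimal sketch of the path the patched line now produces (the root and file name here are stand-ins):

import datetime
import os
from pypinyin import lazy_pinyin

KB_ROOT_PATH = "knowledge_base"  # stand-in for the configured root
file = "上市公司报告.pdf"        # hypothetical uploaded file

# Same construction as the patched code:
# knowledge_base/<pinyin-name>_FAISS_<timestamp>/vector_store
vs_path = os.path.join(KB_ROOT_PATH,
                       f"""{"".join(lazy_pinyin(os.path.splitext(file)[0]))}_FAISS_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}""",
                       "vector_store")
print(vs_path)  # e.g. knowledge_base/shangshigongsibaogao_FAISS_20230615_120000/vector_store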
@@ -283,6 +284,31 @@ class LocalDocQA:
                             "source_documents": result_docs}
                 yield response, history

+    def delete_file_from_vector_store(self,
+                                      filepath: str or List[str],
+                                      vs_path):
+        vector_store = load_vector_store(vs_path, self.embeddings)
+        status = vector_store.delete_doc(filepath)
+        return status
+
+    def update_file_from_vector_store(self,
+                                      filepath: str or List[str],
+                                      vs_path,
+                                      docs: List[Document],):
+        vector_store = load_vector_store(vs_path, self.embeddings)
+        status = vector_store.update_doc(filepath, docs)
+        return status
+
+    def list_file_from_vector_store(self,
+                                    vs_path,
+                                    fullpath=False):
+        vector_store = load_vector_store(vs_path, self.embeddings)
+        docs = vector_store.list_docs()
+        if fullpath:
+            return docs
+        else:
+            return [os.path.split(doc)[-1] for doc in docs]
+
 if __name__ == "__main__":
     # Initialize messages
...
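These three wrappers give LocalDocQA a file-management API over a loaded vector store: delete and update dispatch to the new MyFAISS.delete_doc/update_doc further down in this commit, and list strips directory prefixes unless fullpath is set. A hedged usage sketch (the knowledge base path and file name are hypothetical, and the init_cfg arguments are elided):

from chains.local_doc_qa import LocalDocQA

local_doc_qa = LocalDocQA()
local_doc_qa.init_cfg()  # hypothetical: load the default LLM and embedding models

vs_path = "knowledge_base/samples/vector_store"  # hypothetical store

# File names only vs. full stored source paths
print(local_doc_qa.list_file_from_vector_store(vs_path))
print(local_doc_qa.list_file_from_vector_store(vs_path, fullpath=True))

# delete_doc matches on Document.metadata["source"], so pass the stored path
status = local_doc_qa.delete_file_from_vector_store(
    "knowledge_base/samples/content/test.pdf", vs_path)
print(status)  # "docs delete success" or "docs delete fail"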
@@ -64,7 +64,7 @@ def start_api(ip, port):
 # then initialized in cli.py
 @start.command(name="cli", context_settings=dict(help_option_names=['-h', '--help']))
-def start_cli(info):
+def start_cli():
     print("通过cli.py调用cli_demo...")
     from models import shared
...
@@ -79,9 +79,7 @@ def start_cli(info):
 # so it is recommended not to start the webui with the command above; comment out the statement below
 @start.command(name="webui", context_settings=dict(help_option_names=['-h', '--help']))
-@click.option('-i', '--info', default="start client", show_default=True, type=str)
-def start_webui(info):
-    print(info)
+def start_webui():
     import webui
...
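The --info option was dead weight: Click injected it only for the handler to print it, and the stray info parameter on start_cli, which had no matching @click.option, would make Click call the function with no arguments and fail with a TypeError. With the option gone, both handlers take no arguments. A minimal standalone sketch of the option-less pattern:

import click

@click.group()
def start():
    """Launcher group, mirroring the structure of the project's cli.py."""
    pass

# An option-less command: Click passes no arguments, so the function takes none.
@start.command(name="webui", context_settings=dict(help_option_names=['-h', '--help']))
def start_webui():
    click.echo("starting webui...")

if __name__ == "__main__":
    start()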
@@ -74,7 +74,7 @@ llm_model_dict = {
     "vicuna-13b-hf": {
         "name": "vicuna-13b-hf",
         "pretrained_model_name": "vicuna-13b-hf",
-        "local_model_path": "/media/checkpoint/vicuna-13b-hf",
+        "local_model_path": None,
         "provides": "LLamaLLM"
     },
@@ -119,10 +119,8 @@ USE_PTUNING_V2 = False
 # LLM running device
 LLM_DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

-VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vector_store")
-UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content")
+# Default knowledge base storage path
+KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base")

 # Context-based prompt template; be sure to keep "{question}" and "{context}"
 PROMPT_TEMPLATE = """已知信息:
@@ -139,10 +137,10 @@ SENTENCE_SIZE = 100
 # length of a single context chunk after matching
 CHUNK_SIZE = 250

-# LLM input history length
+# length of the chat history passed to the LLM
 LLM_HISTORY_LEN = 3

-# return top-k text chunk from vector store
+# number of matched chunks returned by knowledge base retrieval
 VECTOR_SEARCH_TOP_K = 5

 # Relevance score for knowledge retrieval; values range roughly from 0 to 1100. 0 disables the filter; in testing, values below 500 produced more precise matches
...
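The score threshold acts as an upper bound on the FAISS L2 distance score: 0 disables the filter, and any enabled threshold drops hits whose score exceeds it (lower scores mean closer matches). This mirrors the guard used in MyFAISS later in this commit; a sketch of the filter in isolation:

def keep_hit(score: float, score_threshold: float) -> bool:
    # Mirrors `0 < self.score_threshold < scores[0][j]` in MyFAISS:
    # a hit is discarded when the threshold is enabled (> 0) and the
    # distance-style score exceeds it.
    return not (0 < score_threshold < score)

assert keep_hit(300, 500)       # kept: closer than the threshold
assert not keep_hit(800, 500)   # dropped: too far away
assert keep_hit(800, 0)         # threshold disabled, everything passes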
@@ -33,7 +33,9 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader):

 if __name__ == "__main__":
-    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.jpg")
+    import sys
+    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base", "samples", "content", "test.jpg")
     loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
     docs = loader.load()
     for doc in docs:
...
@@ -49,7 +49,9 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):

 if __name__ == "__main__":
-    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.pdf")
+    import sys
+    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base", "samples", "content", "test.pdf")
     loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
     docs = loader.load()
     for doc in docs:
...
@@ -98,9 +98,10 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
         """
         formatted_history = ''
         history = history[-self.history_len:] if self.history_len > 0 else []
-        for i, (old_query, response) in enumerate(history):
-            formatted_history += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
-        formatted_history += "[Round {}]\n问:{}\n答:".format(len(history), query)
+        if len(history) > 0:
+            for i, (old_query, response) in enumerate(history):
+                formatted_history += "### Human:{}\n### Assistant:{}\n".format(old_query, response)
+        formatted_history += "### Human:{}\n### Assistant:".format(query)
         return formatted_history

     def prepare_inputs_for_generation(self,
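This switches the prompt from ChatGLM-style numbered rounds to the "### Human / ### Assistant" convention that Vicuna-style LLaMA checkpoints expect, which also matches the stop string '\n###' passed to _call further down. A quick illustration of the string the method now builds:

history = [("你好", "你好!有什么可以帮您?")]
query = "介绍一下这个项目"

formatted = ""
for old_query, response in history:
    formatted += "### Human:{}\n### Assistant:{}\n".format(old_query, response)
formatted += "### Human:{}\n### Assistant:".format(query)
print(formatted)
# ### Human:你好
# ### Assistant:你好!有什么可以帮您?
# ### Human:介绍一下这个项目
# ### Assistant: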
@@ -140,12 +141,13 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
                       "max_new_tokens": self.max_new_tokens,
                       "num_beams": self.num_beams,
                       "top_p": self.top_p,
+                      "do_sample": True,
                       "top_k": self.top_k,
                       "repetition_penalty": self.repetition_penalty,
                       "encoder_repetition_penalty": self.encoder_repetition_penalty,
                       "min_length": self.min_length,
                       "temperature": self.temperature,
-                      "eos_token_id": self.eos_token_id,
+                      "eos_token_id": self.checkPoint.tokenizer.eos_token_id,
                       "logits_processor": self.logits_processor}

 # Vector conversion
@@ -178,6 +180,6 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
         response = self._call(prompt=softprompt, stop=['\n###'])
         answer_result = AnswerResult()
-        answer_result.history = history + [[None, response]]
+        answer_result.history = history + [[prompt, response]]
         answer_result.llm_output = {"answer": response}
         yield answer_result
@@ -75,8 +75,8 @@ class MOSSLLM(BaseAnswer, LLM, ABC):
                 repetition_penalty=1.02,
                 num_return_sequences=1,
                 eos_token_id=106068,
-                pad_token_id=self.tokenizer.pad_token_id)
-        response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+                pad_token_id=self.checkPoint.tokenizer.pad_token_id)
+        response = self.checkPoint.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
         self.checkPoint.clear_torch_cache()
         history += [[prompt, response]]
         answer_result = AnswerResult()
...
 from langchain.vectorstores import FAISS
 from langchain.vectorstores.base import VectorStore
 from langchain.vectorstores.faiss import dependable_faiss_import
-from typing import Any, Callable, List, Tuple, Dict
+from typing import Any, Callable, List, Dict
 from langchain.docstore.base import Docstore
 from langchain.docstore.document import Document
 import numpy as np
+import copy

 class MyFAISS(FAISS, VectorStore):
@@ -46,6 +47,7 @@ class MyFAISS(FAISS, VectorStore):
         docs = []
         id_set = set()
         store_len = len(self.index_to_docstore_id)
+        rearrange_id_list = False
         for j, i in enumerate(indices[0]):
             if i == -1 or 0 < self.score_threshold < scores[0][j]:
                 # This happens when not enough docs are returned.
@@ -53,11 +55,13 @@ class MyFAISS(FAISS, VectorStore):
             _id = self.index_to_docstore_id[i]
             doc = self.docstore.search(_id)
             if (not self.chunk_conent) or ("context_expand" in doc.metadata and not doc.metadata["context_expand"]):
+                # If the matched text does not need expanded context, run the code below
                 if not isinstance(doc, Document):
                     raise ValueError(f"Could not find document for id {_id}, got {doc}")
                 doc.metadata["score"] = int(scores[0][j])
                 docs.append(doc)
                 continue
+
             id_set.add(i)
             docs_len = len(doc.page_content)
             for k in range(1, max(i, store_len - i)):
@@ -72,15 +76,17 @@ class MyFAISS(FAISS, VectorStore):
                     if l not in id_set and 0 <= l < len(self.index_to_docstore_id):
                         _id0 = self.index_to_docstore_id[l]
                         doc0 = self.docstore.search(_id0)
-                        if docs_len + len(doc0.page_content) > self.chunk_size or doc0.metadata["source"] != doc.metadata["source"]:
+                        if docs_len + len(doc0.page_content) > self.chunk_size or doc0.metadata["source"] != \
+                                doc.metadata["source"]:
                             break_flag = True
                             break
                         elif doc0.metadata["source"] == doc.metadata["source"]:
                             docs_len += len(doc0.page_content)
                             id_set.add(l)
+                            rearrange_id_list = True
                 if break_flag:
                     break
-        if (not self.chunk_conent) or ("add_context" in doc.metadata and not doc.metadata["add_context"]):
+        if (not self.chunk_conent) or (not rearrange_id_list):
             return docs
         if len(id_set) == 0 and self.score_threshold > 0:
             return []
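The expansion loop absorbs neighbouring chunks into a hit while they share the same source file and the accumulated length stays within chunk_size; rearrange_id_list records whether any merge actually happened, so the early return now fires when nothing was expanded rather than consulting per-document metadata. A toy walk-through of the merge rule (simplified to a fixed visiting order; the real loop alternates above and below the hit):

# Hypothetical store: index position -> (source file, chunk length)
store = {10: ("a.md", 120), 11: ("a.md", 100), 12: ("a.md", 90), 13: ("b.md", 80)}
chunk_size = 250

hit = 11                        # the matched chunk
merged, total = {hit}, store[hit][1]
for neighbour in (10, 12, 13):  # walk outward from the hit
    source, length = store[neighbour]
    if total + length > chunk_size or source != store[hit][0]:
        break                   # stop at the size limit or a source boundary
    merged.add(neighbour)
    total += length

print(sorted(merged), total)    # [10, 11] 220 -- chunk 12 would push the total past 250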
@@ -90,7 +96,8 @@ class MyFAISS(FAISS, VectorStore):
         for id in id_seq:
             if id == id_seq[0]:
                 _id = self.index_to_docstore_id[id]
-                doc = self.docstore.search(_id)
+                # doc = self.docstore.search(_id)
+                doc = copy.deepcopy(self.docstore.search(_id))
             else:
                 _id0 = self.index_to_docstore_id[id]
                 doc0 = self.docstore.search(_id0)
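docstore.search returns the very Document object held in the docstore, and the merge step below concatenates neighbouring chunks onto doc.page_content, so without a copy each query would permanently mutate the stored document. The deepcopy breaks that aliasing; a minimal demonstration of the bug it prevents:

import copy
from dataclasses import dataclass

@dataclass
class Doc:
    page_content: str

store = {"id0": Doc("chunk A")}   # stands in for the docstore

aliased = store["id0"]            # what the old code effectively held
aliased.page_content += " chunk B"
print(store["id0"].page_content)  # "chunk A chunk B" -- the store is corrupted

store = {"id0": Doc("chunk A")}
safe = copy.deepcopy(store["id0"])  # what the patched code does
safe.page_content += " chunk B"
print(store["id0"].page_content)    # "chunk A" -- the store is untouched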
@@ -101,3 +108,33 @@ class MyFAISS(FAISS, VectorStore):
                 doc.metadata["score"] = int(doc_score)
                 docs.append(doc)
         return docs
+
+    def delete_doc(self, source: str or List[str]):
+        try:
+            if isinstance(source, str):
+                ids = [k for k, v in self.docstore._dict.items() if v.metadata["source"] == source]
+            else:
+                ids = [k for k, v in self.docstore._dict.items() if v.metadata["source"] in source]
+            if len(ids) == 0:
+                return f"docs delete fail"
+            else:
+                for id in ids:
+                    index = list(self.index_to_docstore_id.keys())[list(self.index_to_docstore_id.values()).index(id)]
+                    self.index_to_docstore_id.pop(index)
+                    self.docstore._dict.pop(id)
+                return f"docs delete success"
+        except Exception as e:
+            print(e)
+            return f"docs delete fail"
+
+    def update_doc(self, source, new_docs):
+        try:
+            delete_len = self.delete_doc(source)
+            ls = self.add_documents(new_docs)
+            return f"docs update success"
+        except Exception as e:
+            print(e)
+            return f"docs update fail"
+
+    def list_docs(self):
+        return list(set(v.metadata["source"] for v in self.docstore._dict.values()))
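delete_doc finds matching entries by Document.metadata["source"], then removes each one from both the docstore and the index_to_docstore_id mapping via a value-to-key reverse lookup. Note that, as written, the vectors themselves stay in the FAISS index and the surviving index positions are not renumbered, so the index and the mapping can drift apart until the store is rebuilt. The reverse lookup itself is the standard list-based idiom:

# Reverse lookup: find the FAISS index position that maps to a docstore id.
index_to_docstore_id = {0: "id-a", 1: "id-b", 2: "id-c"}
target = "id-b"

keys = list(index_to_docstore_id.keys())
values = list(index_to_docstore_id.values())
position = keys[values.index(target)]  # raises ValueError if target is absent
print(position)  # 1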
@@ -20,9 +20,9 @@ nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path

 def get_vs_list():
     lst_default = ["新建知识库"]
-    if not os.path.exists(VS_ROOT_PATH):
+    if not os.path.exists(KB_ROOT_PATH):
         return lst_default
-    lst = os.listdir(VS_ROOT_PATH)
+    lst = os.listdir(KB_ROOT_PATH)
     if not lst:
         return lst_default
     lst.sort()
@@ -144,18 +144,18 @@ def init_model(llm_model: str = 'chat-glm-6b', embedding_model: str = 'text2vec'):

 def get_vector_store(vs_id, files, sentence_size, history, one_conent, one_content_segmentation):
-    vs_path = os.path.join(VS_ROOT_PATH, vs_id)
+    vs_path = os.path.join(KB_ROOT_PATH, vs_id, "vector_store")
     filelist = []
-    if not os.path.exists(os.path.join(UPLOAD_ROOT_PATH, vs_id)):
-        os.makedirs(os.path.join(UPLOAD_ROOT_PATH, vs_id))
+    if not os.path.exists(os.path.join(KB_ROOT_PATH, vs_id, "content")):
+        os.makedirs(os.path.join(KB_ROOT_PATH, vs_id, "content"))
     if local_doc_qa.llm and local_doc_qa.embeddings:
         if isinstance(files, list):
             for file in files:
                 filename = os.path.split(file.name)[-1]
                 shutil.move(file.name, os.path.join(
-                    UPLOAD_ROOT_PATH, vs_id, filename))
+                    KB_ROOT_PATH, vs_id, "content", filename))
                 filelist.append(os.path.join(
-                    UPLOAD_ROOT_PATH, vs_id, filename))
+                    KB_ROOT_PATH, vs_id, "content", filename))
             vs_path, loaded_files = local_doc_qa.init_knowledge_vector_store(
                 filelist, vs_path, sentence_size)
         else:
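Across these call sites, the two old parallel roots, vector_store/ for indexes and content/ for uploads, collapse into a single per-knowledge-base directory under knowledge_base/. A sketch of the resulting layout (the knowledge base name is a stand-in):

import os

KB_ROOT_PATH = "knowledge_base"
vs_id = "samples"  # hypothetical knowledge base name

content_dir = os.path.join(KB_ROOT_PATH, vs_id, "content")   # uploaded source files
vs_path = os.path.join(KB_ROOT_PATH, vs_id, "vector_store")  # FAISS index built from them
os.makedirs(content_dir, exist_ok=True)

# knowledge_base/
# └── samples/
#     ├── content/       <- raw documents
#     └── vector_store/  <- index built from them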
@@ -516,7 +516,7 @@ with st.form('my_form', clear_on_submit=True):
             last_response = output_messages()
             for history, _ in answer(q,
                                      vs_path=os.path.join(
-                                          VS_ROOT_PATH, vs_path),
+                                          KB_ROOT_PATH, vs_path, "vector_store"),
                                      history=[],
                                      mode=mode,
                                      score_threshold=score_threshold,
...