Unverified 提交 241e19a9 作者: LuoQianhong 提交者: GitHub

Merge branch 'dev' into fix/kbs_interface_adjust

......@@ -229,7 +229,7 @@ Web UI 可以实现如下功能:
- [x] VUE 前端
## 项目交流群
<img src="img/qr_code_32.jpg" alt="二维码" width="300" height="300" />
<img src="img/qr_code_33.jpg" alt="二维码" width="300" height="300" />
🎉 langchain-ChatGLM 项目微信交流群,如果你也对本项目感兴趣,欢迎加入群聊参与讨论交流。
......@@ -17,6 +17,7 @@ import models.shared as shared
from agent import bing_search
from langchain.docstore.document import Document
from functools import lru_cache
from textsplitter.zh_title_enhance import zh_title_enhance
# patch HuggingFaceEmbeddings to make it hashable
......@@ -56,7 +57,7 @@ def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
return ret_list, [os.path.basename(p) for p in ret_list]
def load_file(filepath, sentence_size=SENTENCE_SIZE):
def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE):
if filepath.lower().endswith(".md"):
loader = UnstructuredFileLoader(filepath, mode="elements")
docs = loader.load()
......@@ -79,6 +80,8 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE):
loader = UnstructuredFileLoader(filepath, mode="elements")
textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
docs = loader.load_and_split(text_splitter=textsplitter)
if using_zh_title_enhance:
docs = zh_title_enhance(docs)
write_check_file(filepath, docs)
return docs
......
......@@ -173,4 +173,9 @@ BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
# 此外,如果是在服务器上,报Failed to establish a new connection: [Errno 110] Connection timed out
# 是因为服务器加了防火墙,需要联系管理员加白名单,如果公司的服务器的话,就别想了GG
BING_SUBSCRIPTION_KEY = ""
\ No newline at end of file
BING_SUBSCRIPTION_KEY = ""
# 是否开启中文标题加强,以及标题增强的相关配置
# 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记;
# 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。
ZH_TITLE_ENHANCE = False
......@@ -33,5 +33,4 @@ numpy~=1.23.5
tqdm~=4.65.0
requests~=2.28.2
tenacity~=8.2.2
# 默认下载的charset_normalizer模块版本过高会抛出,`partially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)`
charset_normalizer==2.1.0
\ No newline at end of file
from configs.model_config import *
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import nltk
from vectorstores import MyFAISS
from chains.local_doc_qa import load_file
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
if __name__ == "__main__":
    # Manual smoke test: load a sample file with title enhancement switched on,
    # index it, and print the hits for one fixed query.

    # Base directory obtained by applying dirname three times to this file's path.
    base_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    filepath = os.path.join(base_dir, "knowledge_base", "samples", "content", "test.txt")

    # Embedding model name and device both come from the star-imported config.
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model_dict[EMBEDDING_MODEL],
        model_kwargs={'device': EMBEDDING_DEVICE},
    )

    docs = load_file(filepath, using_zh_title_enhance=True)
    vector_store = MyFAISS.from_documents(docs, embeddings)

    query = "指令提示技术有什么示例"
    search_result = vector_store.similarity_search(query)
    print(search_result)
from .chinese_text_splitter import ChineseTextSplitter
from .ali_text_splitter import AliTextSplitter
\ No newline at end of file
from .ali_text_splitter import AliTextSplitter
from .zh_title_enhance import zh_title_enhance
\ No newline at end of file
from langchain.docstore.document import Document
import re
def under_non_alpha_ratio(text: str, threshold: float = 0.5):
    """Report whether too few of the non-space characters in ``text`` are alphabetic.

    This helps prevent text like "-----------BREAK---------" from being tagged
    as a title or narrative text. Whitespace characters are not counted.

    Parameters
    ----------
    text
        The input string to test.
    threshold
        Share of alphabetic characters (among non-space characters) below
        which the function returns True.

    Returns
    -------
    bool
        True when the alphabetic share is strictly below ``threshold``.
        False otherwise, including when ``text`` is empty or contains only
        whitespace.
    """
    # Whitespace is excluded from both the numerator and the denominator.
    visible_chars = [char for char in text if char.strip()]
    if not visible_chars:
        # Nothing to measure; returning here also rules out a division by zero,
        # so no blanket exception handler is needed.
        return False
    alpha_count = sum(1 for char in visible_chars if char.isalpha())
    return alpha_count / len(visible_chars) < threshold
# Matches text whose final character is neither a word character nor
# whitespace, i.e. text that ends in punctuation. Compiled once at import time.
_ENDS_IN_PUNCT_RE = re.compile(r"[^\w\s]\Z")


def is_possible_title(
    text: str,
    title_max_word_length: int = 20,
    non_alpha_threshold: float = 0.5,
) -> bool:
    """Check whether ``text`` passes every heuristic for being a title.

    The checks run in order and the first failure returns False:
    the text must be non-empty, must not end in punctuation, must not be
    longer than ``title_max_word_length``, must contain enough alphabetic
    characters, must not be entirely numeric, and must contain at least one
    numeric character within its first five characters.

    Parameters
    ----------
    text
        The input text to check.
    title_max_word_length
        Upper bound on the length of a title. Despite the name, it is compared
        against ``len(text)``, i.e. the number of characters, not words.
    non_alpha_threshold
        Passed to ``under_non_alpha_ratio`` as its ``threshold``; the text is
        rejected when that function returns True.

    Returns
    -------
    bool
        True if every check passes, otherwise False.
    """
    # Empty text can never be a title.
    if len(text) == 0:
        print("Not a title. Text is empty.")
        return False

    # Text ending in punctuation is not a title. This also covers trailing
    # commas and full stops (ASCII and CJK), so no separate endswith check
    # is needed.
    if _ENDS_IN_PUNCT_RE.search(text) is not None:
        return False

    # Length limit, measured in characters (default 20).
    if len(text) > title_max_word_length:
        return False

    # Reject text in which too few of the non-space characters are alphabetic.
    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
        return False

    if text.isnumeric():
        print(f"Not a title. Text is all numeric:\n\n{text}")  # type: ignore
        return False

    # A title must contain a numeric character near its start (first five
    # characters; slicing already handles shorter strings).
    return any(char.isnumeric() for char in text[:5])
def zh_title_enhance(docs: "list[Document]") -> "list[Document]":
    """Tag title-like documents and prefix the documents that follow with the title.

    Documents are visited in order. Each one whose ``page_content`` passes
    ``is_possible_title`` gets ``metadata['category'] = 'cn_Title'`` and becomes
    the current title. Every later non-title document has its ``page_content``
    prefixed with a sentence naming the most recent title. Documents that come
    before the first title are left untouched.

    Parameters
    ----------
    docs
        Documents to process. They are modified in place.

    Returns
    -------
    list[Document]
        The same ``docs`` object that was passed in. An empty input is
        returned unchanged (after printing a notice) rather than being
        replaced by ``None``, so callers can keep treating the result as a list.
    """
    if not docs:
        print("文件不存在")
        return docs

    title = None
    for doc in docs:
        if is_possible_title(doc.page_content):
            doc.metadata['category'] = 'cn_Title'
            title = doc.page_content
        elif title:
            # Only prefix once a title has been seen.
            doc.page_content = f"下文与({title})有关。{doc.page_content}"
    return docs
......@@ -6,6 +6,8 @@ from langchain.docstore.base import Docstore
from langchain.docstore.document import Document
import numpy as np
import copy
import os
from configs.model_config import *
class MyFAISS(FAISS, VectorStore):
......@@ -22,6 +24,9 @@ class MyFAISS(FAISS, VectorStore):
docstore=docstore,
index_to_docstore_id=index_to_docstore_id,
normalize_L2=normalize_L2)
self.score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD
self.chunk_size = CHUNK_SIZE
self.chunk_conent = False
def seperate_list(self, ls: List[int]) -> List[List[int]]:
# TODO: 增加是否属于同一文档的判断
......@@ -53,10 +58,10 @@ class MyFAISS(FAISS, VectorStore):
# This happens when not enough docs are returned.
continue
if i in self.index_to_docstore_id:
_id = self.index_to_docstore_id[i]
_id = self.index_to_docstore_id[i]
# 执行接下来的操作
else:
continue
continue
doc = self.docstore.search(_id)
if (not self.chunk_conent) or ("context_expand" in doc.metadata and not doc.metadata["context_expand"]):
# 匹配出的文本如果不需要扩展上下文则执行如下代码
......@@ -117,8 +122,10 @@ class MyFAISS(FAISS, VectorStore):
try:
if isinstance(source, str):
ids = [k for k, v in self.docstore._dict.items() if v.metadata["source"] == source]
vs_path = os.path.join(os.path.split(os.path.split(source)[0])[0], "vector_store")
else:
ids = [k for k, v in self.docstore._dict.items() if v.metadata["source"] in source]
vs_path = os.path.join(os.path.split(os.path.split(source[0])[0])[0], "vector_store")
if len(ids) == 0:
return f"docs delete fail"
else:
......@@ -126,6 +133,9 @@ class MyFAISS(FAISS, VectorStore):
index = list(self.index_to_docstore_id.keys())[list(self.index_to_docstore_id.values()).index(id)]
self.index_to_docstore_id.pop(index)
self.docstore._dict.pop(id)
# TODO: 从 self.index 中删除对应id
# self.index.reset()
self.save_local(vs_path)
return f"docs delete success"
except Exception as e:
print(e)
......
......@@ -15,7 +15,7 @@ COPY . /app
RUN pnpm run build
FROM frontend AS final
COPY --from=frontend /app/dist /app/public
......
......@@ -63,3 +63,18 @@ export const web_url = () => {
/** Return the value stored on `window.baseApi`. */
export const setapi = () => window.baseApi
/**
 * Issue a GET request to `/local_doc_qa/list_knowledge_base` with empty query params.
 *
 * @param knowledge_base_id Not used by this function; kept so existing callers still type-check.
 * @returns Whatever `api` returns for the request.
 */
export const getkblist = (knowledge_base_id: any) =>
  api({
    method: 'get',
    url: '/local_doc_qa/list_knowledge_base',
    params: {},
  })
/**
 * Issue a POST request to `/local_doc_qa/delete_knowledge_base`.
 *
 * @param params Request payload; it is serialized with `JSON.stringify` before being sent.
 * @returns Whatever `api` returns for the request.
 */
export const deletekb = (params: any) => {
  const payload = JSON.stringify(params)
  return api({
    method: 'post',
    url: '/local_doc_qa/delete_knowledge_base',
    data: payload,
  })
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论