Unverified 提交 55504fcd 作者: shrimp 提交者: GitHub

新增加知识库测试能力 (#302)

上级 466bfb7a
...@@ -69,6 +69,9 @@ LLM_HISTORY_LEN = 3 ...@@ -69,6 +69,9 @@ LLM_HISTORY_LEN = 3
# return top-k text chunk from vector store # return top-k text chunk from vector store
VECTOR_SEARCH_TOP_K = 5 VECTOR_SEARCH_TOP_K = 5
# 如果为0,则不生效,经测试小于500值的结果更精准
VECTOR_SEARCH_SCORE_THRESHOLD = 0
NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data") NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")
FLAG_USER_NAME = uuid.uuid4().hex FLAG_USER_NAME = uuid.uuid4().hex
......
...@@ -175,7 +175,7 @@ if __name__ == "__main__": ...@@ -175,7 +175,7 @@ if __name__ == "__main__":
llm_device=LLM_DEVICE, ) llm_device=LLM_DEVICE, )
last_print_len = 0 last_print_len = 0
for resp, history in llm._call("你好", streaming=True): for resp, history in llm._call("你好", streaming=True):
logger.info(resp[last_print_len:]) logger.info(resp[last_print_len:], end="", flush=True)
last_print_len = len(resp) last_print_len = len(resp)
for resp, history in llm._call("你好", streaming=False): for resp, history in llm._call("你好", streaming=False):
logger.info(resp) logger.info(resp)
......
...@@ -5,9 +5,10 @@ from configs.model_config import SENTENCE_SIZE ...@@ -5,9 +5,10 @@ from configs.model_config import SENTENCE_SIZE
class ChineseTextSplitter(CharacterTextSplitter): class ChineseTextSplitter(CharacterTextSplitter):
def __init__(self, pdf: bool = False, **kwargs): def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.pdf = pdf self.pdf = pdf
self.sentence_size = sentence_size
def split_text1(self, text: str) -> List[str]: def split_text1(self, text: str) -> List[str]:
if self.pdf: if self.pdf:
...@@ -23,7 +24,7 @@ class ChineseTextSplitter(CharacterTextSplitter): ...@@ -23,7 +24,7 @@ class ChineseTextSplitter(CharacterTextSplitter):
sent_list.append(ele) sent_list.append(ele)
return sent_list return sent_list
def split_text(self, text: str) -> List[str]: def split_text(self, text: str) -> List[str]: ##此处需要进一步优化逻辑
if self.pdf: if self.pdf:
text = re.sub(r"\n{3,}", r"\n", text) text = re.sub(r"\n{3,}", r"\n", text)
text = re.sub('\s', " ", text) text = re.sub('\s', " ", text)
...@@ -38,15 +39,15 @@ class ChineseTextSplitter(CharacterTextSplitter): ...@@ -38,15 +39,15 @@ class ChineseTextSplitter(CharacterTextSplitter):
# 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。 # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
ls = [i for i in text.split("\n") if i] ls = [i for i in text.split("\n") if i]
for ele in ls: for ele in ls:
if len(ele) > SENTENCE_SIZE: if len(ele) > self.sentence_size:
ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele) ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
ele1_ls = ele1.split("\n") ele1_ls = ele1.split("\n")
for ele_ele1 in ele1_ls: for ele_ele1 in ele1_ls:
if len(ele_ele1) > SENTENCE_SIZE: if len(ele_ele1) > self.sentence_size:
ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1) ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
ele2_ls = ele_ele2.split("\n") ele2_ls = ele_ele2.split("\n")
for ele_ele2 in ele2_ls: for ele_ele2 in ele2_ls:
if len(ele_ele2) > SENTENCE_SIZE: if len(ele_ele2) > self.sentence_size:
ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2) ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
ele2_id = ele2_ls.index(ele_ele2) ele2_id = ele2_ls.index(ele_ele2)
ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[ ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论