提交 fbdc62d9 作者: imClumsyPanda

merge master

......@@ -178,6 +178,6 @@ Web UI 可以实现如下功能:
- [ ] 实现调用 API 的 Web UI Demo
## 项目交流群
![二维码](img/qr_code_14.jpg)
![二维码](img/qr_code_15.jpg)
🎉 langchain-ChatGLM 项目交流群,如果你也对本项目感兴趣,欢迎加入群聊参与讨论交流。
......@@ -31,7 +31,7 @@ if __name__ == "__main__":
chat_history=history,
streaming=STREAMING):
if STREAMING:
logger.info(resp["result"][last_print_len:], end="", flush=True)
logger.info(resp["result"][last_print_len:])
last_print_len = len(resp["result"])
else:
logger.info(resp["result"])
......
......@@ -69,6 +69,9 @@ LLM_HISTORY_LEN = 3
# return top-k text chunk from vector store
VECTOR_SEARCH_TOP_K = 5
# 如果为0,则不生效,经测试小于500值的结果更精准
VECTOR_SEARCH_SCORE_THRESHOLD = 0
NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")
FLAG_USER_NAME = uuid.uuid4().hex
......
......@@ -5,9 +5,10 @@ from configs.model_config import SENTENCE_SIZE
class ChineseTextSplitter(CharacterTextSplitter):
def __init__(self, pdf: bool = False, **kwargs):
def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs):
super().__init__(**kwargs)
self.pdf = pdf
self.sentence_size = sentence_size
def split_text1(self, text: str) -> List[str]:
if self.pdf:
......@@ -23,7 +24,7 @@ class ChineseTextSplitter(CharacterTextSplitter):
sent_list.append(ele)
return sent_list
def split_text(self, text: str) -> List[str]:
def split_text(self, text: str) -> List[str]: ##此处需要进一步优化逻辑
if self.pdf:
text = re.sub(r"\n{3,}", r"\n", text)
text = re.sub('\s', " ", text)
......@@ -38,15 +39,15 @@ class ChineseTextSplitter(CharacterTextSplitter):
# 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
ls = [i for i in text.split("\n") if i]
for ele in ls:
if len(ele) > SENTENCE_SIZE:
if len(ele) > self.sentence_size:
ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
ele1_ls = ele1.split("\n")
for ele_ele1 in ele1_ls:
if len(ele_ele1) > SENTENCE_SIZE:
if len(ele_ele1) > self.sentence_size:
ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
ele2_ls = ele_ele2.split("\n")
for ele_ele2 in ele2_ls:
if len(ele_ele2) > SENTENCE_SIZE:
if len(ele_ele2) > self.sentence_size:
ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
ele2_id = ele2_ls.index(ele_ele2)
ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论