Unverified commit 55504fcd authored by shrimp, committed by GitHub

Add knowledge base testing capability (#302)

Parent 466bfb7a
@@ -69,6 +69,9 @@ LLM_HISTORY_LEN = 3
 # return top-k text chunk from vector store
 VECTOR_SEARCH_TOP_K = 5
+# If 0, the threshold is disabled; testing shows values below 500 give more precise results
+VECTOR_SEARCH_SCORE_THRESHOLD = 0
 NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")
 FLAG_USER_NAME = uuid.uuid4().hex
@@ -79,4 +82,4 @@ llm device: {LLM_DEVICE}
 embedding device: {EMBEDDING_DEVICE}
 dir: {os.path.dirname(os.path.dirname(__file__))}
 flagging username: {FLAG_USER_NAME}
-""")
\ No newline at end of file
+""")
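The new VECTOR_SEARCH_SCORE_THRESHOLD setting evidently gates knowledge-base hits by their similarity score, with 0 disabling the check. Below is a minimal sketch of how such a threshold could be applied to the (document, score) pairs a FAISS similarity search returns; the `filter_hits` helper and the sample data are hypothetical illustrations, not code from this commit:

```python
# Hypothetical sketch, not code from this commit. FAISS L2 distances are
# "lower is closer", which is why values below 500 read as more precise.
VECTOR_SEARCH_SCORE_THRESHOLD = 0  # 0 disables the filter

def filter_hits(docs_with_scores, threshold=VECTOR_SEARCH_SCORE_THRESHOLD):
    """Drop hits whose distance exceeds the threshold; 0 keeps everything."""
    if threshold <= 0:
        return [doc for doc, _ in docs_with_scores]
    return [doc for doc, score in docs_with_scores if score < threshold]

# Example: only the first two hits survive a threshold of 500.
hits = [("doc_a", 120.5), ("doc_b", 480.0), ("doc_c", 812.3)]
print(filter_hits(hits, threshold=500))  # ['doc_a', 'doc_b']
```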
@@ -125,7 +125,7 @@ class ChatGLM(LLM):
                 prefix_encoder_file.close()
                 model_config.pre_seq_len = prefix_encoder_config['pre_seq_len']
                 model_config.prefix_projection = prefix_encoder_config['prefix_projection']
             except Exception as e:
                 logger.error(f"加载PrefixEncoder config.json失败: {e}")
             self.model = AutoModel.from_pretrained(model_name_or_path, config=model_config, trust_remote_code=True,
                                                    **kwargs)
@@ -163,7 +163,7 @@ class ChatGLM(LLM):
                     new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
                 self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
                 self.model.transformer.prefix_encoder.float()
             except Exception as e:
                 logger.error(f"加载PrefixEncoder模型参数失败:{e}")
         self.model = self.model.eval()
@@ -175,7 +175,7 @@ if __name__ == "__main__":
                   llm_device=LLM_DEVICE, )
     last_print_len = 0
     for resp, history in llm._call("你好", streaming=True):
-        logger.info(resp[last_print_len:])
+        logger.info(resp[last_print_len:], end="", flush=True)
         last_print_len = len(resp)
     for resp, history in llm._call("你好", streaming=False):
         logger.info(resp)
...
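One caveat on the last hunk: logging.Logger.info does not accept print-style end= or flush= keyword arguments, so the new logger.info(resp[last_print_len:], end="", flush=True) call will raise a TypeError when this demo runs. A self-contained sketch of the same incremental-output pattern using print, which does take those arguments (fake_stream is a stand-in for llm._call(..., streaming=True), not part of this commit):

```python
# Sketch of the streaming demo loop. print() accepts end=/flush=;
# logging.Logger.info does not, and would raise TypeError here.
def fake_stream(prompt):
    """Stand-in for ChatGLM._call(prompt, streaming=True): yields growing replies."""
    for partial in ["你", "你好", "你好!"]:
        yield partial, []  # (response so far, history)

last_print_len = 0
for resp, history in fake_stream("你好"):
    print(resp[last_print_len:], end="", flush=True)  # emit only the new suffix
    last_print_len = len(resp)
print()
```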
@@ -5,9 +5,10 @@ from configs.model_config import SENTENCE_SIZE
 class ChineseTextSplitter(CharacterTextSplitter):
-    def __init__(self, pdf: bool = False, **kwargs):
+    def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs):
         super().__init__(**kwargs)
         self.pdf = pdf
+        self.sentence_size = sentence_size

     def split_text1(self, text: str) -> List[str]:
         if self.pdf:
@@ -23,7 +24,7 @@ class ChineseTextSplitter(CharacterTextSplitter):
             sent_list.append(ele)
         return sent_list

-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> List[str]:   ## the logic here needs further refinement
         if self.pdf:
             text = re.sub(r"\n{3,}", r"\n", text)
             text = re.sub('\s', " ", text)
@@ -38,15 +39,15 @@ class ChineseTextSplitter(CharacterTextSplitter):
         # Many rule sets also handle the semicolon ";", but it is ignored here, as are dashes and English double quotes; simple adjustments can be made if needed.
         ls = [i for i in text.split("\n") if i]
         for ele in ls:
-            if len(ele) > SENTENCE_SIZE:
+            if len(ele) > self.sentence_size:
                 ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
                 ele1_ls = ele1.split("\n")
                 for ele_ele1 in ele1_ls:
-                    if len(ele_ele1) > SENTENCE_SIZE:
+                    if len(ele_ele1) > self.sentence_size:
                         ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
                         ele2_ls = ele_ele2.split("\n")
                         for ele_ele2 in ele2_ls:
-                            if len(ele_ele2) > SENTENCE_SIZE:
+                            if len(ele_ele2) > self.sentence_size:
                                 ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
                                 ele2_id = ele2_ls.index(ele_ele2)
                                 ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
...
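A usage sketch for the updated splitter follows. Note that sentence_size defaults to None, and len(ele) > self.sentence_size would then raise a TypeError, so callers presumably must always pass an explicit value; the import path and sample values below are assumptions, not part of this commit:

```python
# Hypothetical usage; the module path and values are assumed, not from this diff.
from textsplitter.chinese_text_splitter import ChineseTextSplitter

# sentence_size now travels with the instance instead of the global SENTENCE_SIZE.
# Leaving it at the None default would break the len(...) > self.sentence_size
# comparisons above, so pass a concrete limit.
splitter = ChineseTextSplitter(pdf=False, sentence_size=100)
for chunk in splitter.split_text("第一句话。第二句话!第三句话?"):
    print(chunk)
```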