aigc-pioneer / jinchat-server · Commits

Commit 4150af00
Authored May 06, 2023 by imClumsyPanda

merge master

Parents: e2d7452c, 3adfecaa

Showing 5 changed files with 34 additions and 21 deletions (+34, -21)
README.md (+1, -1)
api.py (+9, -13)
img/qr_code_11.jpg (+0, -0)
img/qr_code_12.jpg (+0, -0)
textsplitter/chinese_text_splitter.py (+24, -7)
README.md @ 4150af00

@@ -178,6 +178,6 @@ Web UI 可以实现如下功能:
 - [ ] 实现调用 API 的 Web UI Demo

 ## 项目交流群
-![二维码](img/qr_code_11.jpg)
+![二维码](img/qr_code_12.jpg)

 🎉 langchain-ChatGLM 项目交流群,如果你也对本项目感兴趣,欢迎加入群聊参与讨论交流。
api.py @ 4150af00

...
@@ -170,24 +170,25 @@ async def delete_docs(
 async def chat(
-        knowledge_base_id: str = Body(..., description="知识库名字", example="kb1"),
-        question: str = Body(..., description="问题", example="工伤保险是什么?"),
+        knowledge_base_id: str = Body(..., description="Knowledge Base Name", example="kb1"),
+        question: str = Body(..., description="Question", example="工伤保险是什么?"),
         history: List[List[str]] = Body(
             [],
-            description="问题及答案的历史记录",
+            description="History of previous questions and answers",
             example=[
                 [
-                    "这里是问题,如:工伤保险是什么?",
-                    "答案:工伤保险是指用人单位按照国家规定,为本单位的职工和用人单位的其他人员,缴纳工伤保险费,由保险机构按照国家规定的标准,给予工伤保险待遇的社会保险制度。",
+                    "工伤保险是什么?",
+                    "工伤保险是指用人单位按照国家规定,为本单位的职工和用人单位的其他人员,缴纳工伤保险费,由保险机构按照国家规定的标准,给予工伤保险待遇的社会保险制度。",
                 ]
             ],
         ),
 ):
     vs_path = os.path.join(VS_ROOT_PATH, knowledge_base_id)
     resp = {}
-    if os.path.exists(vs_path) and knowledge_base_id:
+    if not os.path.exists(vs_path):
+        raise ValueError(f"Knowledge base {knowledge_base_id} not found")
     for resp, history in local_doc_qa.get_knowledge_based_answer(
-            query=question, vs_path=vs_path, chat_history=history, streaming=False
+            query=question, vs_path=vs_path, chat_history=history, streaming=True
     ):
         pass
     source_documents = [
...
@@ -195,11 +196,6 @@ async def chat(
         f"""相关度:{doc.metadata['score']}\n\n"""
         for inum, doc in enumerate(resp["source_documents"])
     ]
-    else:
-        for resp_s, history in local_doc_qa.llm._call(prompt=question, history=history, streaming=False):
-            pass
-        resp["result"] = resp_s
-        source_documents = [("当前知识库为空,如需基于知识库进行问答,请先加载知识库后,再进行提问。")]
     return ChatMessage(question=question,
...
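In the updated chat endpoint, the for-loop over local_doc_qa.get_knowledge_based_answer(..., streaming=True) with a bare pass drains a streaming generator and keeps only the last yielded value. Below is a minimal sketch of that consumption pattern; the generator here is a hypothetical stand-in, since local_doc_qa itself is not shown in this diff.

from typing import Generator, List, Tuple

def fake_streaming_answer(query: str,
                          chat_history: List[List[str]]
                          ) -> Generator[Tuple[dict, List[List[str]]], None, None]:
    # Hypothetical stand-in for get_knowledge_based_answer(streaming=True):
    # each iteration yields a partially generated answer plus the updated history.
    partial = ""
    for token in ["工伤保险", "是一种", "社会保险制度", "。"]:
        partial += token
        yield {"result": partial}, chat_history + [[query, partial]]

# Same pattern as in the diff: iterate to exhaustion and keep only the final pair.
resp, history = {}, []
for resp, history in fake_streaming_answer("工伤保险是什么?", []):
    pass
print(resp["result"])  # complete answer once the generator is exhausted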
img/qr_code_11.jpg  deleted, 100644 → 0  (276.1 KB)
img/qr_code_12.jpg  added, 0 → 100644  (264.7 KB)
textsplitter/chinese_text_splitter.py @ 4150af00

...
@@ -3,17 +3,25 @@ import re
 from typing import List
 from configs.model_config import SENTENCE_SIZE


 class ChineseTextSplitter(CharacterTextSplitter):
     def __init__(self, pdf: bool = False, **kwargs):
         super().__init__(**kwargs)
         self.pdf = pdf

-    def split_text1(self, text: str) -> List[str]:
+    def split_text1(self, text: str, use_document_segmentation: bool = False) -> List[str]:
+        # use_document_segmentation参数指定是否用语义切分文档,此处采取的文档语义分割模型为达摩院开源的nlp_bert_document-segmentation_chinese-base,论文见https://arxiv.org/abs/2107.09278
+        # 如果使用模型进行文档语义切分,那么需要安装modelscope[nlp]:pip install "modelscope[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+        # 考虑到使用了三个模型,可能对于低配置gpu不太友好,因此这里将模型load进cpu计算,有需要的话可以替换device为自己的显卡id
         if self.pdf:
             text = re.sub(r"\n{3,}", "\n", text)
             text = re.sub('\s', ' ', text)
             text = text.replace("\n\n", "")
-        sent_sep_pattern = re.compile('([﹒﹔;﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del :;
+        if use_document_segmentation:
+            result = p(documents=text)
+            sent_list = [i for i in result["text"].split("\n\t") if i]
+        else:
+            sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del :;
             sent_list = []
             for ele in sent_sep_pattern.split(text):
                 if sent_sep_pattern.match(ele) and sent_list:
...
@@ -22,11 +30,21 @@ class ChineseTextSplitter(CharacterTextSplitter):
                     sent_list.append(ele)
         return sent_list

-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str, use_document_segmentation: bool = False) -> List[str]:
         if self.pdf:
             text = re.sub(r"\n{3,}", r"\n", text)
             text = re.sub('\s', " ", text)
             text = re.sub("\n\n", "", text)
+        if use_document_segmentation:
+            from modelscope.pipelines import pipeline
+            p = pipeline(task="document-segmentation",
+                         model='damo/nlp_bert_document-segmentation_chinese-base',
+                         device="cpu")
+            result = p(documents=text)
+            sent_list = [i for i in result["text"].split("\n\t") if i]
+            return sent_list
+        else:
             text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)  # 单字符断句符
             text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)  # 英文省略号
             text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)  # 中文省略号
...
@@ -47,12 +65,11 @@ class ChineseTextSplitter(CharacterTextSplitter):
                 if len(ele_ele2) > SENTENCE_SIZE:
                     ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
                     ele2_id = ele2_ls.index(ele_ele2)
                     ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[ele2_id + 1:]
                 ele_id = ele1_ls.index(ele_ele1)
                 ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
             id = ls.index(ele)
             ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
         return ls
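The comments added to split_text1 describe an optional semantic-segmentation path based on ModelScope's document-segmentation pipeline, which the new split_text also calls when use_document_segmentation is True. A minimal sketch of that path on its own follows; it assumes modelscope[nlp] has been installed as the new comments recommend, and the sample text is made up for illustration.

# Assumes: pip install "modelscope[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
from modelscope.pipelines import pipeline

# Same pipeline call as in the new split_text(): the segmentation model runs on
# CPU, which the added comments note is friendlier to low-end GPUs.
p = pipeline(task="document-segmentation",
             model='damo/nlp_bert_document-segmentation_chinese-base',
             device="cpu")

text = "工伤保险是什么?工伤保险是指用人单位按照国家规定缴纳工伤保险费,由保险机构按照国家规定的标准给予工伤保险待遇的社会保险制度。"
result = p(documents=text)

# As in the diff, the model returns a single "\n\t"-separated string of segments.
sent_list = [i for i in result["text"].split("\n\t") if i]
print(sent_list)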