Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
J
jinchat-server
概览
概览
详情
活动
周期分析
版本库
存储库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
aigc-pioneer
jinchat-server
Commits
037b5e45
Unverified
提交
037b5e45
authored
5月 22, 2023
作者:
yawudede
提交者:
GitHub
5月 22, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
local_doc_qa.py
添加精排
上级
b8272bfe
隐藏空白字符变更
内嵌
并排
正在显示
1 个修改的文件
包含
70 行增加
和
2 行删除
+70
-2
local_doc_qa.py
chains/local_doc_qa.py
+70
-2
没有找到文件。
chains/local_doc_qa.py
浏览文件 @
037b5e45
...
...
@@ -20,7 +20,45 @@ from models.loader import LoaderCheckPoint
import
models.shared
as
shared
from
agent
import
bing_search
from
langchain.docstore.document
import
Document
from
sentence_transformers
import
SentenceTransformer
,
CrossEncoder
,
util
class SemanticSearch:
    """k-NN based semantic search used to re-rank retrieved text chunks.

    Candidate texts are embedded with a SentenceTransformer model, the
    embeddings are indexed with ``NearestNeighbors`` (presumably sklearn's
    -- confirm against the file-level imports, which are not visible in
    this hunk), and calling the instance returns the chunks nearest to a
    query string.
    """

    def __init__(self):
        # NOTE(review): model name is hard-coded and the model is loaded
        # eagerly at construction time -- consider lazy loading / config.
        self.use = SentenceTransformer('GanymedeNil_text2vec-large-chinese')
        # Set to True only after fit() has built the k-NN index.
        self.fitted = False

    def fit(self, data, batch=100, n_neighbors=10):
        """Embed ``data`` (a sequence of strings) and build the k-NN index.

        Args:
            data: sequence of candidate text chunks.
            batch: encoding batch size passed to get_text_embedding().
            n_neighbors: requested neighbour count; clamped to the corpus
                size so a small corpus does not make NearestNeighbors raise.
        """
        self.data = data
        self.embeddings = self.get_text_embedding(data, batch=batch)
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the stored chunks (or their indices) nearest to ``text``.

        Args:
            text: query string.
            return_data: if True return the matching chunks themselves,
                otherwise return their integer indices.

        Raises:
            RuntimeError: if fit() has not been called yet.  (Fix: the
                original code would instead fail later with an opaque
                AttributeError on ``self.nn``.)
        """
        if not self.fitted:
            raise RuntimeError(
                "SemanticSearch.fit() must be called before querying")
        inp_emb = self.use.encode([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors

    def get_text_embedding(self, texts, batch=100):
        """Encode ``texts`` in batches of ``batch`` and return one stacked
        numpy array of shape (len(texts), embedding_dim)."""
        embeddings = []
        for start in range(0, len(texts), batch):
            embeddings.append(self.use.encode(texts[start:start + batch]))
        return np.vstack(embeddings)
def get_docs_with_score(docs_with_score):
    """Attach each similarity score to its document's metadata.

    Args:
        docs_with_score: iterable of (document, score) pairs, where each
            document exposes a mutable ``metadata`` dict.

    Returns:
        List of the same document objects, in order, with
        ``doc.metadata["score"]`` set to the paired score.
    """
    def _with_score(document, score):
        # Mutates the document in place; the score is recorded alongside
        # whatever metadata the retriever already attached.
        document.metadata["score"] = score
        return document

    return [_with_score(document, score) for document, score in docs_with_score]
def
load_file
(
filepath
,
sentence_size
=
SENTENCE_SIZE
):
if
filepath
.
lower
()
.
endswith
(
".md"
):
...
...
@@ -262,9 +300,39 @@ class LocalDocQA:
vector_store
.
chunk_conent
=
self
.
chunk_conent
vector_store
.
score_threshold
=
self
.
score_threshold
related_docs_with_score
=
vector_store
.
similarity_search_with_score
(
query
,
k
=
self
.
top_k
)
###########################################精排 之前faiss检索作为粗排 需要设置config参数
####提取文档
related_docs
=
get_docs_with_score
(
related_docs_with_score
)
text_batch0
=
[]
for
i
in
range
(
len
(
related_docs
)):
cut_txt
=
" "
.
join
([
w
for
w
in
list
(
related_docs
[
i
]
.
page_content
)])
cut_txt
=
cut_txt
.
replace
(
" "
,
""
)
text_batch0
.
append
(
cut_txt
)
######文档去重
text_batch_new
=
[]
for
i
in
range
(
len
(
text_batch0
)):
if
text_batch0
[
i
]
in
text_batch_new
:
continue
else
:
while
text_batch_new
and
text_batch_new
[
-
1
]
>
text_batch0
[
i
]
and
text_batch_new
[
-
1
]
in
text_batch0
[
i
+
1
:]:
text_batch_new
.
pop
()
# 弹出栈顶元素
text_batch_new
.
append
(
text_batch0
[
i
])
text_batch_new0
=
"
\n
"
.
join
([
doc
for
doc
in
text_batch_new
])
###精排 采用knn和semantic search
recommender
=
SemanticSearch
()
chunks
=
text_to_chunks
(
text_batch_new0
,
start_page
=
1
)
recommender
.
fit
(
chunks
)
topn_chunks
=
recommender
(
query
)
torch_gc
()
prompt
=
generate_prompt
(
related_docs_with_score
,
query
)
#去掉文字中的空格
topn_chunks0
=
[]
for
i
in
range
(
len
(
topn_chunks
)):
cut_txt
=
topn_chunks
[
i
]
.
replace
(
" "
,
""
)
topn_chunks0
.
append
(
cut_txt
)
############生成prompt
prompt
=
generate_prompt
(
topn_chunks0
,
query
)
########################
for
answer_result
in
self
.
llm
.
generatorAnswer
(
prompt
=
prompt
,
history
=
chat_history
,
streaming
=
streaming
):
resp
=
answer_result
.
llm_output
[
"answer"
]
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论