提交 d898c7dd 作者: imClumsyPanda

update textsplitter

上级 4150af00
from .chinese_text_splitter import ChineseTextSplitter
from .chinese_text_splitter import *
\ No newline at end of file
...@@ -18,8 +18,14 @@ class ChineseTextSplitter(CharacterTextSplitter): ...@@ -18,8 +18,14 @@ class ChineseTextSplitter(CharacterTextSplitter):
text = re.sub('\s', ' ', text) text = re.sub('\s', ' ', text)
text = text.replace("\n\n", "") text = text.replace("\n\n", "")
if use_document_segmentation: if use_document_segmentation:
from modelscope.pipelines import pipeline
p = pipeline(
task="document-segmentation",
model='damo/nlp_bert_document-segmentation_chinese-base',
device="cpu")
result = p(documents=text) result = p(documents=text)
sent_list = [i for i in result["text"].split("\n\t") if i] sent_list = [i for i in result["text"].split("\n\t") if i]
return sent_list
else: else:
sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :;
sent_list = [] sent_list = []
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论