提交 64275cb7 作者: imClumsyPanda

update textsplitter

上级 47e9bdb1
from langchain.text_splitter import CharacterTextSplitter from langchain.text_splitter import CharacterTextSplitter
import re import re
from typing import List from typing import List
from modelscope.pipelines import pipeline
p = pipeline(
task="document-segmentation",
model='damo/nlp_bert_document-segmentation_chinese-base',
device="cpu")
class AliTextSplitter(CharacterTextSplitter): class AliTextSplitter(CharacterTextSplitter):
def __init__(self, pdf: bool = False, **kwargs): def __init__(self, pdf: bool = False, **kwargs):
...@@ -21,6 +16,12 @@ class AliTextSplitter(CharacterTextSplitter): ...@@ -21,6 +16,12 @@ class AliTextSplitter(CharacterTextSplitter):
text = re.sub(r"\n{3,}", r"\n", text) text = re.sub(r"\n{3,}", r"\n", text)
text = re.sub('\s', " ", text) text = re.sub('\s', " ", text)
text = re.sub("\n\n", "", text) text = re.sub("\n\n", "", text)
from modelscope.pipelines import pipeline
p = pipeline(
task="document-segmentation",
model='damo/nlp_bert_document-segmentation_chinese-base',
device="cpu")
result = p(documents=text) result = p(documents=text)
sent_list = [i for i in result["text"].split("\n\t") if i] sent_list = [i for i in result["text"].split("\n\t") if i]
return sent_list return sent_list
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论