提交 a5294748 作者: hzg0601

Merge branch 'dev' of github.com:imClumsyPanda/langchain-ChatGLM into dev

...@@ -8,7 +8,6 @@ from typing import List ...@@ -8,7 +8,6 @@ from typing import List
from utils import torch_gc from utils import torch_gc
from tqdm import tqdm from tqdm import tqdm
from pypinyin import lazy_pinyin from pypinyin import lazy_pinyin
from loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader
from models.base import (BaseAnswer, from models.base import (BaseAnswer,
AnswerResult) AnswerResult)
from models.loader.args import parser from models.loader.args import parser
...@@ -59,6 +58,7 @@ def tree(filepath, ignore_dir_names=None, ignore_file_names=None): ...@@ -59,6 +58,7 @@ def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE): def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE):
if filepath.lower().endswith(".md"): if filepath.lower().endswith(".md"):
loader = UnstructuredFileLoader(filepath, mode="elements") loader = UnstructuredFileLoader(filepath, mode="elements")
docs = loader.load() docs = loader.load()
...@@ -67,10 +67,14 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_T ...@@ -67,10 +67,14 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_T
textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size) textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
docs = loader.load_and_split(textsplitter) docs = loader.load_and_split(textsplitter)
elif filepath.lower().endswith(".pdf"): elif filepath.lower().endswith(".pdf"):
# 暂且将paddle相关的loader改为动态加载,可以在不上传pdf/image知识文件的前提下使用protobuf=4.x
from loader import UnstructuredPaddlePDFLoader
loader = UnstructuredPaddlePDFLoader(filepath) loader = UnstructuredPaddlePDFLoader(filepath)
textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size) textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
docs = loader.load_and_split(textsplitter) docs = loader.load_and_split(textsplitter)
elif filepath.lower().endswith(".jpg") or filepath.lower().endswith(".png"): elif filepath.lower().endswith(".jpg") or filepath.lower().endswith(".png"):
# 暂且将paddle相关的loader改为动态加载,可以在不上传pdf/image知识文件的前提下使用protobuf=4.x
from loader import UnstructuredPaddleImageLoader
loader = UnstructuredPaddleImageLoader(filepath, mode="elements") loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size) textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
docs = loader.load_and_split(text_splitter=textsplitter) docs = loader.load_and_split(text_splitter=textsplitter)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论