提交 26817283 作者: imClumsyPanda

update local_doc_qa.py

上级 6d152372
from langchain.embeddings.huggingface import HuggingFaceEmbeddings from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS from langchain.vectorstores import FAISS
from langchain.document_loaders import UnstructuredFileLoader from langchain.document_loaders import UnstructuredFileLoader, TextLoader
from configs.model_config import * from configs.model_config import *
import datetime import datetime
from textsplitter import ChineseTextSplitter from textsplitter import ChineseTextSplitter
...@@ -10,8 +10,7 @@ import numpy as np ...@@ -10,8 +10,7 @@ import numpy as np
from utils import torch_gc from utils import torch_gc
from tqdm import tqdm from tqdm import tqdm
from pypinyin import lazy_pinyin from pypinyin import lazy_pinyin
from loader import UnstructuredPaddleImageLoader from loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader
from loader import UnstructuredPaddlePDFLoader
from models.base import (BaseAnswer, from models.base import (BaseAnswer,
AnswerResult, AnswerResult,
AnswerResultStream, AnswerResultStream,
...@@ -21,14 +20,14 @@ from models.loader import LoaderCheckPoint ...@@ -21,14 +20,14 @@ from models.loader import LoaderCheckPoint
import models.shared as shared import models.shared as shared
def load_file(filepath, sentence_size=SENTENCE_SIZE): def load_file(filepath, sentence_size=SENTENCE_SIZE):
if filepath.lower().endswith(".md"): if filepath.lower().endswith(".md"):
loader = UnstructuredFileLoader(filepath, mode="elements") loader = UnstructuredFileLoader(filepath, mode="elements")
docs = loader.load() docs = loader.load()
elif filepath.lower().endswith(".txt"): elif filepath.lower().endswith(".txt"):
loader = UnstructuredFileLoader(filepath, mode="elements") loader = TextLoader(filepath, autodetect_encoding=True)
docs = loader.load() textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
docs = loader.load_and_split(textsplitter)
elif filepath.lower().endswith(".pdf"): elif filepath.lower().endswith(".pdf"):
loader = UnstructuredPaddlePDFLoader(filepath) loader = UnstructuredPaddlePDFLoader(filepath)
textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size) textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论