提交 e8a37ff4 作者: imClumsyPanda

update loader.py

上级 d5ffdaa2
...@@ -26,6 +26,9 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE): ...@@ -26,6 +26,9 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE):
if filepath.lower().endswith(".md"): if filepath.lower().endswith(".md"):
loader = UnstructuredFileLoader(filepath, mode="elements") loader = UnstructuredFileLoader(filepath, mode="elements")
docs = loader.load() docs = loader.load()
elif filepath.lower().endswith(".txt"):
loader = UnstructuredFileLoader(filepath, mode="elements")
docs = loader.load()
elif filepath.lower().endswith(".pdf"): elif filepath.lower().endswith(".pdf"):
loader = UnstructuredPaddlePDFLoader(filepath) loader = UnstructuredPaddlePDFLoader(filepath)
textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size) textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
...@@ -47,7 +50,7 @@ def write_check_file(filepath, docs): ...@@ -47,7 +50,7 @@ def write_check_file(filepath, docs):
if not os.path.exists(folder_path): if not os.path.exists(folder_path):
os.makedirs(folder_path) os.makedirs(folder_path)
fp = os.path.join(folder_path, 'load_file.txt') fp = os.path.join(folder_path, 'load_file.txt')
fout = open(fp, 'a') with open(fp, 'a+', encoding='utf-8') as fout:
fout.write("filepath=%s,len=%s" % (filepath, len(docs))) fout.write("filepath=%s,len=%s" % (filepath, len(docs)))
fout.write('\n') fout.write('\n')
for i in docs: for i in docs:
......
...@@ -19,7 +19,7 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader): ...@@ -19,7 +19,7 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False) ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
doc = fitz.open(filepath) doc = fitz.open(filepath)
txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename)) txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
img_name = os.path.join(full_dir_path, '.tmp.png') img_name = os.path.join(full_dir_path, 'tmp.png')
with open(txt_file_path, 'w', encoding='utf-8') as fout: with open(txt_file_path, 'w', encoding='utf-8') as fout:
for i in range(doc.page_count): for i in range(doc.page_count):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论