提交 6d152372 作者: imClumsyPanda

update pdf_loader.py

上级 e8a37ff4
...@@ -15,13 +15,11 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader): ...@@ -15,13 +15,11 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
full_dir_path = os.path.join(os.path.dirname(filepath), dir_path) full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
if not os.path.exists(full_dir_path): if not os.path.exists(full_dir_path):
os.makedirs(full_dir_path) os.makedirs(full_dir_path)
filename = os.path.split(filepath)[-1]
ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False) ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
doc = fitz.open(filepath) doc = fitz.open(filepath)
txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename)) txt_file_path = os.path.join(full_dir_path, f"{os.path.split(filepath)[-1]}.txt")
img_name = os.path.join(full_dir_path, 'tmp.png') img_name = os.path.join(full_dir_path, 'tmp.png')
with open(txt_file_path, 'w', encoding='utf-8') as fout: with open(txt_file_path, 'w', encoding='utf-8') as fout:
for i in range(doc.page_count): for i in range(doc.page_count):
page = doc[i] page = doc[i]
text = page.get_text("") text = page.get_text("")
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论