Unverified 提交 f7e120fe 作者: Ding Junyao 提交者: GitHub

add tree func for reading files in dirs (#471)

上级 4295f606
...@@ -20,6 +20,29 @@ from agent import bing_search ...@@ -20,6 +20,29 @@ from agent import bing_search
from langchain.docstore.document import Document from langchain.docstore.document import Document
def tree(filepath, ignore_dir_names=None, ignore_file_names=None):
    """Recursively list the files under ``filepath``.

    Args:
        filepath: Path of a file or directory, as a ``str``. Any other type
            yields two empty lists.
        ignore_dir_names: Directory base names to skip (the root directory
            itself included). ``None`` means nothing is skipped.
        ignore_file_names: File base names to skip. ``None`` means nothing
            is skipped.

    Returns:
        A ``(paths, names)`` tuple of parallel lists: the full path of every
        file found and the matching base name. Directory entries are visited
        in sorted order so the result is deterministic. ``(None, None)`` is
        returned when ``filepath`` does not exist.
    """
    if ignore_dir_names is None:
        ignore_dir_names = []
    if ignore_file_names is None:
        ignore_file_names = []

    ret_list = []
    if isinstance(filepath, str):
        if not os.path.exists(filepath):
            print("路径不存在")
            return None, None

        # normpath strips a trailing separator; without it basename("docs/")
        # is "" and the root directory could never match ignore_dir_names.
        root_name = os.path.basename(os.path.normpath(filepath))

        if os.path.isfile(filepath):
            if root_name not in ignore_file_names:
                return [filepath], [root_name]
        elif os.path.isdir(filepath) and root_name not in ignore_dir_names:
            # os.listdir gives no ordering guarantee; sort for stable output.
            for entry in sorted(os.listdir(filepath)):
                fullfilepath = os.path.join(filepath, entry)
                if os.path.isfile(fullfilepath):
                    if entry not in ignore_file_names:
                        ret_list.append(fullfilepath)
                elif os.path.isdir(fullfilepath) and entry not in ignore_dir_names:
                    sub_paths, _ = tree(fullfilepath, ignore_dir_names, ignore_file_names)
                    # sub_paths is None if the directory disappeared after it
                    # was listed; skip it instead of crashing in extend().
                    if sub_paths:
                        ret_list.extend(sub_paths)

    return ret_list, [os.path.basename(p) for p in ret_list]
def load_file(filepath, sentence_size=SENTENCE_SIZE): def load_file(filepath, sentence_size=SENTENCE_SIZE):
if filepath.lower().endswith(".md"): if filepath.lower().endswith(".md"):
loader = UnstructuredFileLoader(filepath, mode="elements") loader = UnstructuredFileLoader(filepath, mode="elements")
...@@ -189,8 +212,7 @@ class LocalDocQA: ...@@ -189,8 +212,7 @@ class LocalDocQA:
return None return None
elif os.path.isdir(filepath): elif os.path.isdir(filepath):
docs = [] docs = []
for file in tqdm(os.listdir(filepath), desc="加载文件"): for fullfilepath, file in tqdm(zip(*tree(filepath, ignore_dir_names=['tmp_files'])), desc="加载文件"):
fullfilepath = os.path.join(filepath, file)
try: try:
docs += load_file(fullfilepath, sentence_size) docs += load_file(fullfilepath, sentence_size)
loaded_files.append(fullfilepath) loaded_files.append(fullfilepath)
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论