提交 9ea4124d 作者: 孙俊华

增加jpeg格式转换,修改流式传输bug,增加文本转换处理器

上级 e4949603
......@@ -412,10 +412,10 @@ async def stream_chat(websocket: WebSocket):
"knowledge_base_id"]
vs_path = get_vs_path(knowledge_base_id)
if not os.path.exists(vs_path):
await websocket.send_json({"error": f"Knowledge base {knowledge_base_id} not found"})
await websocket.close()
return
# if not os.path.exists(vs_path):
# await websocket.send_json({"error": f"Knowledge base {knowledge_base_id} not found"})
# await websocket.close()
# return
await websocket.send_json({"question": question, "turn": turn, "flag": "start"})
......
......@@ -74,7 +74,7 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_T
loader = UnstructuredPaddlePDFLoader(filepath)
textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
docs = loader.load_and_split(textsplitter)
elif filepath.lower().endswith(".jpg") or filepath.lower().endswith(".png"):
elif filepath.lower().endswith(".jpg") or filepath.lower().endswith(".jpeg") or filepath.lower().endswith(".png"):
# 暂且将paddle相关的loader改为动态加载,可以在不上传pdf/image知识文件的前提下使用protobuf=4.x
from loader import UnstructuredPaddleImageLoader
loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
......
from langchain.docstore.document import Document
import feedparser
import html2text
import ssl
import time
class RSS_Url_loader:
    """Load the entries of one or more RSS/Atom feeds as ``Document`` objects.

    Each feed entry becomes one ``Document`` whose ``page_content`` is the
    entry body converted from HTML to text with ``html2text`` and whose
    metadata carries the entry ``title`` and ``link``.
    """

    def __init__(self, urls=None, interval=60):
        """Store the feed URLs and the polling interval.

        :param urls: a single URL string or a list of URL strings. Any other
            type is rejected with a warning and the loader is left with no
            URLs.
        :param interval: value stored on ``self.interval``; not read by any
            method of this class at present.
        """
        # Imported locally so the block does not depend on a file-level import.
        import warnings

        self.urls = []
        self.interval = interval
        if urls is None:
            return
        if isinstance(urls, str):
            urls = [urls]
        elif not isinstance(urls, list):
            # The original built a ``Warning`` instance and discarded it, so
            # the caller never saw anything; emit a real warning instead.
            warnings.warn('urls must be a list or a string.', stacklevel=2)
            return
        self.urls = urls

    # The scheduling code may need to be built on another class; it is not
    # exposed publicly for now.
    def scheduled_execution(self):
        """Load all feeds once and return the documents.

        NOTE(review): the original wrapped this in ``while True`` followed by
        an unreachable ``time.sleep(self.interval)``; it always returned after
        the first load. That behaviour is kept, minus the dead code.
        """
        return self.load()

    def load(self):
        """Fetch every URL in ``self.urls`` and return a list of documents."""
        # SECURITY NOTE(review): this disables HTTPS certificate verification
        # for the whole process, not just for this loader. Kept because
        # existing callers may rely on it, but it should be revisited.
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        documents = []
        for url in self.urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                content = entry.get("content")
                if content:
                    data = content[0].get("value") or ""
                else:
                    # ``.get`` avoids an AttributeError on entries that carry
                    # neither a description nor a summary.
                    data = entry.get("description") or entry.get("summary") or ""
                data = html2text.html2text(data)
                # A missing title or link no longer aborts the whole load.
                metadata = {"title": entry.get("title", ""), "link": entry.get("link", "")}
                documents.append(Document(page_content=data, metadata=metadata))
        return documents
if __name__ == "__main__":
    # The feed URLs should eventually come from the configuration file or
    # from the user interface rather than being hard-coded here.
    feed_urls = ["https://www.zhihu.com/rss", "https://www.36kr.com/feed"]
    for document in RSS_Url_loader(feed_urls).load():
        print(document)
\ No newline at end of file
from .image_loader import UnstructuredPaddleImageLoader
from .pdf_loader import UnstructuredPaddlePDFLoader
from .dialogue import (
Person,
Dialogue,
Turn,
DialogueLoader
)
# Names exported by a star-import of this package. Person, Dialogue and Turn
# are imported above but are not listed here.
__all__ = [
    "UnstructuredPaddleImageLoader",
    "UnstructuredPaddlePDFLoader",
    "DialogueLoader",
]
import json
from abc import ABC
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class Person:
    """A dialogue participant, identified by name, with an age attribute."""

    def __init__(self, name, age):
        """Store ``name`` and ``age`` on the instance unchanged."""
        self.name, self.age = name, age
class Dialogue:
    """
    An ordered list of dialogue turns parsed from a text file.

    The class keeps the path of the source file and the turns read from it,
    and offers helpers to print, export and serialize them.
    """

    def __init__(self, file_path: str):
        """Remember ``file_path``; nothing is read until ``parse_dialogue``."""
        self.file_path = file_path
        self.turns = []

    def add_turn(self, turn):
        """
        Append one turn to the dialogue.

        :param turn: object exposing ``speaker.name`` and ``message``
        :return: None
        """
        self.turns.append(turn)

    def parse_dialogue(self):
        """
        Read ``self.file_path`` and append one turn per speaker/message pair.

        Blank lines are ignored. Of the remaining lines, the first of each
        pair is the speaker line (the text before its first ``:`` is the
        speaker name) and the second is the message. One ``Person`` is created
        per distinct speaker name and shared between that speaker's turns.

        A speaker line without a colon is used as the name in full; the
        original two-value unpacking of ``split(':', 1)`` raised ``ValueError``
        on such a line.
        """
        participants = {}
        speaker_name = None
        with open(self.file_path, encoding='utf-8') as file:
            for raw_line in file:
                line = raw_line.strip()
                if not line:
                    continue
                if speaker_name is None:
                    # First non-blank line of a pair: the speaker.
                    speaker_name = line.partition(':')[0]
                    continue
                # Second non-blank line of a pair: the message.
                if speaker_name not in participants:
                    participants[speaker_name] = Person(speaker_name, None)
                self.add_turn(Turn(participants[speaker_name], line))
                # Reset so the next non-blank line is read as a speaker.
                speaker_name = None

    def display(self):
        """Print every turn as ``<speaker>: <message>``."""
        for turn in self.turns:
            print(f"{turn.speaker.name}: {turn.message}")

    def export_to_file(self, file_path):
        """Write every turn as a ``<speaker>: <message>`` line to ``file_path``."""
        with open(file_path, 'w', encoding='utf-8') as file:
            for turn in self.turns:
                file.write(f"{turn.speaker.name}: {turn.message}\n")

    def to_dict(self):
        """Return ``{"turns": [{"speaker": ..., "message": ...}, ...]}``."""
        return {
            "turns": [
                {"speaker": turn.speaker.name, "message": turn.message}
                for turn in self.turns
            ]
        }

    def to_json(self):
        """Return ``to_dict()`` as indented JSON with non-ASCII text kept as is."""
        dialogue_dict = self.to_dict()
        return json.dumps(dialogue_dict, ensure_ascii=False, indent=2)

    def participants_to_export(self):
        """
        Return the distinct speaker names joined with ``', '``.

        Names appear in order of first appearance. The original joined a
        ``set``, whose iteration order for strings varies between interpreter
        runs, so the resulting string was not reproducible.

        :return: comma-separated speaker names
        """
        ordered_names = dict.fromkeys(turn.speaker.name for turn in self.turns)
        return ', '.join(ordered_names)
class Turn:
    """One utterance in a dialogue: the speaker and the message."""

    def __init__(self, speaker, message):
        """Store ``speaker`` and ``message`` on the instance unchanged."""
        self.speaker, self.message = speaker, message
class DialogueLoader(BaseLoader, ABC):
    """Document loader that turns a dialogue file into one document per turn."""

    def __init__(self, file_path: str):
        """Parse the dialogue at ``file_path`` immediately and keep the result."""
        self.file_path = file_path
        parsed = Dialogue(file_path=file_path)
        parsed.parse_dialogue()
        self.dialogue = parsed

    def load(self) -> List[Document]:
        """Return one ``Document`` per turn of the parsed dialogue.

        The message is the page content. The ``source`` metadata entry is a
        single string holding the file path, the turn's speaker and the
        joined participant names.
        """
        all_participants = self.dialogue.participants_to_export()
        return [
            Document(
                page_content=turn.message,
                metadata={"source": f"Dialogue File:{self.dialogue.file_path},"
                                    f"speaker:{turn.speaker.name},"
                                    f"participant:{all_participants}"},
            )
            for turn in self.dialogue.turns
        ]
"""Loader that loads image files."""
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from paddleocr import PaddleOCR
import os
import nltk
class UnstructuredPaddleImageLoader(UnstructuredFileLoader):
    """Loader that runs PaddleOCR on an image and partitions the recognized text."""

    def _get_elements(self) -> List:
        """OCR ``self.file_path`` into a text file and partition that file.

        :return: the elements produced by ``partition_text`` for the OCR text
        """

        def image_ocr_txt(filepath, dir_path="tmp_files"):
            """Write the OCR text of ``filepath`` to a ``.txt`` file.

            The file is created in a ``dir_path`` sub-directory next to the
            image and is named after the image file with ``.txt`` appended.

            :return: path of the written text file
            """
            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
            # exist_ok avoids failing when another load creates the directory
            # between an existence check and the makedirs call.
            os.makedirs(full_dir_path, exist_ok=True)
            filename = os.path.split(filepath)[-1]
            ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False)
            result = ocr.ocr(img=filepath)
            # NOTE(review): PaddleOCR can report an image with no detected
            # text as None instead of an empty list; skip those entries
            # rather than failing with "NoneType is not iterable".
            ocr_result = [i[1][0] for line in (result or []) if line for i in line]
            txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
            with open(txt_file_path, 'w', encoding='utf-8') as fout:
                fout.write("\n".join(ocr_result))
            return txt_file_path

        txt_file_path = image_ocr_txt(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
if __name__ == "__main__":
    import sys

    # Make the project root importable before pulling in the config module.
    project_root = os.path.dirname(os.path.dirname(__file__))
    sys.path.append(project_root)
    from configs.model_config import NLTK_DATA_PATH

    # Put the configured NLTK data directory first on NLTK's search path.
    nltk.data.path = [NLTK_DATA_PATH, *nltk.data.path]

    sample_image = os.path.join(project_root, "knowledge_base", "samples", "content", "test.jpg")
    for document in UnstructuredPaddleImageLoader(sample_image, mode="elements").load():
        print(document)
"""Loader that loads PDF files."""
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from paddleocr import PaddleOCR
import os
import fitz
import nltk
from configs.model_config import NLTK_DATA_PATH
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
    """Loader that extracts a PDF's text, OCRs its embedded images, and partitions the result."""

    def _get_elements(self) -> List:
        """Dump ``self.file_path`` to a text file and partition that file.

        :return: the elements produced by ``partition_text`` for the dumped text
        """

        def pdf_ocr_txt(filepath, dir_path="tmp_files"):
            """Write the text of every page, plus OCR text of its images, to a file.

            The file is created in a ``dir_path`` sub-directory next to the
            PDF and is named after the PDF file with ``.txt`` appended. Each
            embedded image is saved to a temporary ``tmp.png`` in the same
            directory for OCR.

            :return: path of the written text file
            """
            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
            # exist_ok avoids failing when another load creates the directory
            # between an existence check and the makedirs call.
            os.makedirs(full_dir_path, exist_ok=True)
            ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False)
            doc = fitz.open(filepath)
            txt_file_path = os.path.join(full_dir_path, f"{os.path.split(filepath)[-1]}.txt")
            img_name = os.path.join(full_dir_path, 'tmp.png')
            try:
                with open(txt_file_path, 'w', encoding='utf-8') as fout:
                    for i in range(doc.page_count):
                        page = doc[i]
                        text = page.get_text("")
                        fout.write(text)
                        fout.write("\n")

                        for img in page.get_images():
                            pix = fitz.Pixmap(doc, img[0])
                            # Four or more colour channels: convert to RGB
                            # before saving the image as PNG.
                            if pix.n - pix.alpha >= 4:
                                pix = fitz.Pixmap(fitz.csRGB, pix)
                            pix.save(img_name)

                            result = ocr.ocr(img_name)
                            # NOTE(review): PaddleOCR can report an image with
                            # no detected text as None; skip those entries.
                            ocr_result = [i[1][0] for line in (result or []) if line for i in line]
                            if ocr_result:
                                fout.write("\n".join(ocr_result))
                                # Without this newline the last OCR line ran
                                # straight into the next image's or page's text.
                                fout.write("\n")
            finally:
                # Release the PDF handle and the scratch image even when
                # extraction fails part-way through.
                doc.close()
                if os.path.exists(img_name):
                    os.remove(img_name)
            return txt_file_path

        txt_file_path = pdf_ocr_txt(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
if __name__ == "__main__":
    import sys

    # Make the project root importable, then run the loader on the sample PDF.
    project_root = os.path.dirname(os.path.dirname(__file__))
    sys.path.append(project_root)
    sample_pdf = os.path.join(project_root, "knowledge_base", "samples", "content", "test.pdf")
    for document in UnstructuredPaddlePDFLoader(sample_pdf, mode="elements").load():
        print(document)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论