增加jpeg格式转换，修改流式传输bug，增加文本转换处理器

9ea4124d · 孙俊华 · e4949603 · 9ea4124d · 9ea4124d · 9ea4124d
--- a/api.py
+++ b/api.py
@@ -412,10 +412,10 @@ async def stream_chat(websocket: WebSocket):
            "knowledge_base_id"]
        vs_path = get_vs_path(knowledge_base_id)

-        if not os.path.exists(vs_path):
-            await websocket.send_json({"error": f"Knowledge base {knowledge_base_id} not found"})
-            await websocket.close()
-            return
+        # if not os.path.exists(vs_path):
+        #     await websocket.send_json({"error": f"Knowledge base {knowledge_base_id} not found"})
+        #     await websocket.close()
+        #     return

        await websocket.send_json({"question": question, "turn": turn, "flag": "start"})


--- a/chains/local_doc_qa.py
+++ b/chains/local_doc_qa.py
@@ -74,7 +74,7 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_T
        loader = UnstructuredPaddlePDFLoader(filepath)
        textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
        docs = loader.load_and_split(textsplitter)
-    elif filepath.lower().endswith(".jpg") or filepath.lower().endswith(".png"):
+    elif filepath.lower().endswith(".jpg") or filepath.lower().endswith(".jpeg") or filepath.lower().endswith(".png"):
        # 暂且将paddle相关的loader改为动态加载，可以在不上传pdf/image知识文件的前提下使用protobuf=4.x
        from loader import UnstructuredPaddleImageLoader
        loader = UnstructuredPaddleImageLoader(filepath, mode="elements")

--- a/loader/RSS_loader.py
+++ b/loader/RSS_loader.py
+from langchain.docstore.document import Document
+import feedparser
+import html2text
+import ssl
+import time
+
+
+class RSS_Url_loader:
+    """Turn the entries of one or more RSS/Atom feeds into ``Document`` objects.
+
+    Feeds are fetched and parsed with ``feedparser``; every entry body is run
+    through ``html2text.html2text`` and paired with the entry title and link
+    as metadata.
+    """
+
+    def __init__(self, urls=None, interval=60):
+        """Remember which feeds to read.
+
+        :param urls: one feed URL as a string, or a list of feed URLs. A value
+            of any other type is rejected with a warning and the loader is
+            left without URLs.
+        :param interval: stored as ``self.interval``; nothing in this class
+            waits on it yet.
+        """
+        self.urls = []
+        self.interval = interval
+        if urls is None:
+            return
+        if isinstance(urls, str):
+            self.urls = [urls]
+        elif isinstance(urls, list):
+            self.urls = urls
+        else:
+            # Best effort: an unsupported type must not abort construction,
+            # but the caller has to be told that the loader stays empty.
+            import warnings
+            warnings.warn('urls must be a list or a string.')
+
+    # Periodic polling may need a dedicated scheduler class, so it is not
+    # exposed publicly for now.
+    def scheduled_execution(self):
+        """Run a single ``load()`` and return its documents.
+
+        No periodic re-loading happens here: the method returns right after
+        the first load and ``self.interval`` is not consulted.
+        """
+        return self.load()
+
+    def load(self):
+        """Fetch every URL in ``self.urls`` and return a list of ``Document``.
+
+        For each entry the body is taken from ``entry.content[0].value`` when
+        the entry has a ``content`` field, otherwise from its description or
+        summary (empty string when neither is present). Missing titles or
+        links become empty strings in the metadata.
+        """
+        # NOTE(review): this swaps the ssl module's default HTTPS context
+        # factory for the unverified one, which disables certificate checks
+        # for the whole process, not only for these feeds.
+        if hasattr(ssl, '_create_unverified_context'):
+            ssl._create_default_https_context = ssl._create_unverified_context
+        documents = []
+        for url in self.urls:
+            parsed = feedparser.parse(url)
+            for entry in parsed.entries:
+                if "content" in entry:
+                    data = entry.content[0].value
+                else:
+                    # .get() avoids the AttributeError that attribute access
+                    # raises on entries lacking both fields.
+                    data = entry.get("description") or entry.get("summary") or ""
+                data = html2text.html2text(data)
+                metadata = {"title": entry.get("title", ""), "link": entry.get("link", "")}
+                documents.append(Document(page_content=data, metadata=metadata))
+        return documents
+
+if __name__ == "__main__":
+    # The feed URLs still have to come from the config file or from the user interface.
+    feed_urls = ["https://www.zhihu.com/rss", "https://www.36kr.com/feed"]
+    for document in RSS_Url_loader(feed_urls).load():
+        print(document)
\ No newline at end of file
--- a/loader/__init__.py
+++ b/loader/__init__.py
+from .image_loader import UnstructuredPaddleImageLoader
+from .pdf_loader import UnstructuredPaddlePDFLoader
+from .dialogue import (
+    Person,
+    Dialogue,
+    Turn,
+    DialogueLoader
+)
+
+__all__ = [
+    "UnstructuredPaddleImageLoader",
+    "UnstructuredPaddlePDFLoader",
+    "DialogueLoader",
+]
--- a/loader/dialogue.py
+++ b/loader/dialogue.py
+import json
+from abc import ABC
+from typing import List
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class Person:
+    """A dialogue participant identified by name, with an optional age."""
+
+    def __init__(self, name, age):
+        """Store ``name`` and ``age`` on the instance unchanged."""
+        self.name, self.age = name, age
+
+
+class Dialogue:
+    """
+    In-memory model of a dialogue transcript: the path of the source file plus
+    the ordered list of turns parsed from it.
+    """
+
+    def __init__(self, file_path: str):
+        self.file_path = file_path
+        self.turns = []
+
+    def add_turn(self, turn):
+        """
+        Append one turn to the end of the dialogue.
+        :param turn: object exposing ``speaker`` and ``message`` attributes
+        :return: None
+        """
+        self.turns.append(turn)
+
+    def parse_dialogue(self):
+        r"""
+        Read ``self.file_path`` as UTF-8 and append one Turn per speaker/message pair.
+
+        Blank lines are skipped; the remaining lines are consumed in pairs.
+        The first line of a pair names the speaker (the text before the first
+        ``:``, or the whole line when it contains no colon), the second line
+        is the message. One Person is created per distinct speaker name and
+        shared by all turns of that speaker. A trailing speaker line without
+        a message produces no turn.
+
+        Expected file layout: <speaker>:\r\n<message>\r\n\r\n. Files in another
+        format or with extra metadata need different parsing logic.
+        """
+        participants = {}
+        speaker_name = None
+
+        with open(self.file_path, encoding='utf-8') as file:
+            for raw_line in file:
+                line = raw_line.strip()
+                if not line:
+                    continue
+
+                if speaker_name is None:
+                    # partition() cannot fail on a line without a colon,
+                    # unlike unpacking the result of split(':', 1).
+                    speaker_name = line.partition(':')[0]
+                    continue
+
+                if speaker_name not in participants:
+                    participants[speaker_name] = Person(speaker_name, None)
+                self.add_turn(Turn(participants[speaker_name], line))
+
+                # The next non-blank line starts a new speaker/message pair.
+                speaker_name = None
+
+    def display(self):
+        """Print every turn as ``<speaker name>: <message>``."""
+        for turn in self.turns:
+            print(f"{turn.speaker.name}: {turn.message}")
+
+    def export_to_file(self, file_path):
+        """Write every turn as one ``<speaker name>: <message>`` line to ``file_path`` (UTF-8)."""
+        with open(file_path, 'w', encoding='utf-8') as file:
+            for turn in self.turns:
+                file.write(f"{turn.speaker.name}: {turn.message}\n")
+
+    def to_dict(self):
+        """Return ``{"turns": [{"speaker": name, "message": text}, ...]}`` in turn order."""
+        return {
+            "turns": [
+                {"speaker": turn.speaker.name, "message": turn.message}
+                for turn in self.turns
+            ]
+        }
+
+    def to_json(self):
+        """Return ``to_dict()`` serialized as indented JSON with non-ASCII text kept as is."""
+        return json.dumps(self.to_dict(), ensure_ascii=False, indent=2)
+
+    def participants_to_export(self):
+        """
+        Build the comma-separated list of distinct speaker names.
+        :return: names joined with ``', '`` in order of first appearance
+        """
+        # dict.fromkeys de-duplicates while keeping insertion order, so the
+        # result is the same on every run (joining a set is not).
+        ordered_names = dict.fromkeys(turn.speaker.name for turn in self.turns)
+        return ', '.join(ordered_names)
+
+
+class Turn:
+    """One utterance of a dialogue: who spoke and what was said."""
+
+    def __init__(self, speaker, message):
+        """Store ``speaker`` and ``message`` on the instance unchanged."""
+        self.speaker, self.message = speaker, message
+
+
+class DialogueLoader(BaseLoader, ABC):
+    """Document loader that emits one document per turn of a dialogue file."""
+
+    def __init__(self, file_path: str):
+        """Parse the dialogue stored at ``file_path`` immediately and keep the result."""
+        self.file_path = file_path
+        parsed = Dialogue(file_path=file_path)
+        parsed.parse_dialogue()
+        self.dialogue = parsed
+
+    def load(self) -> List[Document]:
+        """Return one ``Document`` per parsed turn.
+
+        The turn message becomes the page content; the ``source`` metadata
+        string names the dialogue file, the turn's speaker and all participants.
+        """
+        everyone = self.dialogue.participants_to_export()
+        origin = self.dialogue.file_path
+        return [
+            Document(
+                page_content=turn.message,
+                metadata={"source": f"Dialogue File：{origin},"
+                                    f"speaker：{turn.speaker.name}，"
+                                    f"participant：{everyone}"},
+            )
+            for turn in self.dialogue.turns
+        ]
--- a/loader/image_loader.py
+++ b/loader/image_loader.py
+"""Loader that loads image files."""
+from typing import List
+
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
+from paddleocr import PaddleOCR
+import os
+import nltk
+
+class UnstructuredPaddleImageLoader(UnstructuredFileLoader):
+    """Loader that OCRs an image file (such as a PNG or JPG) with PaddleOCR and partitions the recognized text with unstructured."""
+
+    def _get_elements(self) -> List:
+        """OCR ``self.file_path`` and return the unstructured elements of the recognized text."""
+        def image_ocr_txt(filepath, dir_path="tmp_files"):
+            """Write the OCR text of ``filepath`` to ``<image dir>/<dir_path>/<image name>.txt`` and return that path."""
+            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
+            # exist_ok avoids the check-then-create race of a separate exists() test.
+            os.makedirs(full_dir_path, exist_ok=True)
+            filename = os.path.split(filepath)[-1]
+            ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False)
+            result = ocr.ocr(img=filepath)
+
+            # Each recognized item is indexed as item[1][0] for its text.
+            # NOTE(review): some PaddleOCR versions appear to return None (or a
+            # None entry) for an image without text - confirm; skipping falsy
+            # values keeps that case from raising TypeError.
+            ocr_result = [i[1][0] for line in (result or []) if line for i in line]
+            txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
+            with open(txt_file_path, 'w', encoding='utf-8') as fout:
+                fout.write("\n".join(ocr_result))
+            return txt_file_path
+
+        txt_file_path = image_ocr_txt(self.file_path)
+        from unstructured.partition.text import partition_text
+        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
+      
+      
+if __name__ == "__main__":
+    import sys
+    # Make the project root importable so that ``configs`` resolves when run as a script.
+    project_root = os.path.dirname(os.path.dirname(__file__))
+    sys.path.append(project_root)
+
+    from configs.model_config import NLTK_DATA_PATH
+    nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+
+    sample_image = os.path.join(project_root, "knowledge_base", "samples", "content", "test.jpg")
+    for document in UnstructuredPaddleImageLoader(sample_image, mode="elements").load():
+        print(document)
--- a/loader/pdf_loader.py
+++ b/loader/pdf_loader.py
+"""Loader that loads image files."""
+from typing import List
+
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
+from paddleocr import PaddleOCR
+import os
+import fitz
+import nltk
+from configs.model_config import NLTK_DATA_PATH
+
+nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
+
+class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
+    """Loader that extracts a PDF's page text plus the OCR text of its embedded images, then partitions the result with unstructured."""
+
+    def _get_elements(self) -> List:
+        """Convert ``self.file_path`` to a text file and return the unstructured elements of that text."""
+        def pdf_ocr_txt(filepath, dir_path="tmp_files"):
+            """Write page text and image OCR text of ``filepath`` to ``<pdf dir>/<dir_path>/<pdf name>.txt`` and return that path."""
+            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
+            # exist_ok avoids the check-then-create race of a separate exists() test.
+            os.makedirs(full_dir_path, exist_ok=True)
+            ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False)
+            txt_file_path = os.path.join(full_dir_path, f"{os.path.split(filepath)[-1]}.txt")
+            # Every embedded image is dumped to this one scratch file before OCR.
+            img_name = os.path.join(full_dir_path, 'tmp.png')
+            doc = fitz.open(filepath)
+            try:
+                with open(txt_file_path, 'w', encoding='utf-8') as fout:
+                    for page in doc:
+                        fout.write(page.get_text(""))
+                        fout.write("\n")
+
+                        for img in page.get_images():
+                            pix = fitz.Pixmap(doc, img[0])
+                            # Presumably a CMYK-like pixmap (4+ colour
+                            # components): convert to RGB before saving.
+                            if pix.n - pix.alpha >= 4:
+                                pix = fitz.Pixmap(fitz.csRGB, pix)
+                            pix.save(img_name)
+
+                            result = ocr.ocr(img_name)
+                            # NOTE(review): some PaddleOCR versions appear to
+                            # return None for an image without text - confirm;
+                            # falsy values are skipped to avoid a TypeError.
+                            ocr_result = [i[1][0] for line in (result or []) if line for i in line]
+                            if ocr_result:
+                                fout.write("\n".join(ocr_result))
+                                # Keep the last OCR line from fusing with the
+                                # text that follows it.
+                                fout.write("\n")
+            finally:
+                # Release the PDF handle and the scratch image even on failure.
+                doc.close()
+                if os.path.exists(img_name):
+                    os.remove(img_name)
+            return txt_file_path
+
+        txt_file_path = pdf_ocr_txt(self.file_path)
+        from unstructured.partition.text import partition_text
+        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
+
+
+if __name__ == "__main__":
+    import sys
+    # Put the project root on the import path for script runs.
+    project_root = os.path.dirname(os.path.dirname(__file__))
+    sys.path.append(project_root)
+    sample_pdf = os.path.join(project_root, "knowledge_base", "samples", "content", "test.pdf")
+    for document in UnstructuredPaddlePDFLoader(sample_pdf, mode="elements").load():
+        print(document)