
add support for eml file parser (#1768)

### What problem does this PR solve?

Add support for an EML (email) file parser.
#1363

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
tags/v0.10.0
黄腾 committed 1 year ago
ede733e130

api/apps/dataset_api.py (+3 -1)

@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "audio":
            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "email":
+            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
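For readers skimming the dispatch: `doc_parse` matches on the parser name and hands the raw bytes to the matching `rag.app` module. A minimal sketch of the new branch, where the progress printer is a stand-in for the real `doc_parse_callback` defined in `dataset_api.py`:

```python
# Minimal sketch of the new dispatch branch; the callback below is a
# stand-in for the real doc_parse_callback, not the actual implementation.
from functools import partial

from rag.app import email


def parse_eml(binary: bytes, doc_name: str, doc_id: str) -> bool:
    def progress(doc_id, prog=None, msg=""):
        print(f"[{doc_id}] {prog}: {msg}")  # report parsing progress

    email.chunk(doc_name, binary=binary, callback=partial(progress, doc_id))
    return True
```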


api/db/__init__.py (+1 -0)

@@ -85,6 +85,7 @@ class ParserType(StrEnum):
     PICTURE = "picture"
     ONE = "one"
     AUDIO = "audio"
+    EMAIL = "email"
     KG = "knowledge_graph"
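Because `ParserType` is a string-valued enum, members compare equal to their plain string values, which is what lets the task executor index its module table with `ParserType.EMAIL.value` further down. A quick sketch, using `str, Enum` as a stand-in for the project's `StrEnum`:

```python
# Sketch: string-valued enum members behave like plain strings, so a
# parser_id stored as "email" matches ParserType.EMAIL directly.
from enum import Enum


class ParserType(str, Enum):  # stand-in for the project's StrEnum
    AUDIO = "audio"
    EMAIL = "email"
    KG = "knowledge_graph"


assert ParserType.EMAIL == "email"
assert ParserType.EMAIL.value == "email"
```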

api/db/init_data.py (+1 -1)

@@ -122,7 +122,7 @@ def init_llm_factory():
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
     TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
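The `parser_ids` value is a flat comma-separated list of `id:Display Name` pairs, and the appended `email:Email` entry is what surfaces the new parser in tenant settings. A small sketch of how such a string decomposes:

```python
# Sketch: parser_ids is "id:Display Name" pairs joined by commas.
parser_ids = ("naive:General,qa:Q&A,audio:Audio,"
              "knowledge_graph:Knowledge Graph,email:Email")

parsers = dict(pair.split(":", 1) for pair in parser_ids.split(","))
assert parsers["email"] == "Email"
assert parsers["knowledge_graph"] == "Knowledge Graph"
```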

api/settings.py (+1 -1)

@@ -132,7 +132,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
     "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email")
 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)

api/utils/file_utils.py (+1 -1)

@@ -156,7 +156,7 @@ def filename_type(filename):
         return FileType.PDF.value
     if re.match(
-            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
+            r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
         return FileType.DOC.value
     if re.match(
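With the extension added, `.eml` uploads are classified as documents instead of falling through to other file types. A quick check of the extended pattern:

```python
# Quick check: ".eml" now lands in the DOC bucket of filename_type.
import re

DOC_RE = r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$"

for name in ("message.eml", "notes.txt", "scan.pdf"):
    print(name, bool(re.match(DOC_RE, name)))
# message.eml True / notes.txt True / scan.pdf False
```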

deepdoc/parser/__init__.py (+2 -1)

@@ -17,4 +17,5 @@ from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
-from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .txt_parser import RAGFlowTxtParser as TxtParser

deepdoc/parser/html_parser.py (+6 -1)

@@ -30,10 +30,15 @@ class RAGFlowHtmlParser:
         else:
             with open(fnm, "r",encoding=get_encoding(fnm)) as f:
                 txt = f.read()
+        return self.parser_txt(txt)

+    @classmethod
+    def parser_txt(cls, txt):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
         html_doc = readability.Document(txt)
         title = html_doc.title()
         content = html_text.extract_text(html_doc.summary(html_partial=True))
-        txt = f'{title}\n{content}'
+        txt = f"{title}\n{content}"
         sections = txt.split("\n")
         return sections
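Turning the HTML-to-sections conversion into a `parser_txt` classmethod lets other modules (notably the new `rag/app/email.py` below) reuse it without constructing a parser instance. A usage sketch, assuming the `readability-lxml` and `html-text` dependencies the parser already imports are installed:

```python
# Usage sketch for the new classmethod; no instance is needed.
from deepdoc.parser import HtmlParser

html = "<html><head><title>Hello</title></head><body><p>First.</p><p>Second.</p></body></html>"
sections = HtmlParser.parser_txt(html)
print(sections)  # title plus extracted text, split on newlines
```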

deepdoc/parser/txt_parser.py (+42 -0)

@@ -0,0 +1,42 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from rag.nlp import find_codec,num_tokens_from_string
+
+class RAGFlowTxtParser:
+    def __call__(self, fnm, binary=None, chunk_token_num=128):
+        txt = ""
+        if binary:
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding, errors="ignore")
+        else:
+            with open(fnm, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l:
+                        break
+                    txt += l
+        return self.parser_txt(txt, chunk_token_num)
+
+    @classmethod
+    def parser_txt(cls, txt, chunk_token_num=128):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
+        sections = []
+        for sec in txt.split("\n"):
+            if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
+                sections.append((sec[: int(len(sec) / 2)], ""))
+                sections.append((sec[int(len(sec) / 2) :], ""))
+            else:
+                sections.append((sec, ""))
+        return sections
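`parser_txt` splits the text on newlines and, when a line exceeds ten times the chunk token budget, halves it into two sections. A behavior sketch:

```python
# Behavior sketch: short lines pass through as (text, "") tuples; a line
# longer than 10 * chunk_token_num tokens is split into two halves.
from deepdoc.parser import TxtParser

text = "short line\n" + "word " * 4000
sections = TxtParser.parser_txt(text, chunk_token_num=128)
print(len(sections))  # 3: the short line plus the two halves
print(sections[0])    # ('short line', '')
```

Note that the halves are not re-checked, so a single section can still exceed the budget; downstream, `naive_merge` does the final token-based chunk sizing.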

rag/app/email.py (+114 -0)

@@ -0,0 +1,114 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from email import policy
+from email.parser import BytesParser
+from rag.app.naive import chunk as naive_chunk
+import re
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
+from deepdoc.parser import HtmlParser, TxtParser
+from timeit import default_timer as timer
+from rag.settings import cron_logger
+import io
+
+
+def chunk(
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    **kwargs,
+):
+    """
+    Only eml is supported
+    """
+    eng = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config",
+        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
+    )
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    main_res = []
+    attachment_res = []
+
+    if binary:
+        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
+    else:
+        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
+
+    text_txt, html_txt = [], []
+    # get the email header info
+    for header, value in msg.items():
+        text_txt.append(f"{header}: {value}")
+
+    # get the email main info
+    def _add_content(msg, content_type):
+        if content_type == "text/plain":
+            text_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif content_type == "text/html":
+            html_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif "multipart" in content_type:
+            if msg.is_multipart():
+                for part in msg.iter_parts():
+                    _add_content(part, part.get_content_type())
+
+    _add_content(msg, msg.get_content_type())
+
+    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
+        (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
+    ]
+
+    st = timer()
+    chunks = naive_merge(
+        sections,
+        int(parser_config.get("chunk_token_num", 128)),
+        parser_config.get("delimiter", "\n!?。;!?"),
+    )
+
+    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
+    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+    # get the attachment info
+    for part in msg.iter_attachments():
+        content_disposition = part.get("Content-Disposition")
+        if content_disposition:
+            dispositions = content_disposition.strip().split(";")
+            if dispositions[0].lower() == "attachment":
+                filename = part.get_filename()
+                payload = part.get_payload(decode=True)
+                try:
+                    attachment_res.extend(
+                        naive_chunk(filename, payload, callback=callback, **kwargs)
+                    )
+                except Exception:
+                    pass
+
+    return main_res + attachment_res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    chunk(sys.argv[1], callback=dummy)
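The core of `chunk` is `_add_content`, a recursive walk over the MIME tree that collects `text/plain` parts into `text_txt` and `text/html` parts into `html_txt`, recursing into `multipart` containers. A self-contained, stdlib-only sketch of that walk (the rag-specific steps, `naive_merge` and `tokenize_chunks`, are omitted here):

```python
# Stdlib-only sketch of the extraction logic above: build a small multipart
# message, then walk it the way _add_content does.
import io
from email import policy
from email.message import EmailMessage
from email.parser import BytesParser

msg = EmailMessage()
msg["From"] = "alice@example.com"
msg["Subject"] = "Quarterly report"
msg.set_content("Plain-text body.")
msg.add_alternative("<p>HTML body.</p>", subtype="html")

parsed = BytesParser(policy=policy.default).parse(io.BytesIO(bytes(msg)))

text_txt = [f"{h}: {v}" for h, v in parsed.items()]  # header lines first
html_txt = []

def _add_content(part, content_type):
    if content_type == "text/plain":
        text_txt.append(part.get_payload(decode=True).decode(part.get_content_charset()))
    elif content_type == "text/html":
        html_txt.append(part.get_payload(decode=True).decode(part.get_content_charset()))
    elif "multipart" in content_type and part.is_multipart():
        for sub in part.iter_parts():
            _add_content(sub, sub.get_content_type())

_add_content(parsed, parsed.get_content_type())
print(text_txt)  # headers plus the plain-text body
print(html_txt)  # the HTML alternative
```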

rag/app/naive.py (+3 -20)

@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
 from PIL import Image
@@ -170,6 +170,7 @@ class Markdown(MarkdownParser):
         return sections, tbls
+
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -222,25 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
-        sections = []
-        for sec in txt.split("\n"):
-            if num_tokens_from_string(sec) > 10 * int(parser_config.get("chunk_token_num", 128)):
-                sections.append((sec[:int(len(sec)/2)], ""))
-                sections.append((sec[int(len(sec)/2):], ""))
-            else:
-                sections.append((sec, ""))
+        sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128))
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):

rag/svr/task_executor.py (+2 -1)

@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd

-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email

 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -69,6 +69,7 @@ FACTORY = {
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
     ParserType.AUDIO.value: audio,
+    ParserType.EMAIL.value: email,
     ParserType.KG.value: knowledge_graph
 }
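At runtime the executor looks the document's parser id up in `FACTORY` to pick the chunking module. A sketch of the lookup; the fallback shown here is illustrative, not the executor's exact behavior:

```python
# Sketch: a parser_id string such as "email" selects the chunker module.
from api.db import ParserType
from rag.app import email, naive

FACTORY = {
    ParserType.EMAIL.value: email,  # key is the plain string "email"
}

chunker = FACTORY.get("email", naive)  # illustrative fallback to naive
# chunker.chunk(filename, binary=binary, callback=progress_callback)
```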


web/src/components/chunk-method-modal/hooks.ts (+2 -1)

@@ -27,7 +27,7 @@ const ParserListMap = new Map([
       'one',
       'qa',
       'manual',
-      'knowledge_graph',
+      'knowledge_graph'
     ],
   ],
@@ -67,6 +67,7 @@ const ParserListMap = new Map([
   ],
   [['md'], ['naive', 'qa', 'knowledge_graph']],
   [['json'], ['naive', 'knowledge_graph']],
+  [['eml'], ['email']]
 ]);

 const getParserList = (
