
add support for eml file parser (#1768)

### What problem does this PR solve?

Add support for an EML (email) file parser.
#1363

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
tags/v0.10.0
黄腾 committed 1 year ago
ede733e130

api/apps/dataset_api.py (+3 -1)

@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "audio":
            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "email":
+            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
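For readers skimming the dispatch: `doc_parse` matches on the parser name and hands the raw bytes to the matching `rag.app` module. A minimal sketch of the new branch, where the progress printer is a stand-in for the real `doc_parse_callback` defined in `dataset_api.py`:

```python
# Minimal sketch of the new dispatch branch; the callback below is a
# stand-in for the real doc_parse_callback, not the actual implementation.
from functools import partial

from rag.app import email


def parse_eml(binary: bytes, doc_name: str, doc_id: str) -> bool:
    def progress(doc_id, prog=None, msg=""):
        print(f"[{doc_id}] {prog}: {msg}")  # report parsing progress

    email.chunk(doc_name, binary=binary, callback=partial(progress, doc_id))
    return True
```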


api/db/__init__.py (+1 -0)

@@ -85,6 +85,7 @@ class ParserType(StrEnum):
     PICTURE = "picture"
     ONE = "one"
     AUDIO = "audio"
+    EMAIL = "email"
     KG = "knowledge_graph"
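Because `ParserType` is a string-valued enum, members compare equal to their plain string values, which is what lets the task executor index its module table with `ParserType.EMAIL.value` further down. A quick sketch, using `str, Enum` as a stand-in for the project's `StrEnum`:

```python
# Sketch: string-valued enum members behave like plain strings, so a
# parser_id stored as "email" matches ParserType.EMAIL directly.
from enum import Enum


class ParserType(str, Enum):  # stand-in for the project's StrEnum
    AUDIO = "audio"
    EMAIL = "email"
    KG = "knowledge_graph"


assert ParserType.EMAIL == "email"
assert ParserType.EMAIL.value == "email"
```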

api/db/init_data.py (+1 -1)

@@ -122,7 +122,7 @@ def init_llm_factory():
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
     TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
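The `parser_ids` value is a flat comma-separated list of `id:Display Name` pairs, and the appended `email:Email` entry is what surfaces the new parser in tenant settings. A small sketch of how such a string decomposes:

```python
# Sketch: parser_ids is "id:Display Name" pairs joined by commas.
parser_ids = ("naive:General,qa:Q&A,audio:Audio,"
              "knowledge_graph:Knowledge Graph,email:Email")

parsers = dict(pair.split(":", 1) for pair in parser_ids.split(","))
assert parsers["email"] == "Email"
assert parsers["knowledge_graph"] == "Knowledge Graph"
```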

api/settings.py (+1 -1)

@@ -132,7 +132,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
     "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email")
 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)

api/utils/file_utils.py (+1 -1)

@@ -156,7 +156,7 @@ def filename_type(filename):
         return FileType.PDF.value
     if re.match(
-            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
+            r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
         return FileType.DOC.value
     if re.match(
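With the extension added, `.eml` uploads are classified as documents instead of falling through to other file types. A quick check of the extended pattern:

```python
# Quick check: ".eml" now lands in the DOC bucket of filename_type.
import re

DOC_RE = r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$"

for name in ("message.eml", "notes.txt", "scan.pdf"):
    print(name, bool(re.match(DOC_RE, name)))
# message.eml True / notes.txt True / scan.pdf False
```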

deepdoc/parser/__init__.py (+2 -1)

@@ -17,4 +17,5 @@ from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
-from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .txt_parser import RAGFlowTxtParser as TxtParser

deepdoc/parser/html_parser.py (+6 -1)

@@ -30,10 +30,15 @@ class RAGFlowHtmlParser:
         else:
             with open(fnm, "r",encoding=get_encoding(fnm)) as f:
                 txt = f.read()
+        return self.parser_txt(txt)

+    @classmethod
+    def parser_txt(cls, txt):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
         html_doc = readability.Document(txt)
         title = html_doc.title()
         content = html_text.extract_text(html_doc.summary(html_partial=True))
-        txt = f'{title}\n{content}'
+        txt = f"{title}\n{content}"
         sections = txt.split("\n")
         return sections
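Turning the HTML-to-sections conversion into a `parser_txt` classmethod lets other modules (notably the new `rag/app/email.py` below) reuse it without constructing a parser instance. A usage sketch, assuming the `readability-lxml` and `html-text` dependencies the parser already imports are installed:

```python
# Usage sketch for the new classmethod; no instance is needed.
from deepdoc.parser import HtmlParser

html = "<html><head><title>Hello</title></head><body><p>First.</p><p>Second.</p></body></html>"
sections = HtmlParser.parser_txt(html)
print(sections)  # title plus extracted text, split on newlines
```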

deepdoc/parser/txt_parser.py (+42 -0)

@@ -0,0 +1,42 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from rag.nlp import find_codec,num_tokens_from_string
+
+class RAGFlowTxtParser:
+    def __call__(self, fnm, binary=None, chunk_token_num=128):
+        txt = ""
+        if binary:
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding, errors="ignore")
+        else:
+            with open(fnm, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l:
+                        break
+                    txt += l
+        return self.parser_txt(txt, chunk_token_num)
+
+    @classmethod
+    def parser_txt(cls, txt, chunk_token_num=128):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
+        sections = []
+        for sec in txt.split("\n"):
+            if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
+                sections.append((sec[: int(len(sec) / 2)], ""))
+                sections.append((sec[int(len(sec) / 2) :], ""))
+            else:
+                sections.append((sec, ""))
+        return sections
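`parser_txt` splits the text on newlines and, when a line exceeds ten times the chunk token budget, halves it into two sections. A behavior sketch:

```python
# Behavior sketch: short lines pass through as (text, "") tuples; a line
# longer than 10 * chunk_token_num tokens is split into two halves.
from deepdoc.parser import TxtParser

text = "short line\n" + "word " * 4000
sections = TxtParser.parser_txt(text, chunk_token_num=128)
print(len(sections))  # 3: the short line plus the two halves
print(sections[0])    # ('short line', '')
```

Note that the halves are not re-checked, so a single section can still exceed the budget; downstream, `naive_merge` does the final token-based chunk sizing.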

rag/app/email.py (+114 -0)

@@ -0,0 +1,114 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from email import policy
+from email.parser import BytesParser
+from rag.app.naive import chunk as naive_chunk
+import re
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
+from deepdoc.parser import HtmlParser, TxtParser
+from timeit import default_timer as timer
+from rag.settings import cron_logger
+import io
+
+
+def chunk(
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    **kwargs,
+):
+    """
+    Only eml is supported
+    """
+    eng = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config",
+        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
+    )
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    main_res = []
+    attachment_res = []
+
+    if binary:
+        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
+    else:
+        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
+
+    text_txt, html_txt = [], []
+    # get the email header info
+    for header, value in msg.items():
+        text_txt.append(f"{header}: {value}")
+
+    # get the email main info
+    def _add_content(msg, content_type):
+        if content_type == "text/plain":
+            text_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif content_type == "text/html":
+            html_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif "multipart" in content_type:
+            if msg.is_multipart():
+                for part in msg.iter_parts():
+                    _add_content(part, part.get_content_type())
+
+    _add_content(msg, msg.get_content_type())
+
+    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
+        (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
+    ]
+
+    st = timer()
+    chunks = naive_merge(
+        sections,
+        int(parser_config.get("chunk_token_num", 128)),
+        parser_config.get("delimiter", "\n!?。;!?"),
+    )
+
+    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
+    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+    # get the attachment info
+    for part in msg.iter_attachments():
+        content_disposition = part.get("Content-Disposition")
+        if content_disposition:
+            dispositions = content_disposition.strip().split(";")
+            if dispositions[0].lower() == "attachment":
+                filename = part.get_filename()
+                payload = part.get_payload(decode=True)
+                try:
+                    attachment_res.extend(
+                        naive_chunk(filename, payload, callback=callback, **kwargs)
+                    )
+                except Exception:
+                    pass
+
+    return main_res + attachment_res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    chunk(sys.argv[1], callback=dummy)
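The core of `chunk` is `_add_content`, a recursive walk over the MIME tree that collects `text/plain` parts into `text_txt` and `text/html` parts into `html_txt`, recursing into `multipart` containers. A self-contained, stdlib-only sketch of that walk (the rag-specific steps, `naive_merge` and `tokenize_chunks`, are omitted here):

```python
# Stdlib-only sketch of the extraction logic above: build a small multipart
# message, then walk it the way _add_content does.
import io
from email import policy
from email.message import EmailMessage
from email.parser import BytesParser

msg = EmailMessage()
msg["From"] = "alice@example.com"
msg["Subject"] = "Quarterly report"
msg.set_content("Plain-text body.")
msg.add_alternative("<p>HTML body.</p>", subtype="html")

parsed = BytesParser(policy=policy.default).parse(io.BytesIO(bytes(msg)))

text_txt = [f"{h}: {v}" for h, v in parsed.items()]  # header lines first
html_txt = []

def _add_content(part, content_type):
    if content_type == "text/plain":
        text_txt.append(part.get_payload(decode=True).decode(part.get_content_charset()))
    elif content_type == "text/html":
        html_txt.append(part.get_payload(decode=True).decode(part.get_content_charset()))
    elif "multipart" in content_type and part.is_multipart():
        for sub in part.iter_parts():
            _add_content(sub, sub.get_content_type())

_add_content(parsed, parsed.get_content_type())
print(text_txt)  # headers plus the plain-text body
print(html_txt)  # the HTML alternative
```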

rag/app/naive.py (+3 -20)

@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
 from PIL import Image
@@ -170,6 +170,7 @@ class Markdown(MarkdownParser):
         return sections, tbls
+
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -222,25 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
-        sections = []
-        for sec in txt.split("\n"):
-            if num_tokens_from_string(sec) > 10 * int(parser_config.get("chunk_token_num", 128)):
-                sections.append((sec[:int(len(sec)/2)], ""))
-                sections.append((sec[int(len(sec)/2):], ""))
-            else:
-                sections.append((sec, ""))
+        sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128))
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):

rag/svr/task_executor.py (+2 -1)

@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd

-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email

 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -69,6 +69,7 @@ FACTORY = {
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
     ParserType.AUDIO.value: audio,
+    ParserType.EMAIL.value: email,
     ParserType.KG.value: knowledge_graph
 }
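At runtime the executor looks the document's parser id up in `FACTORY` to pick the chunking module. A sketch of the lookup; the fallback shown here is illustrative, not the executor's exact behavior:

```python
# Sketch: a parser_id string such as "email" selects the chunker module.
from api.db import ParserType
from rag.app import email, naive

FACTORY = {
    ParserType.EMAIL.value: email,  # key is the plain string "email"
}

chunker = FACTORY.get("email", naive)  # illustrative fallback to naive
# chunker.chunk(filename, binary=binary, callback=progress_callback)
```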


web/src/components/chunk-method-modal/hooks.ts (+2 -1)

@@ -27,7 +27,7 @@ const ParserListMap = new Map([
       'one',
       'qa',
       'manual',
-      'knowledge_graph',
+      'knowledge_graph'
     ],
   ],
@@ -67,6 +67,7 @@ const ParserListMap = new Map([
   ],
   [['md'], ['naive', 'qa', 'knowledge_graph']],
   [['json'], ['naive', 'knowledge_graph']],
+  [['eml'], ['email']]
 ]);

 const getParserList = (
