| 
                        123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 | 
                        - #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - 
 - from email import policy
 - from email.parser import BytesParser
 - from rag.app.naive import chunk as naive_chunk
 - import re
 - from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
 - from deepdoc.parser import HtmlParser, TxtParser
 - from timeit import default_timer as timer
 - from rag.settings import cron_logger
 - import io
 - 
 - 
 - def chunk(
 -     filename,
 -     binary=None,
 -     from_page=0,
 -     to_page=100000,
 -     lang="Chinese",
 -     callback=None,
 -     **kwargs,
 - ):
 -     """
 -     Only eml is supported
 -     """
 -     eng = lang.lower() == "english"  # is_english(cks)
 -     parser_config = kwargs.get(
 -         "parser_config",
 -         {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
 -     )
 -     doc = {
 -         "docnm_kwd": filename,
 -         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
 -     }
 -     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 -     main_res = []
 -     attachment_res = []
 - 
 -     if binary:
 -         msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
 -     else:
 -         msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
 - 
 -     text_txt, html_txt = [], []
 -     # get the email header info
 -     for header, value in msg.items():
 -         text_txt.append(f"{header}: {value}")
 - 
 -     #  get the email main info
 -     def _add_content(msg, content_type):
 -         if content_type == "text/plain":
 -             text_txt.append(
 -                 msg.get_payload(decode=True).decode(msg.get_content_charset())
 -             )
 -         elif content_type == "text/html":
 -             html_txt.append(
 -                 msg.get_payload(decode=True).decode(msg.get_content_charset())
 -             )
 -         elif "multipart" in content_type:
 -             if msg.is_multipart():
 -                 for part in msg.iter_parts():
 -                     _add_content(part, part.get_content_type())
 - 
 -     _add_content(msg, msg.get_content_type())
 - 
 -     sections = TxtParser.parser_txt("\n".join(text_txt)) + [
 -         (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
 -     ]
 - 
 -     st = timer()
 -     chunks = naive_merge(
 -         sections,
 -         int(parser_config.get("chunk_token_num", 128)),
 -         parser_config.get("delimiter", "\n!?。;!?"),
 -     )
 - 
 -     main_res.extend(tokenize_chunks(chunks, doc, eng, None))
 -     cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
 -     # get the attachment info
 -     for part in msg.iter_attachments():
 -         content_disposition = part.get("Content-Disposition")
 -         if content_disposition:
 -             dispositions = content_disposition.strip().split(";")
 -             if dispositions[0].lower() == "attachment":
 -                 filename = part.get_filename()
 -                 payload = part.get_payload(decode=True)
 -                 try:
 -                     attachment_res.extend(
 -                         naive_chunk(filename, payload, callback=callback, **kwargs)
 -                     )
 -                 except Exception:
 -                     pass
 - 
 -     return main_res + attachment_res
 - 
 - 
 - if __name__ == "__main__":
 -     import sys
 - 
 -     def dummy(prog=None, msg=""):
 -         pass
 - 
 -     chunk(sys.argv[1], callback=dummy)
 
 
  |