You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import logging
  17. from email import policy
  18. from email.parser import BytesParser
  19. from rag.app.naive import chunk as naive_chunk
  20. import re
  21. from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
  22. from deepdoc.parser import HtmlParser, TxtParser
  23. from timeit import default_timer as timer
  24. import io
  25. def chunk(
  26. filename,
  27. binary=None,
  28. from_page=0,
  29. to_page=100000,
  30. lang="Chinese",
  31. callback=None,
  32. **kwargs,
  33. ):
  34. """
  35. Only eml is supported
  36. """
  37. eng = lang.lower() == "english" # is_english(cks)
  38. parser_config = kwargs.get(
  39. "parser_config",
  40. {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
  41. )
  42. doc = {
  43. "docnm_kwd": filename,
  44. "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
  45. }
  46. doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  47. main_res = []
  48. attachment_res = []
  49. if binary:
  50. msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
  51. else:
  52. msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
  53. text_txt, html_txt = [], []
  54. # get the email header info
  55. for header, value in msg.items():
  56. text_txt.append(f"{header}: {value}")
  57. # get the email main info
  58. def _add_content(msg, content_type):
  59. if content_type == "text/plain":
  60. text_txt.append(
  61. msg.get_payload(decode=True).decode(msg.get_content_charset())
  62. )
  63. elif content_type == "text/html":
  64. html_txt.append(
  65. msg.get_payload(decode=True).decode(msg.get_content_charset())
  66. )
  67. elif "multipart" in content_type:
  68. if msg.is_multipart():
  69. for part in msg.iter_parts():
  70. _add_content(part, part.get_content_type())
  71. _add_content(msg, msg.get_content_type())
  72. sections = TxtParser.parser_txt("\n".join(text_txt)) + [
  73. (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
  74. ]
  75. st = timer()
  76. chunks = naive_merge(
  77. sections,
  78. int(parser_config.get("chunk_token_num", 128)),
  79. parser_config.get("delimiter", "\n!?。;!?"),
  80. )
  81. main_res.extend(tokenize_chunks(chunks, doc, eng, None))
  82. logging.debug("naive_merge({}): {}".format(filename, timer() - st))
  83. # get the attachment info
  84. for part in msg.iter_attachments():
  85. content_disposition = part.get("Content-Disposition")
  86. if content_disposition:
  87. dispositions = content_disposition.strip().split(";")
  88. if dispositions[0].lower() == "attachment":
  89. filename = part.get_filename()
  90. payload = part.get_payload(decode=True)
  91. try:
  92. attachment_res.extend(
  93. naive_chunk(filename, payload, callback=callback, **kwargs)
  94. )
  95. except Exception:
  96. pass
  97. return main_res + attachment_res
  98. if __name__ == "__main__":
  99. import sys
  100. def dummy(prog=None, msg=""):
  101. pass
  102. chunk(sys.argv[1], callback=dummy)