Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

email.py 3.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. # Licensed under the Apache License, Version 2.0 (the "License");
  2. # you may not use this file except in compliance with the License.
  3. # You may obtain a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. import logging
  14. from email import policy
  15. from email.parser import BytesParser
  16. from rag.app.naive import chunk as naive_chunk
  17. import re
  18. from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
  19. from deepdoc.parser import HtmlParser, TxtParser
  20. from timeit import default_timer as timer
  21. import io
  22. def chunk(
  23. filename,
  24. binary=None,
  25. from_page=0,
  26. to_page=100000,
  27. lang="Chinese",
  28. callback=None,
  29. **kwargs,
  30. ):
  31. """
  32. Only eml is supported
  33. """
  34. eng = lang.lower() == "english" # is_english(cks)
  35. parser_config = kwargs.get(
  36. "parser_config",
  37. {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
  38. )
  39. doc = {
  40. "docnm_kwd": filename,
  41. "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
  42. }
  43. doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  44. main_res = []
  45. attachment_res = []
  46. if binary:
  47. msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
  48. else:
  49. msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
  50. text_txt, html_txt = [], []
  51. # get the email header info
  52. for header, value in msg.items():
  53. text_txt.append(f"{header}: {value}")
  54. # get the email main info
  55. def _add_content(msg, content_type):
  56. if content_type == "text/plain":
  57. text_txt.append(
  58. msg.get_payload(decode=True).decode(msg.get_content_charset())
  59. )
  60. elif content_type == "text/html":
  61. html_txt.append(
  62. msg.get_payload(decode=True).decode(msg.get_content_charset())
  63. )
  64. elif "multipart" in content_type:
  65. if msg.is_multipart():
  66. for part in msg.iter_parts():
  67. _add_content(part, part.get_content_type())
  68. _add_content(msg, msg.get_content_type())
  69. sections = TxtParser.parser_txt("\n".join(text_txt)) + [
  70. (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
  71. ]
  72. st = timer()
  73. chunks = naive_merge(
  74. sections,
  75. int(parser_config.get("chunk_token_num", 128)),
  76. parser_config.get("delimiter", "\n!?。;!?"),
  77. )
  78. main_res.extend(tokenize_chunks(chunks, doc, eng, None))
  79. logging.debug("naive_merge({}): {}".format(filename, timer() - st))
  80. # get the attachment info
  81. for part in msg.iter_attachments():
  82. content_disposition = part.get("Content-Disposition")
  83. if content_disposition:
  84. dispositions = content_disposition.strip().split(";")
  85. if dispositions[0].lower() == "attachment":
  86. filename = part.get_filename()
  87. payload = part.get_payload(decode=True)
  88. try:
  89. attachment_res.extend(
  90. naive_chunk(filename, payload, callback=callback, **kwargs)
  91. )
  92. except Exception:
  93. pass
  94. return main_res + attachment_res
  95. if __name__ == "__main__":
  96. import sys
  97. def dummy(prog=None, msg=""):
  98. pass
  99. chunk(sys.argv[1], callback=dummy)