# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from io import BytesIO

from tika import parser

from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
    tokenize_chunks, find_codec
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
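
# Book-style chunker: a format-specific parser turns the document into
# (text, layout) sections plus tables; sections are then merged into chunks,
# hierarchically when a bullet/heading style is detected, otherwise naively
# by token count, and finally tokenized for indexing.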


class Pdf(PdfParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback)
        callback(msg="OCR finished")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.67, "Layout analysis finished")
        print("layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._naive_vertical_merge()
        self._filter_forpages()
        self._merge_with_same_bullet()
        callback(0.75, "Text merging finished.")
        callback(0.8, "Text extraction finished")

        # Return (text + position tag, layout label) pairs for every text box,
        # plus the extracted tables/figures.
        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                for b in self.boxes], tbls


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, txt, html and doc.
    Since a book is long and not all of its parts are useful, if it is a PDF,
    please set up the page range for every book in order to eliminate negative
    effects and save computing time.
    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    pdf_parser = None
    sections, tbls = [], []
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        doc_parser = DocxParser()
        # TODO: the table of contents needs to be removed
        sections, tbls = doc_parser(
            binary if binary else filename, from_page=from_page, to_page=to_page)
        remove_contents_table(sections, eng=is_english(
            random_choices([t for t, _ in sections], k=200)))
        tbls = [((None, lns), None) for lns in tbls]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if kwargs.get(
            "parser_config", {}).get(
            "layout_recognize", True) else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page,
                                    callback=callback)

    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                txt = f.read()
        sections = [(line, "") for line in txt.split("\n") if line]
        remove_contents_table(sections, eng=is_english(
            random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = HtmlParser()(filename, binary)
        sections = [(line, "") for line in sections if line]
        remove_contents_table(sections, eng=is_english(
            random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        # Legacy .doc goes through Apache Tika; use the in-memory bytes when
        # given, otherwise fall back to the file path.
        doc_parsed = parser.from_buffer(BytesIO(binary)) if binary \
            else parser.from_file(filename)
        sections = (doc_parsed['content'] or "").split('\n')
        sections = [(line, "") for line in sections if line]
        remove_contents_table(sections, eng=is_english(
            random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")

    else:
        raise NotImplementedError(
            "file type not supported yet (doc, docx, pdf, txt, html supported)")
    make_colon_as_title(sections)
    bull = bullets_category(random_choices([t for t, _ in sections], k=100))
    if bull >= 0:
        chunks = ["\n".join(ck)
                  for ck in hierarchical_merge(bull, sections, 5)]
    else:
        # Split off an "@"-delimited position tag, if any, into the section's
        # second field before naive merging.
        sections = [s.split("@") for s, _ in sections]
        sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '')
                    for pr in sections]
        chunks = naive_merge(
            sections, kwargs.get("chunk_token_num", 256),
            kwargs.get("delimer", "\n。;!?"))

    # Tokenize for indexing; English toggles language-specific tokenization.
    eng = lang.lower() == "english"
    res = tokenize_table(tbls, doc, eng)
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
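
# A minimal sketch of calling chunk() with a real progress callback; the file
# path and parser_config shown are illustrative assumptions, not requirements:
#
#     def progress(prog=None, msg=""):
#         print(f"progress={prog} {msg}")
#
#     res = chunk("my_book.pdf", from_page=0, to_page=10, lang="English",
#                 callback=progress, parser_config={"layout_recognize": True},
#                 chunk_token_num=256)
#     print(len(res), "chunks")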