# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from io import BytesIO

from docx import Document
from tika import parser

from api.db import ParserType
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
    make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
from rag.nlp import rag_tokenizer
from rag.settings import cron_logger
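
# Chunking pipeline for law-style documents (ParserType.LAWS): parse the
# source file, detect the heading hierarchy from its bullet/numbering style,
# and merge the resulting sections into retrieval chunks.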


class Docx(DocxParser):
    def __init__(self):
        pass

    def __clean(self, line):
        # Replace full-width (ideographic) spaces and trim surrounding whitespace.
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def old_call(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        lines = []
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            if from_page <= pn < to_page and p.text.strip():
                lines.append(self.__clean(p.text))
            # Track the page number by counting rendered and explicit page breaks.
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1
        return [l for l in lines if l]

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        last_question, last_answer, last_level = "", "", -1
        lines = []
        root = DocxNode()
        point = root
        # Classify the document's bullet/numbering style so heading levels can be ranked.
        bull = bullets_category([p.text for p in self.doc.paragraphs])
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            question_level, p_text = 0, ''
            if from_page <= pn < to_page and p.text.strip():
                question_level, p_text = docx_question_level(p, bull)
            if not question_level or question_level > 6:  # not a question
                last_answer = f'{last_answer}\n{p_text}'
            else:  # is a question
                if last_question:
                    # Climb up to the nearest ancestor above this level, then attach.
                    while last_level <= point.level:
                        point = point.parent
                    new_node = DocxNode(last_question, last_answer, last_level, [], point)
                    point.childs.append(new_node)
                    point = new_node
                    last_question, last_answer, last_level = '', '', -1
                last_level = question_level
                last_answer = ''
                last_question = p_text
            # Track the page number by counting rendered and explicit page breaks.
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1
        # Flush the last pending question/answer pair into the tree.
        if last_question:
            while last_level <= point.level:
                point = point.parent
            new_node = DocxNode(last_question, last_answer, last_level, [], point)
            point.childs.append(new_node)
            point = new_node
            last_question, last_answer, last_level = '', '', -1

        # Breadth-first traversal: each emitted line bundles a heading, its body,
        # and the headings of its direct children.
        traversal_queue = [root]
        while traversal_queue:
            current_node: DocxNode = traversal_queue.pop()
            sum_text = f'{self.__clean(current_node.question)}\n{self.__clean(current_node.answer)}'
            if not current_node.childs and not current_node.answer.strip():
                continue
            for child in current_node.childs:
                sum_text = f'{sum_text}\n{self.__clean(child.question)}'
                traversal_queue.insert(0, child)
            lines.append(self.__clean(sum_text))
        return [l for l in lines if l]


class DocxNode:
    def __init__(self, question: str = '', answer: str = '', level: int = 0, childs=None, parent=None) -> None:
        self.question = question
        self.answer = answer
        self.level = level
        # Default to None rather than []: a mutable default list would be
        # shared by every node created without an explicit `childs` argument.
        self.childs = childs if childs is not None else []
        self.parent = parent

    def __str__(self) -> str:
        return f'''
            question:{self.question},
            answer:{self.answer},
            level:{self.level},
            childs:{self.childs}
        '''
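
# Usage sketch (the file name is hypothetical): each returned section bundles
# a heading, its body text, and the headings of its direct children, so the
# hierarchy survives chunking:
#
#     for section in Docx()("contract.docx"):
#         print(section)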


class Pdf(PdfParser):
    def __init__(self):
        self.model_speciess = ParserType.LAWS.value
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.67, "Layout analysis finished")
        cron_logger.info("layouts: {}".format(
            (timer() - start) / (self.total_page + 0.1)))

        self._naive_vertical_merge()
        callback(0.8, "Text extraction finished")
        return [(b["text"], self._line_tag(b, zoomin))
                for b in self.boxes], None
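
# Usage sketch (hypothetical reporter function): despite the callback=None
# default, Pdf.__call__ invokes the callback unconditionally, so a callable
# must be supplied:
#
#     def report(prog=None, msg=""):
#         print(prog, msg)
#
#     sections, _ = Pdf()("ruling.pdf", callback=report)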


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, txt, htm/html and doc.
    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    pdf_parser = None
    sections = []
    # Is it English?
    eng = lang.lower() == "english"  # is_english(sections)

    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        for txt in Docx()(filename, binary):
            sections.append(txt)
        callback(0.8, "Finish parsing.")
        chunks = sections
        return tokenize_chunks(chunks, doc, eng, pdf_parser)

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf() if kwargs.get(
            "parser_config", {}).get(
            "layout_recognize", True) else PlainParser()
        for txt, poss in pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
            sections.append(txt + poss)

    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
                    if not l:
                        break
                    txt += l
        sections = txt.split("\n")
        sections = [l for l in sections if l]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = HtmlParser()(filename, binary)
        sections = [l for l in sections if l]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        binary = BytesIO(binary)
        doc_parsed = parser.from_buffer(binary)
        sections = doc_parsed['content'].split('\n')
        sections = [l for l in sections if l]
        callback(0.8, "Finish parsing.")

    else:
        raise NotImplementedError(
            "file type not supported yet (doc, docx, pdf, txt, htm/html supported)")

    # Remove the table-of-contents ('Contents') part.
    remove_contents_table(sections, eng)
    make_colon_as_title(sections)
    bull = bullets_category(sections)
    chunks = hierarchical_merge(bull, sections, 5)
    if not chunks:
        callback(0.99, "No chunk parsed out.")

    return tokenize_chunks(["\n".join(ck)
                            for ck in chunks], doc, eng, pdf_parser)
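
# Example (hypothetical file name): skip DeepDoc layout analysis for a PDF by
# passing the parser_config mapping read from **kwargs above:
#
#     chunks = chunk("statute.pdf", callback=lambda prog=None, msg="": None,
#                    parser_config={"layout_recognize": False})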


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], callback=dummy)