
html_parser.py

# -*- coding: utf-8 -*-
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import html
import uuid

import chardet
from bs4 import BeautifulSoup, Comment, NavigableString, Tag

from rag.nlp import find_codec, rag_tokenizer


def get_encoding(file):
    """Detect a file's character encoding by sniffing its bytes with chardet."""
    with open(file, 'rb') as f:
        tmp = chardet.detect(f.read())
    return tmp['encoding']


BLOCK_TAGS = [
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p", "div", "article", "section", "aside",
    "ul", "ol", "li",
    "table", "pre", "code", "blockquote",
    "figure", "figcaption",
]

# Markdown heading prefix for each HTML heading tag.
TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"}


class RAGFlowHtmlParser:
    def __call__(self, fnm, binary=None, chunk_token_num=512):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(fnm, "r", encoding=get_encoding(fnm)) as f:
                txt = f.read()
        return self.parser_txt(txt, chunk_token_num)

    @classmethod
    def parser_txt(cls, txt, chunk_token_num=512):
        if not isinstance(txt, str):
            raise TypeError("txt type should be string!")
        temp_sections = []
        soup = BeautifulSoup(txt, "html5lib")
        # Delete <style> and <script> tags.
        for tag in soup.find_all(["style", "script"]):
            tag.decompose()
        # Delete inline style attributes.
        for tag in soup.find_all(True):
            if 'style' in tag.attrs:
                del tag.attrs['style']
        # Delete HTML comments.
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        # Walk the DOM, merge text nodes by block, then cut blocks into
        # token-bounded chunks; tables are appended as whole sections.
        cls.read_text_recursively(soup.body, temp_sections, chunk_token_num=chunk_token_num)
        block_txt_list, table_list = cls.merge_block_text(temp_sections)
        sections = cls.chunk_block(block_txt_list, chunk_token_num=chunk_token_num)
        for table in table_list:
            sections.append(table.get("content", ""))
        return sections

    @classmethod
    def split_table(cls, html_table, chunk_token_num=512):
        soup = BeautifulSoup(html_table, "html.parser")
        rows = soup.find_all("tr")
        tables = []
        current_table = []
        current_count = 0
        table_str_list = []
        for row in rows:
            tks_str = rag_tokenizer.tokenize(str(row))
            token_count = len(tks_str.split(" ")) if tks_str else 0
            # Start a new group of rows once the token budget would be exceeded.
            if current_table and current_count + token_count > chunk_token_num:
                tables.append(current_table)
                current_table = []
                current_count = 0
            current_table.append(row)
            current_count += token_count
        if current_table:
            tables.append(current_table)
        # Rebuild each group of rows into a standalone <table>.
        for table_rows in tables:
            new_table = soup.new_tag("table")
            for row in table_rows:
                new_table.append(row)
            table_str_list.append(str(new_table))
        return table_str_list
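
    # A rough usage sketch (token counts come from rag_tokenizer, so the
    # exact split points are illustrative, not guaranteed): with a small
    # budget, each group of rows comes back as its own <table> string.
    #
    #   RAGFlowHtmlParser.split_table(
    #       "<table><tr><td>a</td></tr><tr><td>b</td></tr></table>",
    #       chunk_token_num=8,
    #   )
    #   # -> ["<table><tr><td>a</td></tr></table>",
    #   #     "<table><tr><td>b</td></tr></table>"]  (approximately)
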
    @classmethod
    def read_text_recursively(cls, element, parser_result, chunk_token_num=512, parent_name=None, block_id=None):
        if isinstance(element, NavigableString):
            content = element.strip()

            def is_valid_html(content):
                try:
                    soup = BeautifulSoup(content, "html.parser")
                    return bool(soup.find())
                except Exception:
                    return False

            return_info = []
            if content:
                if is_valid_html(content):
                    # The text node itself contains markup: re-parse and recurse.
                    soup = BeautifulSoup(content, "html.parser")
                    child_info = cls.read_text_recursively(soup, parser_result, chunk_token_num, element.name, block_id)
                    parser_result.extend(child_info)
                else:
                    info = {"content": content, "tag_name": "inner_text", "metadata": {"block_id": block_id}}
                    if parent_name:
                        info["tag_name"] = parent_name
                    return_info.append(info)
            return return_info
        elif isinstance(element, Tag):
            if element.name.lower() == "table":
                # Keep tables whole; they are emitted as separate sections later.
                table_id = str(uuid.uuid1())
                table_list = [html.unescape(str(element))]
                return [
                    {"content": t, "tag_name": "table",
                     "metadata": {"table_id": table_id, "index": idx}}
                    for idx, t in enumerate(table_list)
                ]
            else:
                # Each block-level element gets its own id so that its text
                # nodes can be merged back together in merge_block_text.
                block_id = None
                if element.name.lower() in BLOCK_TAGS:
                    block_id = str(uuid.uuid1())
                for child in element.children:
                    child_info = cls.read_text_recursively(child, parser_result, chunk_token_num, element.name, block_id)
                    parser_result.extend(child_info)
                return []
        return []

    @classmethod
    def merge_block_text(cls, parser_result):
        block_content = []
        current_content = ""
        table_info_list = []
        last_block_id = None
        for item in parser_result:
            content = item.get("content")
            tag_name = item.get("tag_name")
            title_flag = tag_name in TITLE_TAGS
            block_id = item.get("metadata", {}).get("block_id")
            if block_id:
                if title_flag:
                    # Render headings as Markdown titles.
                    content = f"{TITLE_TAGS[tag_name]} {content}"
                if last_block_id != block_id:
                    # A new block starts: flush the previous one.
                    if last_block_id is not None:
                        block_content.append(current_content)
                    current_content = content
                    last_block_id = block_id
                else:
                    current_content += (" " if current_content else "") + content
            else:
                if tag_name == "table":
                    # Tables bypass merging; they are returned separately.
                    table_info_list.append(item)
                else:
                    current_content += (" " if current_content else "") + content
        if current_content:
            block_content.append(current_content)
        return block_content, table_info_list
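
    # Example: two "inner_text" items sharing one block_id merge into a single
    # string, an "h2" item becomes "## <its text>", and items tagged "table"
    # are routed to table_info_list instead of the merged text blocks.
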
    @classmethod
    def chunk_block(cls, block_txt_list, chunk_token_num=512):
        chunks = []
        current_block = ""
        current_token_count = 0
        for block in block_txt_list:
            tks_str = rag_tokenizer.tokenize(block)
            block_token_count = len(tks_str.split(" ")) if tks_str else 0
            if block_token_count > chunk_token_num:
                # The block alone exceeds the budget: flush the current chunk,
                # then split the oversized block into fixed-size token windows.
                if current_block:
                    chunks.append(current_block)
                tokens = tks_str.split(" ")
                start = 0
                while start < len(tokens):
                    end = start + chunk_token_num
                    chunks.append(" ".join(tokens[start:end]))
                    start = end
                current_block = ""
                current_token_count = 0
            else:
                # Pack blocks greedily until the token budget is reached.
                if current_token_count + block_token_count <= chunk_token_num:
                    current_block += ("\n" if current_block else "") + block
                    current_token_count += block_token_count
                else:
                    chunks.append(current_block)
                    current_block = block
                    current_token_count = block_token_count
        if current_block:
            chunks.append(current_block)
        return chunks
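

# A minimal usage sketch, assuming this module is importable (it depends on
# rag.nlp from the RAGFlow code base). "sample.html" and the chunk size are
# hypothetical values for illustration.
if __name__ == "__main__":
    parser = RAGFlowHtmlParser()
    sections = parser("sample.html", chunk_token_num=512)  # hypothetical input file
    for section in sections:
        print(section[:80])  # preview the start of each chunk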