Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. from docx import Document
  17. import re
  18. import pandas as pd
  19. from collections import Counter
  20. from rag.nlp import rag_tokenizer
  21. from io import BytesIO
  22. class RAGFlowDocxParser:
  23. def __extract_table_content(self, tb):
  24. df = []
  25. for row in tb.rows:
  26. df.append([c.text for c in row.cells])
  27. return self.__compose_table_content(pd.DataFrame(df))
  28. def __compose_table_content(self, df):
  29. def blockType(b):
  30. pattern = [
  31. ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
  32. (r"^(20|19)[0-9]{2}年$", "Dt"),
  33. (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
  34. ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
  35. (r"^第*[一二三四1-4]季度$", "Dt"),
  36. (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
  37. (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
  38. ("^[0-9.,+%/ -]+$", "Nu"),
  39. (r"^[0-9A-Z/\._~-]+$", "Ca"),
  40. (r"^[A-Z]*[a-z' -]+$", "En"),
  41. (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
  42. (r"^.{1}$", "Sg")
  43. ]
  44. for p, n in pattern:
  45. if re.search(p, b):
  46. return n
  47. tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
  48. if len(tks) > 3:
  49. if len(tks) < 12:
  50. return "Tx"
  51. else:
  52. return "Lx"
  53. if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
  54. return "Nr"
  55. return "Ot"
  56. if len(df) < 2:
  57. return []
  58. max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
  59. 1, len(df)) for j in range(len(df.iloc[i, :]))])
  60. max_type = max(max_type.items(), key=lambda x: x[1])[0]
  61. colnm = len(df.iloc[0, :])
  62. hdrows = [0] # header is not necessarily appear in the first line
  63. if max_type == "Nu":
  64. for r in range(1, len(df)):
  65. tys = Counter([blockType(str(df.iloc[r, j]))
  66. for j in range(len(df.iloc[r, :]))])
  67. tys = max(tys.items(), key=lambda x: x[1])[0]
  68. if tys != max_type:
  69. hdrows.append(r)
  70. lines = []
  71. for i in range(1, len(df)):
  72. if i in hdrows:
  73. continue
  74. hr = [r - i for r in hdrows]
  75. hr = [r for r in hr if r < 0]
  76. t = len(hr) - 1
  77. while t > 0:
  78. if hr[t] - hr[t - 1] > 1:
  79. hr = hr[t:]
  80. break
  81. t -= 1
  82. headers = []
  83. for j in range(len(df.iloc[i, :])):
  84. t = []
  85. for h in hr:
  86. x = str(df.iloc[i + h, j]).strip()
  87. if x in t:
  88. continue
  89. t.append(x)
  90. t = ",".join(t)
  91. if t:
  92. t += ": "
  93. headers.append(t)
  94. cells = []
  95. for j in range(len(df.iloc[i, :])):
  96. if not str(df.iloc[i, j]):
  97. continue
  98. cells.append(headers[j] + str(df.iloc[i, j]))
  99. lines.append(";".join(cells))
  100. if colnm > 3:
  101. return lines
  102. return ["\n".join(lines)]
  103. def __call__(self, fnm, from_page=0, to_page=100000000):
  104. self.doc = Document(fnm) if isinstance(
  105. fnm, str) else Document(BytesIO(fnm))
  106. pn = 0 # parsed page
  107. secs = [] # parsed contents
  108. for p in self.doc.paragraphs:
  109. if pn > to_page:
  110. break
  111. runs_within_single_paragraph = [] # save runs within the range of pages
  112. for run in p.runs:
  113. if pn > to_page:
  114. break
  115. if from_page <= pn < to_page and p.text.strip():
  116. runs_within_single_paragraph.append(run.text) # append run.text first
  117. # wrap page break checker into a static method
  118. if 'lastRenderedPageBreak' in run._element.xml:
  119. pn += 1
  120. secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
  121. tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
  122. return secs, tbls