
docx_parser.py 4.9KB

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from docx import Document
import re
import pandas as pd
from collections import Counter
from rag.nlp import rag_tokenizer
from io import BytesIO


class RAGFlowDocxParser:
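    # Turns a .docx file into (sections, tables): each section is a
    # (paragraph text, style name) pair, and each table is flattened into
    # "header: value" text lines.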

    def __extract_table_content(self, tb):
        df = []
        for row in tb.rows:
            df.append([c.text for c in row.cells])
        return self.__compose_table_content(pd.DataFrame(df))

    def __compose_table_content(self, df):

        def blockType(b):
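            # Cell type tags (inferred from the patterns below): "Dt" dates,
            # "Nu" pure numbers, "Ca" code-like identifiers, "En" English words,
            # "NE" number plus unit/entity, "Sg" single character, "Tx"/"Lx"
            # short/long free text, "Nr" person name, "Ot" anything else.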
            patt = [
                ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^(20|19)[0-9]{2}年$", "Dt"),
                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
                ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^第*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
                ("^[0-9.,+%/ -]+$", "Nu"),
                (r"^[0-9A-Z/\._~-]+$", "Ca"),
                (r"^[A-Z]*[a-z' -]+$", "En"),
                (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>（）()' -]+$", "NE"),
                (r"^.{1}$", "Sg")
            ]
            for p, n in patt:
                if re.search(p, b):
                    return n
            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
            if len(tks) > 3:
                if len(tks) < 12:
                    return "Tx"
                else:
                    return "Lx"

            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"

            return "Ot"

        if len(df) < 2:
            return []
        max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
            1, len(df)) for j in range(len(df.iloc[i, :]))])
        max_type = max(max_type.items(), key=lambda x: x[1])[0]

        colnm = len(df.iloc[0, :])
        hdrows = [0]  # the header does not necessarily appear only in the first row
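        # If the table body is mostly numeric, treat any later row whose
        # dominant cell type differs as an additional header row.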
        if max_type == "Nu":
            for r in range(1, len(df)):
                tys = Counter([blockType(str(df.iloc[r, j]))
                               for j in range(len(df.iloc[r, :]))])
                tys = max(tys.items(), key=lambda x: x[1])[0]
                if tys != max_type:
                    hdrows.append(r)
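
        # For each data row, keep only the closest contiguous run of header
        # rows above it and prefix every cell with those header cells.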
        lines = []
        for i in range(1, len(df)):
            if i in hdrows:
                continue
            hr = [r - i for r in hdrows]
            hr = [r for r in hr if r < 0]
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1
            headers = []
            for j in range(len(df.iloc[i, :])):
                t = []
                for h in hr:
                    x = str(df.iloc[i + h, j]).strip()
                    if x in t:
                        continue
                    t.append(x)
                t = ",".join(t)
                if t:
                    t += ": "
                headers.append(t)
            cells = []
            for j in range(len(df.iloc[i, :])):
                if not str(df.iloc[i, j]):
                    continue
                cells.append(headers[j] + str(df.iloc[i, j]))
            lines.append(";".join(cells))
        if colnm > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0  # parsed page
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            runs_within_single_paragraph = []  # save runs within the range of pages
            for run in p.runs:
                if pn > to_page:
                    break
                if from_page <= pn < to_page and p.text.strip():
                    runs_within_single_paragraph.append(run.text)  # append run.text first

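                # 'lastRenderedPageBreak' marks where Word last rendered a page
                # break, so counting these elements approximates page numbers
                # (a .docx file carries no fixed pagination of its own).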
                # wrap page break checker into a static method
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else ''))  # then concat run.text as part of the paragraph

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
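

# Minimal usage sketch (not part of the original file). It assumes this module
# lives inside the RAGFlow repo so that `rag.nlp.rag_tokenizer` resolves, and
# "example.docx" is a placeholder path to a real document.
if __name__ == "__main__":
    parser = RAGFlowDocxParser()
    secs, tbls = parser("example.docx", from_page=0, to_page=4)
    for text, style in secs:
        print(f"[{style}] {text[:80]}")
    for tb in tbls:
        # each parsed table is a list of "header: value;header: value" rows
        print("\n".join(tb))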