Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

docx_parser.py 4.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. # -*- coding: utf-8 -*-
  2. from docx import Document
  3. import re
  4. import pandas as pd
  5. from collections import Counter
  6. from rag.nlp import huqie
  7. from io import BytesIO
  8. class HuDocxParser:
  9. def __extract_table_content(self, tb):
  10. df = []
  11. for row in tb.rows:
  12. df.append([c.text for c in row.cells])
  13. return self.__compose_table_content(pd.DataFrame(df))
  14. def __compose_table_content(self, df):
  15. def blockType(b):
  16. patt = [
  17. ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
  18. (r"^(20|19)[0-9]{2}年$", "Dt"),
  19. (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
  20. ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
  21. (r"^第*[一二三四1-4]季度$", "Dt"),
  22. (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
  23. (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
  24. ("^[0-9.,+%/ -]+$", "Nu"),
  25. (r"^[0-9A-Z/\._~-]+$", "Ca"),
  26. (r"^[A-Z]*[a-z' -]+$", "En"),
  27. (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
  28. (r"^.{1}$", "Sg")
  29. ]
  30. for p, n in patt:
  31. if re.search(p, b):
  32. return n
  33. tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1]
  34. if len(tks) > 3:
  35. if len(tks) < 12:
  36. return "Tx"
  37. else:
  38. return "Lx"
  39. if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
  40. return "Nr"
  41. return "Ot"
  42. if len(df) < 2:
  43. return []
  44. max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
  45. 1, len(df)) for j in range(len(df.iloc[i, :]))])
  46. max_type = max(max_type.items(), key=lambda x: x[1])[0]
  47. colnm = len(df.iloc[0, :])
  48. hdrows = [0] # header is not nessesarily appear in the first line
  49. if max_type == "Nu":
  50. for r in range(1, len(df)):
  51. tys = Counter([blockType(str(df.iloc[r, j]))
  52. for j in range(len(df.iloc[r, :]))])
  53. tys = max(tys.items(), key=lambda x: x[1])[0]
  54. if tys != max_type:
  55. hdrows.append(r)
  56. lines = []
  57. for i in range(1, len(df)):
  58. if i in hdrows:
  59. continue
  60. hr = [r - i for r in hdrows]
  61. hr = [r for r in hr if r < 0]
  62. t = len(hr) - 1
  63. while t > 0:
  64. if hr[t] - hr[t - 1] > 1:
  65. hr = hr[t:]
  66. break
  67. t -= 1
  68. headers = []
  69. for j in range(len(df.iloc[i, :])):
  70. t = []
  71. for h in hr:
  72. x = str(df.iloc[i + h, j]).strip()
  73. if x in t:
  74. continue
  75. t.append(x)
  76. t = ",".join(t)
  77. if t:
  78. t += ": "
  79. headers.append(t)
  80. cells = []
  81. for j in range(len(df.iloc[i, :])):
  82. if not str(df.iloc[i, j]):
  83. continue
  84. cells.append(headers[j] + str(df.iloc[i, j]))
  85. lines.append(";".join(cells))
  86. if colnm > 3:
  87. return lines
  88. return ["\n".join(lines)]
  89. def __call__(self, fnm, from_page=0, to_page=100000):
  90. self.doc = Document(fnm) if isinstance(fnm, str) else Document(BytesIO(fnm))
  91. pn = 0
  92. secs = []
  93. for p in self.doc.paragraphs:
  94. if pn > to_page: break
  95. if from_page <= pn < to_page and p.text.strip(): secs.append((p.text, p.style.name))
  96. for run in p.runs:
  97. if 'lastRenderedPageBreak' in run._element.xml:
  98. pn += 1
  99. continue
  100. if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
  101. pn += 1
  102. tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
  103. return secs, tbls