Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

1 год назад
1 год назад
1 год назад
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. # -*- coding: utf-8 -*-
  2. from openpyxl import load_workbook
  3. import sys
  4. from io import BytesIO
  5. from rag.nlp import find_codec
  6. class RAGFlowExcelParser:
  7. def html(self, fnm, chunk_rows=256):
  8. if isinstance(fnm, str):
  9. wb = load_workbook(fnm)
  10. else:
  11. wb = load_workbook(BytesIO(fnm))
  12. tb_chunks = []
  13. for sheetname in wb.sheetnames:
  14. ws = wb[sheetname]
  15. rows = list(ws.rows)
  16. if not rows: continue
  17. tb_rows_0 = "<tr>"
  18. for t in list(rows[0]):
  19. tb_rows_0 += f"<th>{t.value}</th>"
  20. tb_rows_0 += "</tr>"
  21. for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
  22. tb = ""
  23. tb += f"<table><caption>{sheetname}</caption>"
  24. tb += tb_rows_0
  25. for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
  26. tb += "<tr>"
  27. for i, c in enumerate(r):
  28. if c.value is None:
  29. tb += "<td></td>"
  30. else:
  31. tb += f"<td>{c.value}</td>"
  32. tb += "</tr>"
  33. tb += "</table>\n"
  34. tb_chunks.append(tb)
  35. return tb_chunks
  36. def __call__(self, fnm):
  37. if isinstance(fnm, str):
  38. wb = load_workbook(fnm)
  39. else:
  40. wb = load_workbook(BytesIO(fnm))
  41. res = []
  42. for sheetname in wb.sheetnames:
  43. ws = wb[sheetname]
  44. rows = list(ws.rows)
  45. if not rows:continue
  46. ti = list(rows[0])
  47. for r in list(rows[1:]):
  48. l = []
  49. for i, c in enumerate(r):
  50. if not c.value:
  51. continue
  52. t = str(ti[i].value) if i < len(ti) else ""
  53. t += (":" if t else "") + str(c.value)
  54. l.append(t)
  55. l = "; ".join(l)
  56. if sheetname.lower().find("sheet") < 0:
  57. l += " ——" + sheetname
  58. res.append(l)
  59. return res
  60. @staticmethod
  61. def row_number(fnm, binary):
  62. if fnm.split(".")[-1].lower().find("xls") >= 0:
  63. wb = load_workbook(BytesIO(binary))
  64. total = 0
  65. for sheetname in wb.sheetnames:
  66. ws = wb[sheetname]
  67. total += len(list(ws.rows))
  68. return total
  69. if fnm.split(".")[-1].lower() in ["csv", "txt"]:
  70. encoding = find_codec(binary)
  71. txt = binary.decode(encoding, errors="ignore")
  72. return len(txt.split("\n"))
  73. if __name__ == "__main__":
  74. psr = RAGFlowExcelParser()
  75. psr(sys.argv[1])