Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

1 год назад
1 год назад
1 год назад
1 год назад
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. # -*- coding: utf-8 -*-
  2. from openpyxl import load_workbook
  3. import sys
  4. from io import BytesIO
  5. from rag.nlp import find_codec
  6. class RAGFlowExcelParser:
  7. def html(self, fnm):
  8. if isinstance(fnm, str):
  9. wb = load_workbook(fnm)
  10. else:
  11. wb = load_workbook(BytesIO(fnm))
  12. tb = ""
  13. for sheetname in wb.sheetnames:
  14. ws = wb[sheetname]
  15. rows = list(ws.rows)
  16. if not rows:continue
  17. tb += f"<table><caption>{sheetname}</caption><tr>"
  18. for t in list(rows[0]):
  19. tb += f"<th>{t.value}</th>"
  20. tb += "</tr>"
  21. for r in list(rows[1:]):
  22. tb += "<tr>"
  23. for i, c in enumerate(r):
  24. if c.value is None:
  25. tb += "<td></td>"
  26. else:
  27. tb += f"<td>{c.value}</td>"
  28. tb += "</tr>"
  29. tb += "</table>\n"
  30. return tb
  31. def __call__(self, fnm):
  32. if isinstance(fnm, str):
  33. wb = load_workbook(fnm)
  34. else:
  35. wb = load_workbook(BytesIO(fnm))
  36. res = []
  37. for sheetname in wb.sheetnames:
  38. ws = wb[sheetname]
  39. rows = list(ws.rows)
  40. if not rows:continue
  41. ti = list(rows[0])
  42. for r in list(rows[1:]):
  43. l = []
  44. for i, c in enumerate(r):
  45. if not c.value:
  46. continue
  47. t = str(ti[i].value) if i < len(ti) else ""
  48. t += (":" if t else "") + str(c.value)
  49. l.append(t)
  50. l = "; ".join(l)
  51. if sheetname.lower().find("sheet") < 0:
  52. l += " ——" + sheetname
  53. res.append(l)
  54. return res
  55. @staticmethod
  56. def row_number(fnm, binary):
  57. if fnm.split(".")[-1].lower().find("xls") >= 0:
  58. wb = load_workbook(BytesIO(binary))
  59. total = 0
  60. for sheetname in wb.sheetnames:
  61. ws = wb[sheetname]
  62. total += len(list(ws.rows))
  63. return total
  64. if fnm.split(".")[-1].lower() in ["csv", "txt"]:
  65. encoding = find_codec(binary)
  66. txt = binary.decode(encoding)
  67. return len(txt.split("\n"))
  68. if __name__ == "__main__":
  69. psr = RAGFlowExcelParser()
  70. psr(sys.argv[1])