You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

excel_parser.py 5.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. # Licensed under the Apache License, Version 2.0 (the "License");
  2. # you may not use this file except in compliance with the License.
  3. # You may obtain a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. import logging
  14. import sys
  15. from io import BytesIO
  16. import pandas as pd
  17. from openpyxl import Workbook, load_workbook
  18. from rag.nlp import find_codec
  19. class RAGFlowExcelParser:
  20. @staticmethod
  21. def _load_excel_to_workbook(file_like_object):
  22. if isinstance(file_like_object, bytes):
  23. file_like_object = BytesIO(file_like_object)
  24. # Read first 4 bytes to determine file type
  25. file_like_object.seek(0)
  26. file_head = file_like_object.read(4)
  27. file_like_object.seek(0)
  28. if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
  29. logging.info("Not an Excel file, converting CSV to Excel Workbook")
  30. try:
  31. file_like_object.seek(0)
  32. df = pd.read_csv(file_like_object)
  33. return RAGFlowExcelParser._dataframe_to_workbook(df)
  34. except Exception as e_csv:
  35. raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")
  36. try:
  37. return load_workbook(file_like_object,data_only= True)
  38. except Exception as e:
  39. logging.info(f"openpyxl load error: {e}, try pandas instead")
  40. try:
  41. file_like_object.seek(0)
  42. try:
  43. df = pd.read_excel(file_like_object)
  44. return RAGFlowExcelParser._dataframe_to_workbook(df)
  45. except Exception as ex:
  46. logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
  47. file_like_object.seek(0)
  48. df = pd.read_excel(file_like_object, engine='calamine')
  49. return RAGFlowExcelParser._dataframe_to_workbook(df)
  50. except Exception as e_pandas:
  51. raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
  52. @staticmethod
  53. def _dataframe_to_workbook(df):
  54. wb = Workbook()
  55. ws = wb.active
  56. ws.title = "Data"
  57. for col_num, column_name in enumerate(df.columns, 1):
  58. ws.cell(row=1, column=col_num, value=column_name)
  59. for row_num, row in enumerate(df.values, 2):
  60. for col_num, value in enumerate(row, 1):
  61. ws.cell(row=row_num, column=col_num, value=value)
  62. return wb
  63. def html(self, fnm, chunk_rows=256):
  64. file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
  65. wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
  66. tb_chunks = []
  67. for sheetname in wb.sheetnames:
  68. ws = wb[sheetname]
  69. rows = list(ws.rows)
  70. if not rows:
  71. continue
  72. tb_rows_0 = "<tr>"
  73. for t in list(rows[0]):
  74. tb_rows_0 += f"<th>{t.value}</th>"
  75. tb_rows_0 += "</tr>"
  76. for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
  77. tb = ""
  78. tb += f"<table><caption>{sheetname}</caption>"
  79. tb += tb_rows_0
  80. for r in list(
  81. rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
  82. ):
  83. tb += "<tr>"
  84. for i, c in enumerate(r):
  85. if c.value is None:
  86. tb += "<td></td>"
  87. else:
  88. tb += f"<td>{c.value}</td>"
  89. tb += "</tr>"
  90. tb += "</table>\n"
  91. tb_chunks.append(tb)
  92. return tb_chunks
  93. def __call__(self, fnm):
  94. file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
  95. wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
  96. res = []
  97. for sheetname in wb.sheetnames:
  98. ws = wb[sheetname]
  99. rows = list(ws.rows)
  100. if not rows:
  101. continue
  102. ti = list(rows[0])
  103. for r in list(rows[1:]):
  104. fields = []
  105. for i, c in enumerate(r):
  106. if not c.value:
  107. continue
  108. t = str(ti[i].value) if i < len(ti) else ""
  109. t += (":" if t else "") + str(c.value)
  110. fields.append(t)
  111. line = "; ".join(fields)
  112. if sheetname.lower().find("sheet") < 0:
  113. line += " ——" + sheetname
  114. res.append(line)
  115. return res
  116. @staticmethod
  117. def row_number(fnm, binary):
  118. if fnm.split(".")[-1].lower().find("xls") >= 0:
  119. wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
  120. total = 0
  121. for sheetname in wb.sheetnames:
  122. ws = wb[sheetname]
  123. total += len(list(ws.rows))
  124. return total
  125. if fnm.split(".")[-1].lower() in ["csv", "txt"]:
  126. encoding = find_codec(binary)
  127. txt = binary.decode(encoding, errors="ignore")
  128. return len(txt.split("\n"))
  129. if __name__ == "__main__":
  130. psr = RAGFlowExcelParser()
  131. psr(sys.argv[1])