Nevar pievienot vairāk nekā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. # Licensed under the Apache License, Version 2.0 (the "License");
  2. # you may not use this file except in compliance with the License.
  3. # You may obtain a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. import logging
  14. import sys
  15. from io import BytesIO
  16. import pandas as pd
  17. from openpyxl import Workbook, load_workbook
  18. from rag.nlp import find_codec
  19. class RAGFlowExcelParser:
  20. @staticmethod
  21. def _load_excel_to_workbook(file_like_object):
  22. if isinstance(file_like_object, bytes):
  23. file_like_object = BytesIO(file_like_object)
  24. # Read first 4 bytes to determine file type
  25. file_like_object.seek(0)
  26. file_head = file_like_object.read(4)
  27. file_like_object.seek(0)
  28. if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
  29. logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook")
  30. try:
  31. file_like_object.seek(0)
  32. df = pd.read_csv(file_like_object)
  33. return RAGFlowExcelParser._dataframe_to_workbook(df)
  34. except Exception as e_csv:
  35. raise Exception(f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}")
  36. try:
  37. return load_workbook(file_like_object,data_only= True)
  38. except Exception as e:
  39. logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
  40. try:
  41. file_like_object.seek(0)
  42. df = pd.read_excel(file_like_object)
  43. return RAGFlowExcelParser._dataframe_to_workbook(df)
  44. except Exception as e_pandas:
  45. raise Exception(f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
  46. @staticmethod
  47. def _dataframe_to_workbook(df):
  48. wb = Workbook()
  49. ws = wb.active
  50. ws.title = "Data"
  51. for col_num, column_name in enumerate(df.columns, 1):
  52. ws.cell(row=1, column=col_num, value=column_name)
  53. for row_num, row in enumerate(df.values, 2):
  54. for col_num, value in enumerate(row, 1):
  55. ws.cell(row=row_num, column=col_num, value=value)
  56. return wb
  57. def html(self, fnm, chunk_rows=256):
  58. file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
  59. wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
  60. tb_chunks = []
  61. for sheetname in wb.sheetnames:
  62. ws = wb[sheetname]
  63. rows = list(ws.rows)
  64. if not rows:
  65. continue
  66. tb_rows_0 = "<tr>"
  67. for t in list(rows[0]):
  68. tb_rows_0 += f"<th>{t.value}</th>"
  69. tb_rows_0 += "</tr>"
  70. for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
  71. tb = ""
  72. tb += f"<table><caption>{sheetname}</caption>"
  73. tb += tb_rows_0
  74. for r in list(
  75. rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
  76. ):
  77. tb += "<tr>"
  78. for i, c in enumerate(r):
  79. if c.value is None:
  80. tb += "<td></td>"
  81. else:
  82. tb += f"<td>{c.value}</td>"
  83. tb += "</tr>"
  84. tb += "</table>\n"
  85. tb_chunks.append(tb)
  86. return tb_chunks
  87. def __call__(self, fnm):
  88. file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
  89. wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
  90. res = []
  91. for sheetname in wb.sheetnames:
  92. ws = wb[sheetname]
  93. rows = list(ws.rows)
  94. if not rows:
  95. continue
  96. ti = list(rows[0])
  97. for r in list(rows[1:]):
  98. fields = []
  99. for i, c in enumerate(r):
  100. if not c.value:
  101. continue
  102. t = str(ti[i].value) if i < len(ti) else ""
  103. t += (":" if t else "") + str(c.value)
  104. fields.append(t)
  105. line = "; ".join(fields)
  106. if sheetname.lower().find("sheet") < 0:
  107. line += " ——" + sheetname
  108. res.append(line)
  109. return res
  110. @staticmethod
  111. def row_number(fnm, binary):
  112. if fnm.split(".")[-1].lower().find("xls") >= 0:
  113. wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
  114. total = 0
  115. for sheetname in wb.sheetnames:
  116. ws = wb[sheetname]
  117. total += len(list(ws.rows))
  118. return total
  119. if fnm.split(".")[-1].lower() in ["csv", "txt"]:
  120. encoding = find_codec(binary)
  121. txt = binary.decode(encoding, errors="ignore")
  122. return len(txt.split("\n"))
  123. if __name__ == "__main__":
  124. psr = RAGFlowExcelParser()
  125. psr(sys.argv[1])