|
|
|
|
|
|
|
|
# |
|
|
# |
|
|
|
|
|
|
|
|
import logging |
|
|
import logging |
|
|
from openpyxl import load_workbook, Workbook |
|
|
|
|
|
import sys |
|
|
import sys |
|
|
from io import BytesIO |
|
|
from io import BytesIO |
|
|
|
|
|
|
|
|
from rag.nlp import find_codec |
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
import pandas as pd |
|
|
|
|
|
from openpyxl import Workbook, load_workbook |
|
|
|
|
|
|
|
|
|
|
|
from rag.nlp import find_codec |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RAGFlowExcelParser: |
|
|
class RAGFlowExcelParser: |
|
|
|
|
|
|
|
|
@staticmethod
def _load_excel_to_workbook(file_like_object):
    """Load spreadsheet-like input into an openpyxl ``Workbook``.

    The first four bytes decide the route. Input starting with one of the
    two signatures checked below (the ZIP header used by .xlsx and the
    OLE2 header used by legacy .xls) is given to openpyxl, with
    ``pandas.read_excel`` as a fallback. Anything else is parsed as CSV.

    Args:
        file_like_object: Raw ``bytes``/``bytearray``, a filesystem path
            given as ``str``, or a seekable binary file-like object.

    Returns:
        The workbook loaded by openpyxl, or a new workbook built from the
        pandas DataFrame when the CSV or pandas fallback route is taken.

    Raises:
        Exception: If CSV parsing fails, or if both openpyxl and pandas
            fail to read the Excel data. The underlying error is chained
            as ``__cause__``.
    """
    if isinstance(file_like_object, str):
        # Callers in this file may pass a path unchanged (only non-str
        # input is wrapped in BytesIO), and a str has no seek()/read().
        with open(file_like_object, "rb") as source:
            file_like_object = BytesIO(source.read())
    elif isinstance(file_like_object, (bytes, bytearray)):
        file_like_object = BytesIO(file_like_object)

    # Read first 4 bytes to determine file type, then rewind so the
    # parsers below see the stream from the start.
    file_like_object.seek(0)
    file_head = file_like_object.read(4)
    file_like_object.seek(0)

    if not file_head.startswith((b"PK\x03\x04", b"\xD0\xCF\x11\xE0")):
        logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook")
        try:
            df = pd.read_csv(file_like_object)
            return RAGFlowExcelParser._dataframe_to_workbook(df)
        except Exception as e_csv:
            raise Exception(
                f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}"
            ) from e_csv

    try:
        return load_workbook(file_like_object)
    except Exception as e:
        logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
        try:
            # openpyxl may have consumed part of the stream before failing.
            file_like_object.seek(0)
            df = pd.read_excel(file_like_object)
            return RAGFlowExcelParser._dataframe_to_workbook(df)
        except Exception as e_pandas:
            raise Exception(
                f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}"
            ) from e_pandas
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
def _dataframe_to_workbook(df):
    """Copy a pandas DataFrame into a new single-sheet openpyxl ``Workbook``.

    The active sheet is renamed to ``"Data"``. Row 1 holds the column
    labels and each DataFrame row follows from row 2 onward. The
    DataFrame index is not written.

    Args:
        df: The DataFrame to copy.

    Returns:
        The newly created ``Workbook``.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Data"

    # Header row (worksheet rows and columns are 1-based).
    for col_num, column_name in enumerate(df.columns, 1):
        ws.cell(row=1, column=col_num, value=column_name)

    # Iterate row tuples rather than ``df.values``: the latter converts the
    # whole frame to one common dtype, so e.g. an integer column next to a
    # float column would be written as floats.
    for row_num, row in enumerate(df.itertuples(index=False, name=None), 2):
        for col_num, value in enumerate(row, 1):
            ws.cell(row=row_num, column=col_num, value=value)

    return wb
|
|
|
|
|
|
|
|
def html(self, fnm, chunk_rows=256): |
|
|
def html(self, fnm, chunk_rows=256): |
|
|
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm |
|
|
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm |
|
|
|
|
|
|
|
|
tb += f"<table><caption>{sheetname}</caption>" |
|
|
tb += f"<table><caption>{sheetname}</caption>" |
|
|
tb += tb_rows_0 |
|
|
tb += tb_rows_0 |
|
|
for r in list( |
|
|
for r in list( |
|
|
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows] |
|
|
|
|
|
|
|
|
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows] |
|
|
): |
|
|
): |
|
|
tb += "<tr>" |
|
|
tb += "<tr>" |
|
|
for i, c in enumerate(r): |
|
|
for i, c in enumerate(r): |
|
|
|
|
|
|
|
|
# Script entry point: parse the file whose path is the first CLI argument.
if __name__ == "__main__":
    psr = RAGFlowExcelParser()
    psr(sys.argv[1])
|
|
|
|
|
|