|  |  | @@ -12,35 +12,63 @@ | 
		
	
		
			
			|  |  |  | # | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | import logging | 
		
	
		
			
			|  |  |  | from openpyxl import load_workbook, Workbook | 
		
	
		
			
			|  |  |  | import sys | 
		
	
		
			
			|  |  |  | from io import BytesIO | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | from rag.nlp import find_codec | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | import pandas as pd | 
		
	
		
			
			|  |  |  | from openpyxl import Workbook, load_workbook | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | from rag.nlp import find_codec | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | class RAGFlowExcelParser: | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | @staticmethod | 
		
	
		
			
			|  |  |  | def _load_excel_to_workbook(file_like_object): | 
		
	
		
			
			|  |  |  | if isinstance(file_like_object, bytes): | 
		
	
		
			
			|  |  |  | file_like_object = BytesIO(file_like_object) | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | # Read first 4 bytes to determine file type | 
		
	
		
			
			|  |  |  | file_like_object.seek(0) | 
		
	
		
			
			|  |  |  | file_head = file_like_object.read(4) | 
		
	
		
			
			|  |  |  | file_like_object.seek(0) | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')): | 
		
	
		
			
			|  |  |  | logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook") | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | try: | 
		
	
		
			
			|  |  |  | file_like_object.seek(0) | 
		
	
		
			
			|  |  |  | df = pd.read_csv(file_like_object) | 
		
	
		
			
			|  |  |  | return RAGFlowExcelParser._dataframe_to_workbook(df) | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | except Exception as e_csv: | 
		
	
		
			
			|  |  |  | raise Exception(f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}") | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | try: | 
		
	
		
			
			|  |  |  | return load_workbook(file_like_object) | 
		
	
		
			
			|  |  |  | except Exception as e: | 
		
	
		
			
			|  |  |  | logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead") | 
		
	
		
			
			|  |  |  | try: | 
		
	
		
			
			|  |  |  | file_like_object.seek(0) | 
		
	
		
			
			|  |  |  | df = pd.read_excel(file_like_object) | 
		
	
		
			
			|  |  |  | wb = Workbook() | 
		
	
		
			
			|  |  |  | ws = wb.active | 
		
	
		
			
			|  |  |  | ws.title = "Data" | 
		
	
		
			
			|  |  |  | for col_num, column_name in enumerate(df.columns, 1): | 
		
	
		
			
			|  |  |  | ws.cell(row=1, column=col_num, value=column_name) | 
		
	
		
			
			|  |  |  | for row_num, row in enumerate(df.values, 2): | 
		
	
		
			
			|  |  |  | for col_num, value in enumerate(row, 1): | 
		
	
		
			
			|  |  |  | ws.cell(row=row_num, column=col_num, value=value) | 
		
	
		
			
			|  |  |  | return wb | 
		
	
		
			
			|  |  |  | return RAGFlowExcelParser._dataframe_to_workbook(df) | 
		
	
		
			
			|  |  |  | except Exception as e_pandas: | 
		
	
		
			
			|  |  |  | raise Exception(f"****wxy: pandas read error: {e_pandas}, original openpyxl error: {e}") | 
		
	
		
			
			|  |  |  | raise Exception(f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | @staticmethod | 
		
	
		
			
			|  |  |  | def _dataframe_to_workbook(df): | 
		
	
		
			
			|  |  |  | wb = Workbook() | 
		
	
		
			
			|  |  |  | ws = wb.active | 
		
	
		
			
			|  |  |  | ws.title = "Data" | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | for col_num, column_name in enumerate(df.columns, 1): | 
		
	
		
			
			|  |  |  | ws.cell(row=1, column=col_num, value=column_name) | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | for row_num, row in enumerate(df.values, 2): | 
		
	
		
			
			|  |  |  | for col_num, value in enumerate(row, 1): | 
		
	
		
			
			|  |  |  | ws.cell(row=row_num, column=col_num, value=value) | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | return wb | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | def html(self, fnm, chunk_rows=256): | 
		
	
		
			
			|  |  |  | file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm | 
		
	
	
		
			
			|  |  | @@ -62,7 +90,7 @@ class RAGFlowExcelParser: | 
		
	
		
			
			|  |  |  | tb += f"<table><caption>{sheetname}</caption>" | 
		
	
		
			
			|  |  |  | tb += tb_rows_0 | 
		
	
		
			
			|  |  |  | for r in list( | 
		
	
		
			
			|  |  |  | rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows] | 
		
	
		
			
			|  |  |  | rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows] | 
		
	
		
			
			|  |  |  | ): | 
		
	
		
			
			|  |  |  | tb += "<tr>" | 
		
	
		
			
			|  |  |  | for i, c in enumerate(r): | 
		
	
	
		
			
			|  |  | @@ -120,4 +148,3 @@ class RAGFlowExcelParser: | 
		
	
		
			
			|  |  |  | if __name__ == "__main__": | 
		
	
		
			
			|  |  |  | psr = RAGFlowExcelParser() | 
		
	
		
			
			|  |  |  | psr(sys.argv[1]) | 
		
	
		
			
			|  |  |  | 
 |