Browse Source

Refactor: Optimize error handling and support parsing of XLS (Excel 97–2003) files. (#5633)

Optimize error handling and support parsing of XLS (Excel 97–2003) files.
tags/v0.17.1
hy89 8 months ago
parent
commit
b0c21b00d9
No account linked to committer's email address
2 changed files with 27 additions and 71 deletions
  1. 24
    68
      deepdoc/parser/excel_parser.py
  2. 3
    3
      rag/app/table.py

+ 24
- 68
deepdoc/parser/excel_parser.py View File

# limitations under the License. # limitations under the License.
# #


import logging
from openpyxl import load_workbook, Workbook from openpyxl import load_workbook, Workbook
import sys import sys
from io import BytesIO from io import BytesIO




class RAGFlowExcelParser: class RAGFlowExcelParser:
def html(self, fnm, chunk_rows=256):

# if isinstance(fnm, str):
# wb = load_workbook(fnm)
# else:
# wb = load_workbook(BytesIO(fnm))++

s_fnm = fnm
if not isinstance(fnm, str):
s_fnm = BytesIO(fnm)
else:
pass

@staticmethod
def _load_excel_to_workbook(file_like_object):
try: try:
wb = load_workbook(s_fnm)
return load_workbook(file_like_object)
except Exception as e: except Exception as e:
print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files')
df = pd.read_excel(s_fnm)
wb = Workbook()
# if len(wb.worksheets) > 0:
# del wb.worksheets[0]
# else: pass
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
else:
pass
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
else:
pass
else:
pass
logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
try:
df = pd.read_excel(file_like_object)
wb = Workbook()
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
return wb
except Exception as e_pandas:
raise Exception(f"****wxy: pandas read error: {e_pandas}, original openpyxl error: {e}")


def html(self, fnm, chunk_rows=256):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
tb_chunks = [] tb_chunks = []
for sheetname in wb.sheetnames: for sheetname in wb.sheetnames:
ws = wb[sheetname] ws = wb[sheetname]
return tb_chunks return tb_chunks


def __call__(self, fnm): def __call__(self, fnm):
# if isinstance(fnm, str):
# wb = load_workbook(fnm)
# else:
# wb = load_workbook(BytesIO(fnm))

s_fnm = fnm
if not isinstance(fnm, str):
s_fnm = BytesIO(fnm)
else:
pass

try:
wb = load_workbook(s_fnm)
except Exception as e:
print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files')
df = pd.read_excel(s_fnm)
wb = Workbook()
if len(wb.worksheets) > 0:
del wb.worksheets[0]
else:
pass
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
else:
pass
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
else:
pass
else:
pass
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)


res = [] res = []
for sheetname in wb.sheetnames: for sheetname in wb.sheetnames:
@staticmethod @staticmethod
def row_number(fnm, binary): def row_number(fnm, binary):
if fnm.split(".")[-1].lower().find("xls") >= 0: if fnm.split(".")[-1].lower().find("xls") >= 0:
wb = load_workbook(BytesIO(binary))
wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
total = 0 total = 0
for sheetname in wb.sheetnames: for sheetname in wb.sheetnames:
ws = wb[sheetname] ws = wb[sheetname]

+ 3
- 3
rag/app/table.py View File

from xpinyin import Pinyin from xpinyin import Pinyin
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from openpyxl import load_workbook
# from openpyxl import load_workbook, Workbook
from dateutil.parser import parse as datetime_parse from dateutil.parser import parse as datetime_parse


from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.knowledgebase_service import KnowledgebaseService
def __call__(self, fnm, binary=None, from_page=0, def __call__(self, fnm, binary=None, from_page=0,
to_page=10000000000, callback=None): to_page=10000000000, callback=None):
if not binary: if not binary:
wb = load_workbook(fnm)
wb = Excel._load_excel_to_workbook(fnm)
else: else:
wb = load_workbook(BytesIO(binary))
wb = Excel._load_excel_to_workbook(BytesIO(binary))
total = 0 total = 0
for sheetname in wb.sheetnames: for sheetname in wb.sheetnames:
total += len(list(wb[sheetname].rows)) total += len(list(wb[sheetname].rows))

Loading…
Cancel
Save