소스 검색

Refactor: Optimize error handling and support parsing of XLS (Excel 97-2003) files. (#5633)

Optimize error handling and support parsing of XLS (Excel 97-2003) files.
tags/v0.17.1
hy89 8 달 전
부모
커밋
b0c21b00d9
No account linked to committer's email address
2개의 파일 변경: 27줄 추가, 71줄 삭제
  1. deepdoc/parser/excel_parser.py (+24, -68)
  2. rag/app/table.py (+3, -3)

+ 24
- 68
deepdoc/parser/excel_parser.py 파일 보기

@@ -11,6 +11,7 @@
# limitations under the License.
#

import logging
from openpyxl import load_workbook, Workbook
import sys
from io import BytesIO
@@ -21,42 +22,29 @@ import pandas as pd


class RAGFlowExcelParser:
def html(self, fnm, chunk_rows=256):

# if isinstance(fnm, str):
# wb = load_workbook(fnm)
# else:
# wb = load_workbook(BytesIO(fnm))++

s_fnm = fnm
if not isinstance(fnm, str):
s_fnm = BytesIO(fnm)
else:
pass

@staticmethod
def _load_excel_to_workbook(file_like_object):
try:
wb = load_workbook(s_fnm)
return load_workbook(file_like_object)
except Exception as e:
print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files')
df = pd.read_excel(s_fnm)
wb = Workbook()
# if len(wb.worksheets) > 0:
# del wb.worksheets[0]
# else: pass
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
else:
pass
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
else:
pass
else:
pass
logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
try:
df = pd.read_excel(file_like_object)
wb = Workbook()
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
return wb
except Exception as e_pandas:
raise Exception(f"****wxy: pandas read error: {e_pandas}, original openpyxl error: {e}")

def html(self, fnm, chunk_rows=256):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
tb_chunks = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
@@ -89,40 +77,8 @@ class RAGFlowExcelParser:
return tb_chunks

def __call__(self, fnm):
# if isinstance(fnm, str):
# wb = load_workbook(fnm)
# else:
# wb = load_workbook(BytesIO(fnm))

s_fnm = fnm
if not isinstance(fnm, str):
s_fnm = BytesIO(fnm)
else:
pass

try:
wb = load_workbook(s_fnm)
except Exception as e:
print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files')
df = pd.read_excel(s_fnm)
wb = Workbook()
if len(wb.worksheets) > 0:
del wb.worksheets[0]
else:
pass
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
else:
pass
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
else:
pass
else:
pass
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)

res = []
for sheetname in wb.sheetnames:
@@ -148,7 +104,7 @@ class RAGFlowExcelParser:
@staticmethod
def row_number(fnm, binary):
if fnm.split(".")[-1].lower().find("xls") >= 0:
wb = load_workbook(BytesIO(binary))
wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
ws = wb[sheetname]

+ 3
- 3
rag/app/table.py 파일 보기

@@ -20,7 +20,7 @@ from io import BytesIO
from xpinyin import Pinyin
import numpy as np
import pandas as pd
from openpyxl import load_workbook
# from openpyxl import load_workbook, Workbook
from dateutil.parser import parse as datetime_parse

from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -33,9 +33,9 @@ class Excel(ExcelParser):
def __call__(self, fnm, binary=None, from_page=0,
to_page=10000000000, callback=None):
if not binary:
wb = load_workbook(fnm)
wb = Excel._load_excel_to_workbook(fnm)
else:
wb = load_workbook(BytesIO(binary))
wb = Excel._load_excel_to_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
total += len(list(wb[sheetname].rows))

Loading…
취소
저장