Parcourir la source

Feat: add CSV file parsing support (#5989)

### What problem does this PR solve?

Add CSV file parsing support #4552, #5849, #5870

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
tags/v0.17.2
Yongteng Lei il y a 7 mois
Parent
révision
7cd37c37cd
Aucun compte lié à l'adresse e-mail de l'auteur
2 fichiers modifiés avec 43 ajouts et 18 suppressions
  1. 42
    15
      deepdoc/parser/excel_parser.py
  2. 1
    3
      rag/app/naive.py

+ 42
- 15
deepdoc/parser/excel_parser.py Voir le fichier

@@ -12,35 +12,63 @@
#

import logging
from openpyxl import load_workbook, Workbook
import sys
from io import BytesIO

from rag.nlp import find_codec

import pandas as pd
from openpyxl import Workbook, load_workbook

from rag.nlp import find_codec


class RAGFlowExcelParser:

@staticmethod
def _load_excel_to_workbook(file_like_object):
    """Load Excel or CSV input into an openpyxl ``Workbook``.

    The first four bytes are sniffed to tell Excel containers apart from
    everything else: ``PK\\x03\\x04`` (zip container) or
    ``\\xD0\\xCF\\x11\\xE0`` (OLE2 compound file) is treated as Excel; any
    other content is parsed as CSV with pandas and converted to a workbook.

    Args:
        file_like_object: Raw ``bytes``/``bytearray``, a seekable binary
            file-like object, or a ``str`` filesystem path.

    Returns:
        An openpyxl ``Workbook``.

    Raises:
        Exception: If the content cannot be parsed as CSV, or if both
            openpyxl and ``pandas.read_excel`` fail on Excel content.
    """
    if isinstance(file_like_object, str):
        # A path cannot be sniffed with seek()/read(); load it into memory
        # so the detection logic below works the same for every input kind.
        with open(file_like_object, "rb") as source:
            file_like_object = BytesIO(source.read())
    elif isinstance(file_like_object, (bytes, bytearray)):
        file_like_object = BytesIO(file_like_object)

    # Read the first 4 bytes to determine the file type, then rewind so the
    # actual parser sees the stream from the beginning.
    file_like_object.seek(0)
    file_head = file_like_object.read(4)
    file_like_object.seek(0)

    if not file_head.startswith((b'PK\x03\x04', b'\xD0\xCF\x11\xE0')):
        logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook")
        try:
            df = pd.read_csv(file_like_object)
            return RAGFlowExcelParser._dataframe_to_workbook(df)
        except Exception as e_csv:
            raise Exception(
                f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}"
            ) from e_csv

    try:
        return load_workbook(file_like_object)
    except Exception as e:
        logging.info("****wxy: openpyxl load error: %s, try pandas instead", e)
        try:
            # openpyxl may have consumed part of the stream before failing.
            file_like_object.seek(0)
            df = pd.read_excel(file_like_object)
            return RAGFlowExcelParser._dataframe_to_workbook(df)
        except Exception as e_pandas:
            raise Exception(
                f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}"
            ) from e_pandas

@staticmethod
def _dataframe_to_workbook(df):
    """Copy a DataFrame into a new single-sheet openpyxl ``Workbook``.

    The active sheet is titled ``"Data"``. Row 1 receives the entries of
    ``df.columns``; each row of ``df.values`` is written below it, starting
    at row 2, in the same column order.

    Args:
        df: Object exposing ``columns`` and ``values`` (the callers in this
            class pass the result of ``pd.read_csv`` / ``pd.read_excel``).

    Returns:
        The newly created ``Workbook``.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "Data"

    # Header row.
    for col_idx, header in enumerate(df.columns, start=1):
        sheet.cell(row=1, column=col_idx, value=header)

    # Data rows; worksheet rows are 1-based and row 1 holds the header.
    for row_idx, record in enumerate(df.values, start=2):
        for col_idx, cell_value in enumerate(record, start=1):
            sheet.cell(row=row_idx, column=col_idx, value=cell_value)

    return workbook

def html(self, fnm, chunk_rows=256):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
@@ -62,7 +90,7 @@ class RAGFlowExcelParser:
tb += f"<table><caption>{sheetname}</caption>"
tb += tb_rows_0
for r in list(
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
):
tb += "<tr>"
for i, c in enumerate(r):
@@ -120,4 +148,3 @@ class RAGFlowExcelParser:
if __name__ == "__main__":
    # Ad-hoc command-line entry point: run the parser on the path given as
    # the first argument.
    parser = RAGFlowExcelParser()
    parser(sys.argv[1])


+ 1
- 3
rag/app/naive.py Voir le fichier

@@ -240,7 +240,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback=callback)
res = tokenize_table(tables, doc, is_english)

elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
if parser_config.get("html4excel"):
@@ -307,9 +307,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        """No-op progress callback; ignores both arguments."""
        return None

    # Ad-hoc run: chunk the file given on the command line, pages 0-10 only.
    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

Chargement…
Annuler
Enregistrer