### What problem does this PR solve?

Split Excel sheets into multiple chunks (by default 256 data rows per chunk).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

1 vuosi sitten · c27c02ea67
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -7,30 +7,39 @@ from rag.nlp import find_codec


 class RAGFlowExcelParser:
    def html(self, fnm, chunk_rows=256):
        """Render every non-empty worksheet as one or more HTML tables.

        The first row of each sheet is emitted as the ``<th>`` header row and
        is repeated at the top of every chunk produced for that sheet. The
        remaining rows are emitted as ``<td>`` rows, at most ``chunk_rows``
        per table. A sheet that only has a header row still yields a single
        header-only table; sheets without any rows are skipped.

        Args:
            fnm: A path string, passed to ``load_workbook`` as is; any other
                value is wrapped in ``BytesIO`` before loading.
            chunk_rows: Maximum number of data rows per emitted table.

        Returns:
            A list of HTML strings, one ``<table>`` element (followed by a
            newline) per chunk, captioned with the sheet name.

        Raises:
            ValueError: If ``chunk_rows`` is smaller than 1.
        """
        if chunk_rows < 1:
            raise ValueError("chunk_rows must be a positive integer")

        if isinstance(fnm, str):
            wb = load_workbook(fnm)
        else:
            wb = load_workbook(BytesIO(fnm))

        tb_chunks = []
        for sheetname in wb.sheetnames:
            rows = list(wb[sheetname].rows)
            if not rows:
                continue

            # Header cells are rendered verbatim (an empty header cell shows
            # up as "None"), matching the previous output.
            header = "<tr>" + "".join(f"<th>{t.value}</th>" for t in rows[0]) + "</tr>"
            data_rows = rows[1:]

            # Ceiling division: an exact multiple of chunk_rows must not
            # produce a trailing table that contains nothing but the header.
            # max(1, ...) keeps the single table for header-only sheets.
            n_chunks = max(1, -(-len(data_rows) // chunk_rows))

            for chunk_i in range(n_chunks):
                start = chunk_i * chunk_rows
                parts = [f"<table><caption>{sheetname}</caption>", header]
                for r in data_rows[start:start + chunk_rows]:
                    parts.append("<tr>")
                    parts.extend(
                        "<td></td>" if c.value is None else f"<td>{c.value}</td>"
                        for c in r
                    )
                    parts.append("</tr>")
                parts.append("</table>\n")
                tb_chunks.append("".join(parts))

        return tb_chunks

    def __call__(self, fnm):
        if isinstance(fnm, str):
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -134,7 +134,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        sections = [(excel_parser.html(binary), "")]
        sections = [(l, "") for l in excel_parser.html(binary) if l]

    elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
--- a/rag/app/one.py
+++ b/rag/app/one.py
@@ -78,7 +78,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        sections = [excel_parser.html(binary)]
        sections = excel_parser.html(binary , 10000000)

    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")