Pārlūkot izejas kodu

Split Excel file into different chunks (#847)

### What problem does this PR solve?


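Split Excel sheets into multiple chunks: the parser now returns a list of HTML tables, each holding up to `chunk_rows` data rows (default 256) and repeating the sheet's header row.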
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
tags/v0.6.0
GYH pirms 1 gada
vecāks
revīzija
c27c02ea67
Revīzijas autora e-pasta adrese nav piesaistīta nevienam kontam
3 mainītis faili ar 27 papildinājumiem un 18 dzēšanām
  1. 25
    16
      deepdoc/parser/excel_parser.py
  2. 1
    1
      rag/app/naive.py
  3. 1
    1
      rag/app/one.py

+ 25
- 16
deepdoc/parser/excel_parser.py Parādīt failu





class RAGFlowExcelParser: class RAGFlowExcelParser:
def html(self, fnm):
def html(self, fnm,chunk_rows=256):
if isinstance(fnm, str): if isinstance(fnm, str):
wb = load_workbook(fnm) wb = load_workbook(fnm)
else: else:
wb = load_workbook(BytesIO(fnm)) wb = load_workbook(BytesIO(fnm))
tb = ""

tb_chunks = []
for sheetname in wb.sheetnames: for sheetname in wb.sheetnames:
ws = wb[sheetname] ws = wb[sheetname]
rows = list(ws.rows) rows = list(ws.rows)
if not rows:continue
tb += f"<table><caption>{sheetname}</caption><tr>"
if not rows: continue

tb_rows_0 = "<tr>"
for t in list(rows[0]): for t in list(rows[0]):
tb += f"<th>{t.value}</th>"
tb += "</tr>"
for r in list(rows[1:]):
tb += "<tr>"
for i, c in enumerate(r):
if c.value is None:
tb += "<td></td>"
else:
tb += f"<td>{c.value}</td>"
tb += "</tr>"
tb += "</table>\n"
return tb
tb_rows_0 += f"<th>{t.value}</th>"
tb_rows_0 += "</tr>"

for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
tb = ""
tb += f"<table><caption>{sheetname}</caption>"
tb += tb_rows_0
for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
tb += "<tr>"
for i, c in enumerate(r):
if c.value is None:
tb += "<td></td>"
else:
tb += f"<td>{c.value}</td>"
tb += "</tr>"
tb += "</table>\n"
tb_chunks.append(tb)

return tb_chunks


def __call__(self, fnm): def __call__(self, fnm):
if isinstance(fnm, str): if isinstance(fnm, str):

+ 1
- 1
rag/app/naive.py Parādīt failu

elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
excel_parser = ExcelParser() excel_parser = ExcelParser()
sections = [(excel_parser.html(binary), "")]
sections = [(l, "") for l in excel_parser.html(binary) if l]
elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE): elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")

+ 1
- 1
rag/app/one.py Parādīt failu

elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
excel_parser = ExcelParser() excel_parser = ExcelParser()
sections = [excel_parser.html(binary)]
sections = excel_parser.html(binary , 10000000)
elif re.search(r"\.txt$", filename, re.IGNORECASE): elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")

Notiek ielāde…
Atcelt
Saglabāt