|
|
|
@@ -7,30 +7,39 @@ from rag.nlp import find_codec |
|
|
|
|
|
|
|
|
|
|
|
class RAGFlowExcelParser: |
|
|
|
def html(self, fnm, chunk_rows=256):
    """Render each non-empty worksheet as one or more HTML ``<table>`` strings.

    The first row of a sheet is treated as the header row and is repeated at
    the top of every chunk, so each returned table is readable on its own.

    Args:
        fnm: Path to the workbook (``str``), the raw file content
            (bytes-like, wrapped in ``BytesIO``), or an already-loaded
            workbook object exposing ``sheetnames`` and ``wb[name].rows``.
        chunk_rows: Maximum number of data rows (header excluded) per table.

    Returns:
        list[str]: One ``"<table>...</table>\\n"`` string per chunk, in sheet
        order. Sheets without any rows contribute nothing; a sheet holding
        only a header row yields a single header-only table.

    Raises:
        ValueError: If ``chunk_rows`` is smaller than 1.
    """
    # Imported locally so the method does not rely on a file-level import.
    from html import escape

    # Reject bad sizes up front instead of failing later with a
    # ZeroDivisionError or silently producing nonsense slices.
    if chunk_rows < 1:
        raise ValueError(f"chunk_rows must be >= 1, got {chunk_rows!r}")

    if hasattr(fnm, "sheetnames"):
        # Already a workbook: reuse it rather than parsing the file again.
        wb = fnm
    elif isinstance(fnm, str):
        wb = load_workbook(fnm)
    else:
        wb = load_workbook(BytesIO(fnm))

    def cell_text(value):
        # Empty cells render as empty text (never the literal "None"), and
        # markup characters are escaped so cell content cannot break the table.
        return "" if value is None else escape(str(value), quote=False)

    tb_chunks = []
    for sheetname in wb.sheetnames:
        rows = list(wb[sheetname].rows)
        if not rows:
            continue

        caption = escape(str(sheetname), quote=False)
        header_row = (
            "<tr>"
            + "".join(f"<th>{cell_text(cell.value)}</th>" for cell in rows[0])
            + "</tr>"
        )
        data_rows = rows[1:]

        # One chunk per `chunk_rows` data rows. A header-only sheet still
        # produces exactly one table, but an exact multiple of `chunk_rows`
        # no longer produces a trailing table with no data rows.
        chunk_starts = list(range(0, len(data_rows), chunk_rows)) or [0]

        for start in chunk_starts:
            parts = [f"<table><caption>{caption}</caption>", header_row]
            for row in data_rows[start:start + chunk_rows]:
                cells = "".join(f"<td>{cell_text(cell.value)}</td>" for cell in row)
                parts.append(f"<tr>{cells}</tr>")
            parts.append("</table>\n")
            tb_chunks.append("".join(parts))

    return tb_chunks
|
|
|
|
|
|
|
def __call__(self, fnm): |
|
|
|
if isinstance(fnm, str): |