
Display only the duplicate column names and corresponding original source. (#8138)

### What problem does this PR solve?
This PR aims to solve #8120, which requests a better error display for duplicate column names.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Tag: v0.19.1
HaiyangP committed 4 months ago · parent commit baf32ee461
1 file changed with 34 additions and 64 deletions

rag/app/table.py

```diff
@@ -20,6 +20,8 @@ from io import BytesIO
 from xpinyin import Pinyin
 import numpy as np
 import pandas as pd
+from collections import Counter
+
 # from openpyxl import load_workbook, Workbook
 from dateutil.parser import parse as datetime_parse
```

```diff
@@ -30,8 +32,7 @@ from deepdoc.parser import ExcelParser
 
 
 class Excel(ExcelParser):
-    def __call__(self, fnm, binary=None, from_page=0,
-                 to_page=10000000000, callback=None):
+    def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
         if not binary:
             wb = Excel._load_excel_to_workbook(fnm)
         else:
```

```diff
@@ -49,10 +50,7 @@ class Excel(ExcelParser):
                 continue
             headers = [cell.value for cell in rows[0]]
             missed = set([i for i, h in enumerate(headers) if h is None])
-            headers = [
-                cell.value for i,
-                cell in enumerate(
-                    rows[0]) if i not in missed]
+            headers = [cell.value for i, cell in enumerate(rows[0]) if i not in missed]
             if not headers:
                 continue
             data = []
```

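Review note: the rewritten comprehension keeps only columns whose header cell is non-empty; the matching filter in the next hunk drops the same positions from every data row. A minimal standalone sketch of that filtering, with plain lists standing in for openpyxl cells and invented sample values:

```python
# Standalone sketch of the header/row filtering in Excel.__call__.
# Plain lists stand in for openpyxl cell objects; values are invented.
header_cells = ["name", None, "age"]   # row 0 of the sheet
data_row = ["Alice", "ignored", 30]    # a later row

missed = {i for i, h in enumerate(header_cells) if h is None}
headers = [h for i, h in enumerate(header_cells) if i not in missed]
row = [v for i, v in enumerate(data_row) if i not in missed]

print(headers)  # ['name', 'age']
print(row)      # ['Alice', 30]
assert len(row) == len(headers)  # mismatched rows are recorded in `fails`
```
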
```diff
@@ -62,9 +60,7 @@ class Excel(ExcelParser):
                     continue
                 if rn - 1 >= to_page:
                     break
-                row = [
-                    cell.value for ii,
-                    cell in enumerate(r) if ii not in missed]
+                row = [cell.value for ii, cell in enumerate(r) if ii not in missed]
                 if len(row) != len(headers):
                     fails.append(str(i))
                     continue
```

```diff
@@ -74,8 +70,7 @@ class Excel(ExcelParser):
                 continue
             res.append(pd.DataFrame(np.array(data), columns=headers))
 
-        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
-            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         return res
```

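For reference, this is roughly the progress string the callback receives (all values invented; the callback itself is supplied by the caller). Note that the source inserts no separator between the record range and the failure count:

```python
# Rough illustration of the progress message built above; values invented.
from_page, to_page, rn = 0, 10000000000, 120
fails = ["7", "42", "99", "113"]

msg = "Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")
print(msg)  # Extract records: 1~1204 failure, line: 7,42,99...
# (the "120" and "4" run together because no separator is inserted)
```
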


```diff
@@ -87,8 +82,7 @@ def trans_datatime(s):
 
 
 def trans_bool(s):
-    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
-                str(s).strip(), flags=re.IGNORECASE):
+    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
         return "yes"
     if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
         return "no"
```

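As a sanity check of the joined regex, here is `trans_bool` exercised standalone (the body is copied from the hunk; `None` is the implicit return for values that match neither pattern):

```python
import re

def trans_bool(s):
    # Copied from rag/app/table.py: normalizes multilingual truthy/falsy markers.
    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
        return "yes"
    if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
        return "no"

print(trans_bool("TRUE"))   # yes
print(trans_bool(" 否 "))   # no
print(trans_bool("maybe"))  # None -> counted as non-bool by column_data_type
```
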
```diff
@@ -97,8 +91,7 @@ def trans_bool(s):
 def column_data_type(arr):
     arr = list(arr)
     counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
-    trans = {t: f for f, t in
-             [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
+    trans = {t: f for f, t in [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
     for a in arr:
         if a is None:
             continue
```

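The inverted comprehension builds a label-to-caster lookup, so once `column_data_type` has tallied a majority type it can convert every value in the column with a single dict access. A reduced sketch of that mechanism (only three casters shown; data invented):

```python
# Reduced sketch of the label -> caster table built by the inverted comprehension.
trans = {t: f for f, t in [(int, "int"), (float, "float"), (str, "text")]}

column = ["1", "2", "3"]
ty = "int"  # in the source, ty is the majority type tallied in `counts`
print([trans[ty](v) for v in column])  # [1, 2, 3]
```
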
```diff
@@ -127,31 +120,25 @@ def column_data_type(arr):
     return arr, ty
 
 
-def chunk(filename, binary=None, from_page=0, to_page=10000000000,
-          lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
     """
-        Excel and csv(txt) format files are supported.
-        For csv or txt file, the delimiter between columns is TAB.
-        The first line must be column headers.
-        Column headers must be meaningful terms in order to make our NLP model understand.
-        It's good to enumerate some synonyms using slash '/' to separate, and even better to
-        enumerate values using brackets like 'gender/sex(male, female)'.
-        Here are some examples for headers:
-            1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
-            2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)
-        Every row in the table will be treated as a chunk.
+    Excel and csv(txt) format files are supported.
+    For csv or txt file, the delimiter between columns is TAB.
+    The first line must be column headers.
+    Column headers must be meaningful terms in order to make our NLP model understand.
+    It's good to enumerate some synonyms using slash '/' to separate, and even better to
+    enumerate values using brackets like 'gender/sex(male, female)'.
+    Here are some examples for headers:
+        1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
+        2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)
+    Every row in the table will be treated as a chunk.
     """
 
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
-        dfs = excel_parser(
-            filename,
-            binary,
-            from_page=from_page,
-            to_page=to_page,
-            callback=callback)
+        dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = get_text(filename, binary)
```

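To make the documented header contract concrete, here is a self-contained sketch of TAB-delimited parsing as the docstring describes it (first line is headers, each later row becomes one chunk); the file content is invented and none of ragflow's own helpers are used:

```python
# Minimal TSV parsing per the documented contract: TAB delimiter, headers first.
txt = ("supplier/vendor\tcolor(yellow, red, brown)\tsize(M,L,XL,XXL)\n"
       "ACME\tred\tXL\n")

lines = txt.strip().split("\n")
headers = lines[0].split("\t")
rows = [line.split("\t") for line in lines[1:]]
print(headers)  # ['supplier/vendor', 'color(yellow, red, brown)', 'size(M,L,XL,XXL)']
print(rows)     # [['ACME', 'red', 'XL']]
```
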
```diff
@@ -170,40 +157,29 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                 continue
             rows.append(row)
 
-        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (
-            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
 
         dfs = [pd.DataFrame(np.array(rows), columns=headers)]
 
     else:
-        raise NotImplementedError(
-            "file type not supported yet(excel, text, csv supported)")
+        raise NotImplementedError("file type not supported yet(excel, text, csv supported)")
 
     res = []
     PY = Pinyin()
-    fieds_map = {
-        "text": "_tks",
-        "int": "_long",
-        "keyword": "_kwd",
-        "float": "_flt",
-        "datetime": "_dt",
-        "bool": "_kwd"}
+    fieds_map = {"text": "_tks", "int": "_long", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
     for df in dfs:
         for n in ["id", "_id", "index", "idx"]:
             if n in df.columns:
                 del df[n]
         clmns = df.columns.values
-        if len(clmns) != len(set(clmns)):
-            duplicates = [col for col in clmns if list(clmns).count(col) > 1]
-            raise ValueError(f"Duplicate column names detected: {set(duplicates)}")
+        col_counts = Counter(clmns)
+        duplicates = [col for col, count in col_counts.items() if count > 1]
+        if duplicates:
+            raise ValueError(f"Duplicate column names detected: {duplicates}\nFrom: {clmns}")
 
         txts = list(copy.deepcopy(clmns))
-        py_clmns = [
-            PY.get_pinyins(
-                re.sub(
-                    r"(/.*|（[^（）]+?）|\([^()]+?\))",
-                    "",
-                    str(n)),
-                '_')[0] for n in clmns]
+        py_clmns = [PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", str(n)), "_")[0] for n in clmns]
         clmn_tys = []
         for j in range(len(clmns)):
             cln, ty = column_data_type(df[clmns[j]])
```

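This hunk is the fix itself: the old check reported a bare `set` of duplicated names, while the new code uses `Counter` to list each duplicate once and appends the full original column list, which is what #8120 asked for. A standalone before/after comparison with invented column names:

```python
from collections import Counter

clmns = ["name", "age", "name", "city", "age"]

# Old behavior: a set of every repeated name, with no source context.
duplicates_old = [col for col in clmns if list(clmns).count(col) > 1]
print(f"Duplicate column names detected: {set(duplicates_old)}")
# Duplicate column names detected: {'age', 'name'}  (set order may vary)

# New behavior: each duplicate listed once, plus the original columns.
col_counts = Counter(clmns)
duplicates = [col for col, count in col_counts.items() if count > 1]
if duplicates:
    print(f"Duplicate column names detected: {duplicates}\nFrom: {clmns}")
# Duplicate column names detected: ['name', 'age']
# From: ['name', 'age', 'name', 'city', 'age']
```
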
```diff
@@ -211,15 +187,11 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
             df[clmns[j]] = cln
             if ty == "text":
                 txts.extend([str(c) for c in cln if c])
-        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " "))
-                     for i in range(len(clmns))]
+        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in range(len(clmns))]
 
         eng = lang.lower() == "english"  # is_english(txts)
         for ii, row in df.iterrows():
-            d = {
-                "docnm_kwd": filename,
-                "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-            }
+            d = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
             row_txt = []
             for j in range(len(clmns)):
                 if row[clmns[j]] is None:
```

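Each surviving column becomes an index field named by the header's pinyin plus a type suffix from `fieds_map`. A sketch of that naming for one header, assuming `xpinyin` is installed (the header value is invented, and the exact romanization depends on xpinyin's data):

```python
import re
from xpinyin import Pinyin

PY = Pinyin()
fieds_map = {"text": "_tks", "int": "_long", "keyword": "_kwd",
             "float": "_flt", "datetime": "_dt", "bool": "_kwd"}

header, ty = "姓名/名字", "text"
# Strip the '/synonym' tail and any parenthesized value list before romanizing.
base = PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", header), "_")[0]
print(base.lower() + fieds_map[ty])  # e.g. xing_ming_tks
```
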
```diff
@@ -229,16 +201,14 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                 if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
                     continue
                 fld = clmns_map[j][0]
-                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
-                    row[clmns[j]])
+                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(row[clmns[j]])
                 row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
             if not row_txt:
                 continue
             tokenize(d, "; ".join(row_txt), eng)
             res.append(d)
 
-    KnowledgebaseService.update_parser_config(
-        kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
+    KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
     callback(0.35, "")
 
     return res
```
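Finally, `clmns_map` is persisted as the knowledge base's `field_map`, pairing each generated field name with a human-readable label. The payload has roughly this shape (pairs invented; the actual service call needs a running ragflow backend):

```python
# Illustrative shape of the parser-config payload assembled from clmns_map.
clmns_map = [("xing_ming_tks", "姓名/名字"), ("nian_ling_long", "年龄")]
payload = {"field_map": {k: v for k, v in clmns_map}}
print(payload)
# {'field_map': {'xing_ming_tks': '姓名/名字', 'nian_ling_long': '年龄'}}
```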
