
Display only the duplicate column names and corresponding original source. (#8138)

### What problem does this PR solve?
This PR aims to solve #8120, which requests a better error display for
duplicate column names; a minimal before/after sketch is included below.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
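
For context, here is the change in isolation. This is a minimal standalone sketch, not the patched function: `clmns` is a hypothetical header row, and only the names `col_counts` and `duplicates` and the error message format come from the patch itself.

```python
from collections import Counter

# Hypothetical header row with two duplicated names.
clmns = ["name", "phone", "phone", "email", "email"]

# Old behavior: quadratic membership counting, and the error showed only
# an unordered set of the duplicated names.
old_dups = [col for col in clmns if clmns.count(col) > 1]
print(f"Duplicate column names detected: {set(old_dups)}")
# -> Duplicate column names detected: {'email', 'phone'}  (order varies)

# New behavior: one Counter pass; each duplicate is reported exactly once,
# and the original header row is echoed so the user can locate the columns.
col_counts = Counter(clmns)
duplicates = [col for col, count in col_counts.items() if count > 1]
if duplicates:
    print(f"Duplicate column names detected: {duplicates}\nFrom: {clmns}")
# -> Duplicate column names detected: ['phone', 'email']
#    From: ['name', 'phone', 'phone', 'email', 'email']
```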
tags/v0.19.1
HaiyangP, 4 months ago
Commit baf32ee461 (no account is linked to the committer's email)
1 file changed, 34 insertions, 64 deletions
rag/app/table.py — 34 insertions(+), 64 deletions(-):

```diff
 from xpinyin import Pinyin
 import numpy as np
 import pandas as pd
+from collections import Counter
 
 # from openpyxl import load_workbook, Workbook
 from dateutil.parser import parse as datetime_parse
@@ ... @@
 class Excel(ExcelParser):
-    def __call__(self, fnm, binary=None, from_page=0,
-                 to_page=10000000000, callback=None):
+    def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
         if not binary:
             wb = Excel._load_excel_to_workbook(fnm)
         else:
@@ ... @@
                 continue
             headers = [cell.value for cell in rows[0]]
             missed = set([i for i, h in enumerate(headers) if h is None])
-            headers = [
-                cell.value for i,
-                cell in enumerate(
-                    rows[0]) if i not in missed]
+            headers = [cell.value for i, cell in enumerate(rows[0]) if i not in missed]
             if not headers:
                 continue
             data = []
@@ ... @@
                     continue
                 if rn - 1 >= to_page:
                     break
-                row = [
-                    cell.value for ii,
-                    cell in enumerate(r) if ii not in missed]
+                row = [cell.value for ii, cell in enumerate(r) if ii not in missed]
                 if len(row) != len(headers):
                     fails.append(str(i))
                     continue
@@ ... @@
                 continue
             res.append(pd.DataFrame(np.array(data), columns=headers))
 
-        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
-            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         return res
@@ ... @@
 def trans_bool(s):
-    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
-                str(s).strip(), flags=re.IGNORECASE):
+    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
         return "yes"
     if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
         return "no"
@@ ... @@
 def column_data_type(arr):
     arr = list(arr)
     counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
-    trans = {t: f for f, t in
-             [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
+    trans = {t: f for f, t in [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
     for a in arr:
         if a is None:
             continue
@@ ... @@
     return arr, ty
 
 
-def chunk(filename, binary=None, from_page=0, to_page=10000000000,
-          lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
     """
     Excel and csv(txt) format files are supported.
     For csv or txt file, the delimiter between columns is TAB.
     The first line must be column headers.
     Column headers must be meaningful terms inorder to make our NLP model understanding.
     It's good to enumerate some synonyms using slash '/' to separate, and even better to
     enumerate values using brackets like 'gender/sex(male, female)'.
     Here are some examples for headers:
         1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
         2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)
     Every row in table will be treated as a chunk.
     """
 
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
-        dfs = excel_parser(
-            filename,
-            binary,
-            from_page=from_page,
-            to_page=to_page,
-            callback=callback)
+        dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = get_text(filename, binary)
@@ ... @@
                 continue
             rows.append(row)
 
-        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (
-            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
+        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
 
         dfs = [pd.DataFrame(np.array(rows), columns=headers)]
 
     else:
-        raise NotImplementedError(
-            "file type not supported yet(excel, text, csv supported)")
+        raise NotImplementedError("file type not supported yet(excel, text, csv supported)")
 
     res = []
     PY = Pinyin()
-    fieds_map = {
-        "text": "_tks",
-        "int": "_long",
-        "keyword": "_kwd",
-        "float": "_flt",
-        "datetime": "_dt",
-        "bool": "_kwd"}
+    fieds_map = {"text": "_tks", "int": "_long", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
     for df in dfs:
         for n in ["id", "_id", "index", "idx"]:
             if n in df.columns:
                 del df[n]
         clmns = df.columns.values
         if len(clmns) != len(set(clmns)):
-            duplicates = [col for col in clmns if list(clmns).count(col) > 1]
-            raise ValueError(f"Duplicate column names detected: {set(duplicates)}")
+            col_counts = Counter(clmns)
+            duplicates = [col for col, count in col_counts.items() if count > 1]
+            if duplicates:
+                raise ValueError(f"Duplicate column names detected: {duplicates}\nFrom: {clmns}")
 
         txts = list(copy.deepcopy(clmns))
-        py_clmns = [
-            PY.get_pinyins(
-                re.sub(
-                    r"(/.*|([^()]+?)|\([^()]+?\))",
-                    "",
-                    str(n)),
-                '_')[0] for n in clmns]
+        py_clmns = [PY.get_pinyins(re.sub(r"(/.*|([^()]+?)|\([^()]+?\))", "", str(n)), "_")[0] for n in clmns]
         clmn_tys = []
         for j in range(len(clmns)):
             cln, ty = column_data_type(df[clmns[j]])
             df[clmns[j]] = cln
             if ty == "text":
                 txts.extend([str(c) for c in cln if c])
-        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " "))
-                     for i in range(len(clmns))]
+        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in range(len(clmns))]
 
         eng = lang.lower() == "english"  # is_english(txts)
         for ii, row in df.iterrows():
-            d = {
-                "docnm_kwd": filename,
-                "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-            }
+            d = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
             row_txt = []
             for j in range(len(clmns)):
                 if row[clmns[j]] is None:
                     continue
                 if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
                     continue
                 fld = clmns_map[j][0]
-                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
-                    row[clmns[j]])
+                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(row[clmns[j]])
                 row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
             if not row_txt:
                 continue
             tokenize(d, "; ".join(row_txt), eng)
             res.append(d)
 
-    KnowledgebaseService.update_parser_config(
-        kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
+    KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
     callback(0.35, "")
 
     return res
```
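To see the new message end to end, here is a hypothetical repro through the csv/txt branch of `chunk`, assuming a RAGFlow dev environment; the file name, callback stub, and `kb_id` value are illustrative, and the printed output is what the patch's message format implies rather than verified output.

```python
from rag.app.table import chunk

def progress(prog, msg=""):
    # Minimal stand-in for the progress callback that chunk() invokes.
    print(prog, msg)

# TAB-delimited content whose header row repeats "phone".
data = b"name\tphone\tphone\nAlice\t123\t456\n"
try:
    chunk("contacts.csv", binary=data, callback=progress, kb_id="kb-test")
except ValueError as e:
    print(e)
    # Expected (per the patch):
    # Duplicate column names detected: ['phone']
    # From: ['name' 'phone' 'phone']
```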
