You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

table.py 6.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. import copy
  2. import re
  3. from io import BytesIO
  4. from xpinyin import Pinyin
  5. import numpy as np
  6. import pandas as pd
  7. from openpyxl import load_workbook
  8. from dateutil.parser import parse as datetime_parse
  9. from api.db.services.knowledgebase_service import KnowledgebaseService
  10. from rag.parser import is_english, tokenize
  11. from rag.nlp import huqie, stemmer
  12. class Excel(object):
  13. def __call__(self, fnm, binary=None, callback=None):
  14. if not binary:
  15. wb = load_workbook(fnm)
  16. else:
  17. wb = load_workbook(BytesIO(binary))
  18. total = 0
  19. for sheetname in wb.sheetnames:
  20. total += len(list(wb[sheetname].rows))
  21. res, fails, done = [], [], 0
  22. for sheetname in wb.sheetnames:
  23. ws = wb[sheetname]
  24. rows = list(ws.rows)
  25. headers = [cell.value for cell in rows[0]]
  26. missed = set([i for i, h in enumerate(headers) if h is None])
  27. headers = [
  28. cell.value for i,
  29. cell in enumerate(
  30. rows[0]) if i not in missed]
  31. data = []
  32. for i, r in enumerate(rows[1:]):
  33. row = [
  34. cell.value for ii,
  35. cell in enumerate(r) if ii not in missed]
  36. if len(row) != len(headers):
  37. fails.append(str(i))
  38. continue
  39. data.append(row)
  40. done += 1
  41. if done % 999 == 0:
  42. callback(done * 0.6 / total, ("Extract records: {}".format(len(res)) + (
  43. f"{len(fails)} failure({sheetname}), line: %s..." % (",".join(fails[:3])) if fails else "")))
  44. res.append(pd.DataFrame(np.array(data), columns=headers))
  45. callback(0.6, ("Extract records: {}. ".format(done) + (
  46. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  47. return res
  48. def trans_datatime(s):
  49. try:
  50. return datetime_parse(s.strip()).strftime("%Y-%m-%dT%H:%M:%S")
  51. except Exception as e:
  52. pass
  53. def trans_bool(s):
  54. if re.match(r"(true|yes|是)$", str(s).strip(), flags=re.IGNORECASE):
  55. return ["yes", "是"]
  56. if re.match(r"(false|no|否)$", str(s).strip(), flags=re.IGNORECASE):
  57. return ["no", "否"]
  58. def column_data_type(arr):
  59. uni = len(set([a for a in arr if a is not None]))
  60. counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
  61. trans = {t: f for f, t in
  62. [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
  63. for a in arr:
  64. if a is None:
  65. continue
  66. if re.match(r"[+-]?[0-9]+(\.0+)?$", str(a).replace("%%", "")):
  67. counts["int"] += 1
  68. elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")):
  69. counts["float"] += 1
  70. elif re.match(r"(true|false|yes|no|是|否)$", str(a), flags=re.IGNORECASE):
  71. counts["bool"] += 1
  72. elif trans_datatime(str(a)):
  73. counts["datetime"] += 1
  74. else:
  75. counts["text"] += 1
  76. counts = sorted(counts.items(), key=lambda x: x[1] * -1)
  77. ty = counts[0][0]
  78. for i in range(len(arr)):
  79. if arr[i] is None:
  80. continue
  81. try:
  82. arr[i] = trans[ty](str(arr[i]))
  83. except Exception as e:
  84. arr[i] = None
  85. if ty == "text":
  86. if len(arr) > 128 and uni / len(arr) < 0.1:
  87. ty = "keyword"
  88. return arr, ty
  89. def chunk(filename, binary=None, callback=None, **kwargs):
  90. dfs = []
  91. if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
  92. callback(0.1, "Start to parse.")
  93. excel_parser = Excel()
  94. dfs = excel_parser(filename, binary, callback)
  95. elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
  96. callback(0.1, "Start to parse.")
  97. txt = ""
  98. if binary:
  99. txt = binary.decode("utf-8")
  100. else:
  101. with open(filename, "r") as f:
  102. while True:
  103. l = f.readline()
  104. if not l:
  105. break
  106. txt += l
  107. lines = txt.split("\n")
  108. fails = []
  109. headers = lines[0].split(kwargs.get("delimiter", "\t"))
  110. rows = []
  111. for i, line in enumerate(lines[1:]):
  112. row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
  113. if len(row) != len(headers):
  114. fails.append(str(i))
  115. continue
  116. rows.append(row)
  117. if len(rows) % 999 == 0:
  118. callback(len(rows) * 0.6 / len(lines), ("Extract records: {}".format(len(rows)) + (
  119. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  120. callback(0.6, ("Extract records: {}".format(len(rows)) + (
  121. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  122. dfs = [pd.DataFrame(np.array(rows), columns=headers)]
  123. else:
  124. raise NotImplementedError(
  125. "file type not supported yet(excel, text, csv supported)")
  126. res = []
  127. PY = Pinyin()
  128. fieds_map = {
  129. "text": "_tks",
  130. "int": "_int",
  131. "keyword": "_kwd",
  132. "float": "_flt",
  133. "datetime": "_dt",
  134. "bool": "_kwd"}
  135. for df in dfs:
  136. for n in ["id", "_id", "index", "idx"]:
  137. if n in df.columns:
  138. del df[n]
  139. clmns = df.columns.values
  140. txts = list(copy.deepcopy(clmns))
  141. py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns]
  142. clmn_tys = []
  143. for j in range(len(clmns)):
  144. cln, ty = column_data_type(df[clmns[j]])
  145. clmn_tys.append(ty)
  146. df[clmns[j]] = cln
  147. if ty == "text":
  148. txts.extend([str(c) for c in cln if c])
  149. clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j])
  150. for i in range(len(clmns))]
  151. eng = is_english(txts)
  152. for ii, row in df.iterrows():
  153. d = {}
  154. row_txt = []
  155. for j in range(len(clmns)):
  156. if row[clmns[j]] is None:
  157. continue
  158. fld = clmns_map[j][0]
  159. d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
  160. row[clmns[j]])
  161. row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
  162. if not row_txt:
  163. continue
  164. tokenize(d, "; ".join(row_txt), eng)
  165. res.append(d)
  166. KnowledgebaseService.update_parser_config(
  167. kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
  168. callback(0.6, "")
  169. return res
  170. if __name__ == "__main__":
  171. import sys
  172. def dummy(a, b):
  173. pass
  174. chunk(sys.argv[1], callback=dummy)