Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

table.py 6.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. import copy
  2. import random
  3. import re
  4. from io import BytesIO
  5. from xpinyin import Pinyin
  6. import numpy as np
  7. import pandas as pd
  8. from nltk import word_tokenize
  9. from openpyxl import load_workbook
  10. from dateutil.parser import parse as datetime_parse
  11. from rag.parser import is_english, tokenize
  12. from rag.nlp import huqie, stemmer
  13. class Excel(object):
  14. def __call__(self, fnm, binary=None, callback=None):
  15. if not binary:
  16. wb = load_workbook(fnm)
  17. else:
  18. wb = load_workbook(BytesIO(binary))
  19. total = 0
  20. for sheetname in wb.sheetnames:
  21. total += len(list(wb[sheetname].rows))
  22. res, fails, done = [], [], 0
  23. for sheetname in wb.sheetnames:
  24. ws = wb[sheetname]
  25. rows = list(ws.rows)
  26. headers = [cell.value for cell in rows[0]]
  27. missed = set([i for i,h in enumerate(headers) if h is None])
  28. headers = [cell.value for i,cell in enumerate(rows[0]) if i not in missed]
  29. data = []
  30. for i, r in enumerate(rows[1:]):
  31. row = [cell.value for ii,cell in enumerate(r) if ii not in missed]
  32. if len(row) != len(headers):
  33. fails.append(str(i))
  34. continue
  35. data.append(row)
  36. done += 1
  37. if done % 999 == 0:
  38. callback(done * 0.6/total, ("Extract records: {}".format(len(res)) + (f"{len(fails)} failure({sheetname}), line: %s..."%(",".join(fails[:3])) if fails else "")))
  39. res.append(pd.DataFrame(np.array(data), columns=headers))
  40. callback(0.6, ("Extract records: {}. ".format(done) + (
  41. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  42. return res
  43. def trans_datatime(s):
  44. try:
  45. return datetime_parse(s.strip()).strftime("%Y-%m-%dT%H:%M:%S")
  46. except Exception as e:
  47. pass
  48. def trans_bool(s):
  49. if re.match(r"(true|yes|是)$", str(s).strip(), flags=re.IGNORECASE): return ["yes", "是"]
  50. if re.match(r"(false|no|否)$", str(s).strip(), flags=re.IGNORECASE): return ["no", "否"]
  51. def column_data_type(arr):
  52. uni = len(set([a for a in arr if a is not None]))
  53. counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
  54. trans = {t:f for f,t in [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
  55. for a in arr:
  56. if a is None:continue
  57. if re.match(r"[+-]?[0-9]+(\.0+)?$", str(a).replace("%%", "")):
  58. counts["int"] += 1
  59. elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")):
  60. counts["float"] += 1
  61. elif re.match(r"(true|false|yes|no|是|否)$", str(a), flags=re.IGNORECASE):
  62. counts["bool"] += 1
  63. elif trans_datatime(str(a)):
  64. counts["datetime"] += 1
  65. else: counts["text"] += 1
  66. counts = sorted(counts.items(), key=lambda x: x[1]*-1)
  67. ty = counts[0][0]
  68. for i in range(len(arr)):
  69. if arr[i] is None:continue
  70. try:
  71. arr[i] = trans[ty](str(arr[i]))
  72. except Exception as e:
  73. arr[i] = None
  74. if ty == "text":
  75. if len(arr) > 128 and uni/len(arr) < 0.1:
  76. ty = "keyword"
  77. return arr, ty
  78. def chunk(filename, binary=None, callback=None, **kwargs):
  79. dfs = []
  80. if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
  81. callback(0.1, "Start to parse.")
  82. excel_parser = Excel()
  83. dfs = excel_parser(filename, binary, callback)
  84. elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
  85. callback(0.1, "Start to parse.")
  86. txt = ""
  87. if binary:
  88. txt = binary.decode("utf-8")
  89. else:
  90. with open(filename, "r") as f:
  91. while True:
  92. l = f.readline()
  93. if not l: break
  94. txt += l
  95. lines = txt.split("\n")
  96. fails = []
  97. headers = lines[0].split(kwargs.get("delimiter", "\t"))
  98. rows = []
  99. for i, line in enumerate(lines[1:]):
  100. row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
  101. if len(row) != len(headers):
  102. fails.append(str(i))
  103. continue
  104. rows.append(row)
  105. if len(rows) % 999 == 0:
  106. callback(len(rows) * 0.6 / len(lines), ("Extract records: {}".format(len(rows)) + (
  107. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  108. callback(0.6, ("Extract records: {}".format(len(rows)) + (
  109. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  110. dfs = [pd.DataFrame(np.array(rows), columns=headers)]
  111. else: raise NotImplementedError("file type not supported yet(excel, text, csv supported)")
  112. res = []
  113. PY = Pinyin()
  114. fieds_map = {"text": "_tks", "int": "_int", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
  115. for df in dfs:
  116. for n in ["id", "_id", "index", "idx"]:
  117. if n in df.columns:del df[n]
  118. clmns = df.columns.values
  119. txts = list(copy.deepcopy(clmns))
  120. py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns]
  121. clmn_tys = []
  122. for j in range(len(clmns)):
  123. cln,ty = column_data_type(df[clmns[j]])
  124. clmn_tys.append(ty)
  125. df[clmns[j]] = cln
  126. if ty == "text": txts.extend([str(c) for c in cln if c])
  127. clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j]) for i in range(len(clmns))]
  128. # TODO: set this column map to KB parser configuration
  129. eng = is_english(txts)
  130. for ii,row in df.iterrows():
  131. d = {}
  132. row_txt = []
  133. for j in range(len(clmns)):
  134. if row[clmns[j]] is None:continue
  135. fld = clmns_map[j][0]
  136. d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(row[clmns[j]])
  137. row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
  138. if not row_txt:continue
  139. tokenize(d, "; ".join(row_txt), eng)
  140. print(d)
  141. res.append(d)
  142. callback(0.6, "")
  143. return res
  144. if __name__== "__main__":
  145. import sys
  146. def dummy(a, b):
  147. pass
  148. chunk(sys.argv[1], callback=dummy)