You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. import copy
  2. import random
  3. import re
  4. from io import BytesIO
  5. from xpinyin import Pinyin
  6. import numpy as np
  7. import pandas as pd
  8. from nltk import word_tokenize
  9. from openpyxl import load_workbook
  10. from dateutil.parser import parse as datetime_parse
  11. from rag.parser import is_english, tokenize
  12. from rag.nlp import huqie, stemmer
  13. class Excel(object):
  14. def __call__(self, fnm, binary=None, callback=None):
  15. if not binary:
  16. wb = load_workbook(fnm)
  17. else:
  18. wb = load_workbook(BytesIO(binary))
  19. total = 0
  20. for sheetname in wb.sheetnames:
  21. total += len(list(wb[sheetname].rows))
  22. res, fails, done = [], [], 0
  23. for sheetname in wb.sheetnames:
  24. ws = wb[sheetname]
  25. rows = list(ws.rows)
  26. headers = [cell.value for cell in rows[0]]
  27. missed = set([i for i,h in enumerate(headers) if h is None])
  28. headers = [cell.value for i,cell in enumerate(rows[0]) if i not in missed]
  29. data = []
  30. for i, r in enumerate(rows[1:]):
  31. row = [cell.value for ii,cell in enumerate(r) if ii not in missed]
  32. if len(row) != len(headers):
  33. fails.append(str(i))
  34. continue
  35. data.append(row)
  36. done += 1
  37. if done % 999 == 0:
  38. callback(done * 0.6/total, ("Extract records: {}".format(len(res)) + (f"{len(fails)} failure({sheetname}), line: %s..."%(",".join(fails[:3])) if fails else "")))
  39. res.append(pd.DataFrame(np.array(data), columns=headers))
  40. callback(0.6, ("Extract records: {}. ".format(done) + (
  41. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  42. return res
  43. def trans_datatime(s):
  44. try:
  45. return datetime_parse(s.strip()).strftime("%Y-%m-%dT%H:%M:%S")
  46. except Exception as e:
  47. pass
  48. def trans_bool(s):
  49. if re.match(r"(true|yes|是)$", str(s).strip(), flags=re.IGNORECASE): return ["yes", "是"]
  50. if re.match(r"(false|no|否)$", str(s).strip(), flags=re.IGNORECASE): return ["no", "否"]
  51. def column_data_type(arr):
  52. uni = len(set([a for a in arr if a is not None]))
  53. counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
  54. trans = {t:f for f,t in [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
  55. for a in arr:
  56. if a is None:continue
  57. if re.match(r"[+-]?[0-9]+(\.0+)?$", str(a).replace("%%", "")):
  58. counts["int"] += 1
  59. elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")):
  60. counts["float"] += 1
  61. elif re.match(r"(true|false|yes|no|是|否)$", str(a), flags=re.IGNORECASE):
  62. counts["bool"] += 1
  63. elif trans_datatime(str(a)):
  64. counts["datetime"] += 1
  65. else: counts["text"] += 1
  66. counts = sorted(counts.items(), key=lambda x: x[1]*-1)
  67. ty = counts[0][0]
  68. for i in range(len(arr)):
  69. if arr[i] is None:continue
  70. try:
  71. arr[i] = trans[ty](str(arr[i]))
  72. except Exception as e:
  73. arr[i] = None
  74. if ty == "text":
  75. if len(arr) > 128 and uni/len(arr) < 0.1:
  76. ty = "keyword"
  77. return arr, ty
  78. def chunk(filename, binary=None, callback=None, **kwargs):
  79. dfs = []
  80. if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
  81. callback(0.1, "Start to parse.")
  82. excel_parser = Excel()
  83. dfs = excel_parser(filename, binary, callback)
  84. elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
  85. callback(0.1, "Start to parse.")
  86. txt = ""
  87. if binary:
  88. txt = binary.decode("utf-8")
  89. else:
  90. with open(filename, "r") as f:
  91. while True:
  92. l = f.readline()
  93. if not l: break
  94. txt += l
  95. lines = txt.split("\n")
  96. fails = []
  97. headers = lines[0].split(kwargs.get("delimiter", "\t"))
  98. rows = []
  99. for i, line in enumerate(lines[1:]):
  100. row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
  101. if len(row) != len(headers):
  102. fails.append(str(i))
  103. continue
  104. rows.append(row)
  105. if len(rows) % 999 == 0:
  106. callback(len(rows) * 0.6 / len(lines), ("Extract records: {}".format(len(rows)) + (
  107. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  108. callback(0.6, ("Extract records: {}".format(len(rows)) + (
  109. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  110. dfs = [pd.DataFrame(np.array(rows), columns=headers)]
  111. else: raise NotImplementedError("file type not supported yet(excel, text, csv supported)")
  112. res = []
  113. PY = Pinyin()
  114. fieds_map = {"text": "_tks", "int": "_int", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
  115. for df in dfs:
  116. for n in ["id", "_id", "index", "idx"]:
  117. if n in df.columns:del df[n]
  118. clmns = df.columns.values
  119. txts = list(copy.deepcopy(clmns))
  120. py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns]
  121. clmn_tys = []
  122. for j in range(len(clmns)):
  123. cln,ty = column_data_type(df[clmns[j]])
  124. clmn_tys.append(ty)
  125. df[clmns[j]] = cln
  126. if ty == "text": txts.extend([str(c) for c in cln if c])
  127. clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j]) for i in range(len(clmns))]
  128. # TODO: set this column map to KB parser configuration
  129. eng = is_english(txts)
  130. for ii,row in df.iterrows():
  131. d = {}
  132. row_txt = []
  133. for j in range(len(clmns)):
  134. if row[clmns[j]] is None:continue
  135. fld = clmns_map[j][0]
  136. d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(row[clmns[j]])
  137. row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
  138. if not row_txt:continue
  139. tokenize(d, "; ".join(row_txt), eng)
  140. print(d)
  141. res.append(d)
  142. callback(0.6, "")
  143. return res
  144. if __name__== "__main__":
  145. import sys
  146. def dummy(a, b):
  147. pass
  148. chunk(sys.argv[1], callback=dummy)