You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

step_two.py 24KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597
  1. # Licensed under the Apache License, Version 2.0 (the "License");
  2. # you may not use this file except in compliance with the License.
  3. # You may obtain a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. import re
  14. import copy
  15. import time
  16. import datetime
  17. import demjson3
  18. import traceback
  19. import signal
  20. import numpy as np
  21. from deepdoc.parser.resume.entities import degrees, schools, corporations
  22. from rag.nlp import rag_tokenizer, surname
  23. from xpinyin import Pinyin
  24. from contextlib import contextmanager
  25. from api.utils.log_utils import logger
  26. class TimeoutException(Exception): pass
  27. @contextmanager
  28. def time_limit(seconds):
  29. def signal_handler(signum, frame):
  30. raise TimeoutException("Timed out!")
  31. signal.signal(signal.SIGALRM, signal_handler)
  32. signal.alarm(seconds)
  33. try:
  34. yield
  35. finally:
  36. signal.alarm(0)
# Module-level placeholder; it is not referenced by the code visible in this module.
ENV = None
# Shared xpinyin converter; parse() uses it to build the pinyin name fields.
PY = Pinyin()
  39. def rmHtmlTag(line):
  40. return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
  41. def highest_degree(dg):
  42. if not dg: return ""
  43. if type(dg) == type(""): dg = [dg]
  44. m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
  45. return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]
  46. def forEdu(cv):
  47. if not cv.get("education_obj"):
  48. cv["integerity_flt"] *= 0.8
  49. return cv
  50. first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
  51. edu_nst = []
  52. edu_end_dt = ""
  53. cv["school_rank_int"] = 1000000
  54. for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
  55. e = {}
  56. if n.get("end_time"):
  57. if n["end_time"] > edu_end_dt: edu_end_dt = n["end_time"]
  58. try:
  59. dt = n["end_time"]
  60. if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
  61. y, m, d = getYMD(dt)
  62. ed_dt.append(str(y))
  63. e["end_dt_kwd"] = str(y)
  64. except Exception as e:
  65. pass
  66. if n.get("start_time"):
  67. try:
  68. dt = n["start_time"]
  69. if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
  70. y, m, d = getYMD(dt)
  71. st_dt.append(str(y))
  72. e["start_dt_kwd"] = str(y)
  73. except Exception:
  74. pass
  75. r = schools.select(n.get("school_name", ""))
  76. if r:
  77. if str(r.get("type", "")) == "1": fea.append("211")
  78. if str(r.get("type", "")) == "2": fea.append("211")
  79. if str(r.get("is_abroad", "")) == "1": fea.append("留学")
  80. if str(r.get("is_double_first", "")) == "1": fea.append("双一流")
  81. if str(r.get("is_985", "")) == "1": fea.append("985")
  82. if str(r.get("is_world_known", "")) == "1": fea.append("海外知名")
  83. if r.get("rank") and cv["school_rank_int"] > r["rank"]: cv["school_rank_int"] = r["rank"]
  84. if n.get("school_name") and isinstance(n["school_name"], str):
  85. sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
  86. e["sch_nm_kwd"] = sch[-1]
  87. fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
  88. if n.get("discipline_name") and isinstance(n["discipline_name"], str):
  89. maj.append(n["discipline_name"])
  90. e["major_kwd"] = n["discipline_name"]
  91. if not n.get("degree") and "985" in fea and not first_fea: n["degree"] = "1"
  92. if n.get("degree"):
  93. d = degrees.get_name(n["degree"])
  94. if d: e["degree_kwd"] = d
  95. if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)",
  96. n.get(
  97. "school_name",
  98. ""))): d = "专升本"
  99. if d: deg.append(d)
  100. # for first degree
  101. if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
  102. fdeg = [d]
  103. if n.get("school_name"): fsch = [n["school_name"]]
  104. if n.get("discipline_name"): fmaj = [n["discipline_name"]]
  105. first_fea = copy.deepcopy(fea)
  106. edu_nst.append(e)
  107. cv["sch_rank_kwd"] = []
  108. if cv["school_rank_int"] <= 20 \
  109. or ("海外名校" in fea and cv["school_rank_int"] <= 200):
  110. cv["sch_rank_kwd"].append("顶尖学校")
  111. elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
  112. or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \
  113. cv["school_rank_int"] > 200):
  114. cv["sch_rank_kwd"].append("精英学校")
  115. elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
  116. or ("海外名校" in fea and cv["school_rank_int"] > 500):
  117. cv["sch_rank_kwd"].append("优质学校")
  118. else:
  119. cv["sch_rank_kwd"].append("一般学校")
  120. if edu_nst: cv["edu_nst"] = edu_nst
  121. if fea: cv["edu_fea_kwd"] = list(set(fea))
  122. if first_fea: cv["edu_first_fea_kwd"] = list(set(first_fea))
  123. if maj: cv["major_kwd"] = maj
  124. if fsch: cv["first_school_name_kwd"] = fsch
  125. if fdeg: cv["first_degree_kwd"] = fdeg
  126. if fmaj: cv["first_major_kwd"] = fmaj
  127. if st_dt: cv["edu_start_kwd"] = st_dt
  128. if ed_dt: cv["edu_end_kwd"] = ed_dt
  129. if ed_dt: cv["edu_end_int"] = max([int(t) for t in ed_dt])
  130. if deg:
  131. if "本科" in deg and "专科" in deg:
  132. deg.append("专升本")
  133. deg = [d for d in deg if d != '本科']
  134. cv["degree_kwd"] = deg
  135. cv["highest_degree_kwd"] = highest_degree(deg)
  136. if edu_end_dt:
  137. try:
  138. if re.match(r"[0-9]{9,}", edu_end_dt): edu_end_dt = turnTm2Dt(edu_end_dt)
  139. if edu_end_dt.strip("\n") == "至今": edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
  140. y, m, d = getYMD(edu_end_dt)
  141. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  142. except Exception as e:
  143. logger.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
  144. if sch:
  145. cv["school_name_kwd"] = sch
  146. if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
  147. or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
  148. or not cv.get("degree_kwd"):
  149. for c in sch:
  150. if schools.is_good(c):
  151. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  152. cv["tag_kwd"].append("好学校")
  153. cv["tag_kwd"].append("好学历")
  154. break
  155. if (len(cv.get("degree_kwd", [])) >= 1 and \
  156. "本科" in cv["degree_kwd"] and \
  157. any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
  158. or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
  159. or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
  160. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  161. if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
  162. if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
  163. if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
  164. if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
  165. if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
  166. return cv
  167. def forProj(cv):
  168. if not cv.get("project_obj"): return cv
  169. pro_nms, desc = [], []
  170. for i, n in enumerate(
  171. sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if type(x) == type({}) else "",
  172. reverse=True)):
  173. if n.get("name"): pro_nms.append(n["name"])
  174. if n.get("describe"): desc.append(str(n["describe"]))
  175. if n.get("responsibilities"): desc.append(str(n["responsibilities"]))
  176. if n.get("achivement"): desc.append(str(n["achivement"]))
  177. if pro_nms:
  178. # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
  179. cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
  180. if desc:
  181. cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
  182. cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
  183. return cv
  184. def json_loads(line):
  185. return demjson3.decode(re.sub(r": *(True|False)", r": '\1'", line))
  186. def forWork(cv):
  187. if not cv.get("work_obj"):
  188. cv["integerity_flt"] *= 0.7
  189. return cv
  190. flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
  191. "industry_name", "subordinates_count"]
  192. duas = []
  193. scales = []
  194. fea = {c: [] for c in flds}
  195. latest_job_tm = ""
  196. goodcorp = False
  197. goodcorp_ = False
  198. work_st_tm = ""
  199. corp_tags = []
  200. for i, n in enumerate(
  201. sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if type(x) == type({}) else "",
  202. reverse=True)):
  203. if type(n) == type(""):
  204. try:
  205. n = json_loads(n)
  206. except Exception:
  207. continue
  208. if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
  209. for c in flds:
  210. if not n.get(c) or str(n[c]) == '0':
  211. fea[c].append("")
  212. continue
  213. if c == "corporation_name":
  214. n[c] = corporations.corpNorm(n[c], False)
  215. if corporations.is_good(n[c]):
  216. if i == 0:
  217. goodcorp = True
  218. else:
  219. goodcorp_ = True
  220. ct = corporations.corp_tag(n[c])
  221. if i == 0:
  222. corp_tags.extend(ct)
  223. elif ct and ct[0] != "软外":
  224. corp_tags.extend([f"{t}(曾)" for t in ct])
  225. fea[c].append(rmHtmlTag(str(n[c]).lower()))
  226. y, m, d = getYMD(n.get("start_time"))
  227. if not y or not m: continue
  228. st = "%s-%02d-%02d" % (y, int(m), int(d))
  229. latest_job_tm = st
  230. y, m, d = getYMD(n.get("end_time"))
  231. if (not y or not m) and i > 0: continue
  232. if not y or not m or int(y) > 2022: y, m, d = getYMD(str(n.get("updated_at", "")))
  233. if not y or not m: continue
  234. ed = "%s-%02d-%02d" % (y, int(m), int(d))
  235. try:
  236. duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
  237. except Exception:
  238. logger.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
  239. if n.get("scale"):
  240. r = re.search(r"^([0-9]+)", str(n["scale"]))
  241. if r: scales.append(int(r.group(1)))
  242. if goodcorp:
  243. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  244. cv["tag_kwd"].append("好公司")
  245. if goodcorp_:
  246. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  247. cv["tag_kwd"].append("好公司(曾)")
  248. if corp_tags:
  249. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  250. cv["tag_kwd"].extend(corp_tags)
  251. cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]
  252. if latest_job_tm: cv["latest_job_dt"] = latest_job_tm
  253. if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
  254. if fea["position_name"]:
  255. cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
  256. cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
  257. cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
  258. if fea["industry_name"]:
  259. cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
  260. cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
  261. cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
  262. if fea["corporation_name"]:
  263. cv["corporation_name_kwd"] = fea["corporation_name"][0]
  264. cv["corp_nm_kwd"] = fea["corporation_name"]
  265. cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
  266. cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
  267. cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
  268. if fea["responsibilities"]:
  269. cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
  270. cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
  271. if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
  272. re.match(r"[^0-9]+$", str(i))]
  273. if fea["subordinates_count"]: cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
  274. if type(cv.get("corporation_id")) == type(1): cv["corporation_id"] = [str(cv["corporation_id"])]
  275. if not cv.get("corporation_id"): cv["corporation_id"] = []
  276. for i in cv.get("corporation_id", []):
  277. cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)
  278. if work_st_tm:
  279. try:
  280. if re.match(r"[0-9]{9,}", work_st_tm): work_st_tm = turnTm2Dt(work_st_tm)
  281. y, m, d = getYMD(work_st_tm)
  282. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  283. except Exception as e:
  284. logger.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
  285. cv["job_num_int"] = 0
  286. if duas:
  287. cv["dua_flt"] = np.mean(duas)
  288. cv["cur_dua_int"] = duas[0]
  289. cv["job_num_int"] = len(duas)
  290. if scales: cv["scale_flt"] = np.max(scales)
  291. return cv
  292. def turnTm2Dt(b):
  293. if not b: return
  294. b = str(b).strip()
  295. if re.match(r"[0-9]{10,}", b): b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
  296. return b
  297. def getYMD(b):
  298. y, m, d = "", "", "01"
  299. if not b: return (y, m, d)
  300. b = turnTm2Dt(b)
  301. if re.match(r"[0-9]{4}", b): y = int(b[:4])
  302. r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
  303. if r: m = r.group(1)
  304. r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
  305. if r: d = r.group(1)
  306. if not d or int(d) == 0 or int(d) > 31: d = "1"
  307. if not m or int(m) > 12 or int(m) < 1: m = "1"
  308. return (y, m, d)
  309. def birth(cv):
  310. if not cv.get("birth"):
  311. cv["integerity_flt"] *= 0.9
  312. return cv
  313. y, m, d = getYMD(cv["birth"])
  314. if not m or not y: return cv
  315. b = "%s-%02d-%02d" % (y, int(m), int(d))
  316. cv["birth_dt"] = b
  317. cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))
  318. cv["age_int"] = datetime.datetime.now().year - int(y)
  319. return cv
  320. def parse(cv):
  321. for k in cv.keys():
  322. if cv[k] == '\\N': cv[k] = ''
  323. # cv = cv.asDict()
  324. tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
  325. "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
  326. "position_name", "school_name", "self_remark", "title_name"]
  327. small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
  328. kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
  329. "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
  330. "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
  331. num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
  332. "expect_salary_to", "salary_month"]
  333. is_fld = [
  334. ("is_fertility", "已育", "未育"),
  335. ("is_house", "有房", "没房"),
  336. ("is_management_experience", "有管理经验", "无管理经验"),
  337. ("is_marital", "已婚", "未婚"),
  338. ("is_oversea", "有海外经验", "无海外经验")
  339. ]
  340. rmkeys = []
  341. for k in cv.keys():
  342. if cv[k] is None: rmkeys.append(k)
  343. if (type(cv[k]) == type([]) or type(cv[k]) == type("")) and len(cv[k]) == 0: rmkeys.append(k)
  344. for k in rmkeys: del cv[k]
  345. integerity = 0.
  346. flds_num = 0.
  347. def hasValues(flds):
  348. nonlocal integerity, flds_num
  349. flds_num += len(flds)
  350. for f in flds:
  351. v = str(cv.get(f, ""))
  352. if len(v) > 0 and v != '0' and v != '[]': integerity += 1
  353. hasValues(tks_fld)
  354. hasValues(small_tks_fld)
  355. hasValues(kwd_fld)
  356. hasValues(num_fld)
  357. cv["integerity_flt"] = integerity / flds_num
  358. if cv.get("corporation_type"):
  359. for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
  360. (r"[//.· <\((]+.*", ""),
  361. (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
  362. (r".*(机关|事业).*", "机关"),
  363. (r".*(非盈利|Non-profit).*", "非盈利"),
  364. (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
  365. (r".*国有.*", "国企"),
  366. (r"[ ()\(\)人/·0-9-]+", ""),
  367. (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
  368. cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
  369. if len(cv["corporation_type"]) < 2: del cv["corporation_type"]
  370. if cv.get("political_status"):
  371. for p, r in [
  372. (r".*党员.*", "党员"),
  373. (r".*(无党派|公民).*", "群众"),
  374. (r".*团员.*", "团员")]:
  375. cv["political_status"] = re.sub(p, r, cv["political_status"])
  376. if not re.search(r"[党团群]", cv["political_status"]): del cv["political_status"]
  377. if cv.get("phone"): cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))
  378. keys = list(cv.keys())
  379. for k in keys:
  380. # deal with json objects
  381. if k.find("_obj") > 0:
  382. try:
  383. cv[k] = json_loads(cv[k])
  384. cv[k] = [a for _, a in cv[k].items()]
  385. nms = []
  386. for n in cv[k]:
  387. if type(n) != type({}) or "name" not in n or not n.get("name"): continue
  388. n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
  389. if not n["name"]: continue
  390. nms.append(n["name"])
  391. if nms:
  392. t = k[:-4]
  393. cv[f"{t}_kwd"] = nms
  394. cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
  395. except Exception:
  396. logger.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
  397. cv[k] = []
  398. # tokenize fields
  399. if k in tks_fld:
  400. cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
  401. if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
  402. # keyword fields
  403. if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
  404. for n in re.split(r"[\t,,;;. ]",
  405. re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
  406. ) if n]
  407. if k in num_fld and cv.get(k): cv[f"{k}_int"] = cv[k]
  408. cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
  409. # for name field
  410. if cv.get("name"):
  411. nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
  412. nm = re.sub(r"[ \t ]+", " ", nm)
  413. if re.match(r"[a-zA-Z ]+$", nm):
  414. if len(nm.split(" ")) > 1:
  415. cv["name"] = nm
  416. else:
  417. nm = ""
  418. elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
  419. nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
  420. else:
  421. nm = ""
  422. cv["name"] = nm.strip()
  423. name = cv["name"]
  424. # name pingyin and its prefix
  425. cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
  426. cv["name_py_pref0_tks"] = ""
  427. cv["name_py_pref_tks"] = ""
  428. for py in PY.get_pinyins(nm[:20], ''):
  429. for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
  430. for py in PY.get_pinyins(nm[:20], ' '):
  431. py = py.split(" ")
  432. for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])
  433. cv["name_kwd"] = name
  434. cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
  435. cv["name_tks"] = (
  436. rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
  437. ) if name else ""
  438. else:
  439. cv["integerity_flt"] /= 2.
  440. if cv.get("phone"):
  441. r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
  442. if not r:
  443. cv["phone"] = ""
  444. else:
  445. cv["phone"] = r.group(1)
  446. # deal with date fields
  447. if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
  448. cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
  449. else:
  450. y, m, d = getYMD(str(cv.get("updated_at", "")))
  451. if not y: y = "2012"
  452. if not m: m = "01"
  453. if not d: d = "01"
  454. cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
  455. # long text tokenize
  456. if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
  457. # for yes or no field
  458. fea = []
  459. for f, y, n in is_fld:
  460. if f not in cv: continue
  461. if cv[f] == '是': fea.append(y)
  462. if cv[f] == '否': fea.append(n)
  463. if fea: cv["tag_kwd"] = fea
  464. cv = forEdu(cv)
  465. cv = forProj(cv)
  466. cv = forWork(cv)
  467. cv = birth(cv)
  468. cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
  469. for i in range(len(cv["corp_proj_sch_deg_kwd"])):
  470. for j in cv.get("sch_rank_kwd", []): cv["corp_proj_sch_deg_kwd"][i] += "+" + j
  471. for i in range(len(cv["corp_proj_sch_deg_kwd"])):
  472. if cv.get("highest_degree_kwd"): cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]
  473. try:
  474. if not cv.get("work_exp_flt") and cv.get("work_start_time"):
  475. if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
  476. cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
  477. cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
  478. elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
  479. y, m, d = getYMD(str(cv["work_start_time"]))
  480. cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
  481. cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
  482. except Exception as e:
  483. logger.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
  484. if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
  485. keys = list(cv.keys())
  486. for k in keys:
  487. if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k): del cv[k]
  488. for k in cv.keys():
  489. if not re.search("_(kwd|id)$", k) or type(cv[k]) != type([]): continue
  490. cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
  491. keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
  492. for k in keys:
  493. if cv[k] <= 0: del cv[k]
  494. cv["tob_resume_id"] = str(cv["tob_resume_id"])
  495. cv["id"] = cv["tob_resume_id"]
  496. logger.info("CCCCCCCCCCCCCCC")
  497. return dealWithInt64(cv)
  498. def dealWithInt64(d):
  499. if isinstance(d, dict):
  500. for n, v in d.items():
  501. d[n] = dealWithInt64(v)
  502. if isinstance(d, list):
  503. d = [dealWithInt64(t) for t in d]
  504. if isinstance(d, np.integer): d = int(d)
  505. return d