Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder mit einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

step_two.py 24KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580
# -*- coding: utf-8 -*-
import copy
import datetime
import re
import signal
import time
import traceback
from contextlib import contextmanager

import demjson
import numpy as np
from xpinyin import Pinyin

from deepdoc.parser.resume.entities import degrees, schools, corporations
from rag.nlp import huqie, surname
  9. class TimeoutException(Exception): pass
  10. @contextmanager
  11. def time_limit(seconds):
  12. def signal_handler(signum, frame):
  13. raise TimeoutException("Timed out!")
  14. signal.signal(signal.SIGALRM, signal_handler)
  15. signal.alarm(seconds)
  16. try:
  17. yield
  18. finally:
  19. signal.alarm(0)
  20. ENV = None
  21. PY = Pinyin()
  22. def rmHtmlTag(line):
  23. return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
  24. def highest_degree(dg):
  25. if not dg: return ""
  26. if type(dg) == type(""): dg = [dg]
  27. m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
  28. return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]
  29. def forEdu(cv):
  30. if not cv.get("education_obj"):
  31. cv["integerity_flt"] *= 0.8
  32. return cv
  33. first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
  34. edu_nst = []
  35. edu_end_dt = ""
  36. cv["school_rank_int"] = 1000000
  37. for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
  38. e = {}
  39. if n.get("end_time"):
  40. if n["end_time"] > edu_end_dt: edu_end_dt = n["end_time"]
  41. try:
  42. dt = n["end_time"]
  43. if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
  44. y, m, d = getYMD(dt)
  45. ed_dt.append(str(y))
  46. e["end_dt_kwd"] = str(y)
  47. except Exception as e:
  48. pass
  49. if n.get("start_time"):
  50. try:
  51. dt = n["start_time"]
  52. if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
  53. y, m, d = getYMD(dt)
  54. st_dt.append(str(y))
  55. e["start_dt_kwd"] = str(y)
  56. except Exception as e:
  57. pass
  58. r = schools.select(n.get("school_name", ""))
  59. if r:
  60. if str(r.get("type", "")) == "1": fea.append("211")
  61. if str(r.get("type", "")) == "2": fea.append("211")
  62. if str(r.get("is_abroad", "")) == "1": fea.append("留学")
  63. if str(r.get("is_double_first", "")) == "1": fea.append("双一流")
  64. if str(r.get("is_985", "")) == "1": fea.append("985")
  65. if str(r.get("is_world_known", "")) == "1": fea.append("海外知名")
  66. if r.get("rank") and cv["school_rank_int"] > r["rank"]: cv["school_rank_int"] = r["rank"]
  67. if n.get("school_name") and isinstance(n["school_name"], str):
  68. sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
  69. e["sch_nm_kwd"] = sch[-1]
  70. fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])
  71. if n.get("discipline_name") and isinstance(n["discipline_name"], str):
  72. maj.append(n["discipline_name"])
  73. e["major_kwd"] = n["discipline_name"]
  74. if not n.get("degree") and "985" in fea and not first_fea: n["degree"] = "1"
  75. if n.get("degree"):
  76. d = degrees.get_name(n["degree"])
  77. if d: e["degree_kwd"] = d
  78. if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)",
  79. n.get(
  80. "school_name",
  81. ""))): d = "专升本"
  82. if d: deg.append(d)
  83. # for first degree
  84. if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
  85. fdeg = [d]
  86. if n.get("school_name"): fsch = [n["school_name"]]
  87. if n.get("discipline_name"): fmaj = [n["discipline_name"]]
  88. first_fea = copy.deepcopy(fea)
  89. edu_nst.append(e)
  90. cv["sch_rank_kwd"] = []
  91. if cv["school_rank_int"] <= 20 \
  92. or ("海外名校" in fea and cv["school_rank_int"] <= 200):
  93. cv["sch_rank_kwd"].append("顶尖学校")
  94. elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
  95. or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \
  96. cv["school_rank_int"] > 200):
  97. cv["sch_rank_kwd"].append("精英学校")
  98. elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
  99. or ("海外名校" in fea and cv["school_rank_int"] > 500):
  100. cv["sch_rank_kwd"].append("优质学校")
  101. else:
  102. cv["sch_rank_kwd"].append("一般学校")
  103. if edu_nst: cv["edu_nst"] = edu_nst
  104. if fea: cv["edu_fea_kwd"] = list(set(fea))
  105. if first_fea: cv["edu_first_fea_kwd"] = list(set(first_fea))
  106. if maj: cv["major_kwd"] = maj
  107. if fsch: cv["first_school_name_kwd"] = fsch
  108. if fdeg: cv["first_degree_kwd"] = fdeg
  109. if fmaj: cv["first_major_kwd"] = fmaj
  110. if st_dt: cv["edu_start_kwd"] = st_dt
  111. if ed_dt: cv["edu_end_kwd"] = ed_dt
  112. if ed_dt: cv["edu_end_int"] = max([int(t) for t in ed_dt])
  113. if deg:
  114. if "本科" in deg and "专科" in deg:
  115. deg.append("专升本")
  116. deg = [d for d in deg if d != '本科']
  117. cv["degree_kwd"] = deg
  118. cv["highest_degree_kwd"] = highest_degree(deg)
  119. if edu_end_dt:
  120. try:
  121. if re.match(r"[0-9]{9,}", edu_end_dt): edu_end_dt = turnTm2Dt(edu_end_dt)
  122. if edu_end_dt.strip("\n") == "至今": edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
  123. y, m, d = getYMD(edu_end_dt)
  124. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  125. except Exception as e:
  126. print("EXCEPTION: ", e, edu_end_dt, cv.get("work_exp_flt"))
  127. if sch:
  128. cv["school_name_kwd"] = sch
  129. if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
  130. or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
  131. or not cv.get("degree_kwd"):
  132. for c in sch:
  133. if schools.is_good(c):
  134. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  135. cv["tag_kwd"].append("好学校")
  136. cv["tag_kwd"].append("好学历")
  137. break
  138. if (len(cv.get("degree_kwd", [])) >= 1 and \
  139. "本科" in cv["degree_kwd"] and \
  140. any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
  141. or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
  142. or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
  143. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  144. if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
  145. if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj))
  146. if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch))
  147. if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
  148. if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj))
  149. return cv
  150. def forProj(cv):
  151. if not cv.get("project_obj"): return cv
  152. pro_nms, desc = [], []
  153. for i, n in enumerate(
  154. sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if type(x) == type({}) else "",
  155. reverse=True)):
  156. if n.get("name"): pro_nms.append(n["name"])
  157. if n.get("describe"): desc.append(str(n["describe"]))
  158. if n.get("responsibilities"): desc.append(str(n["responsibilities"]))
  159. if n.get("achivement"): desc.append(str(n["achivement"]))
  160. if pro_nms:
  161. # cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms))
  162. cv["project_name_tks"] = huqie.qie(pro_nms[0])
  163. if desc:
  164. cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
  165. cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))
  166. return cv
  167. def json_loads(line):
  168. return demjson.decode(re.sub(r": *(True|False)", r": '\1'", line))
  169. def forWork(cv):
  170. if not cv.get("work_obj"):
  171. cv["integerity_flt"] *= 0.7
  172. return cv
  173. flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
  174. "industry_name", "subordinates_count"]
  175. duas = []
  176. scales = []
  177. fea = {c: [] for c in flds}
  178. latest_job_tm = ""
  179. goodcorp = False
  180. goodcorp_ = False
  181. work_st_tm = ""
  182. corp_tags = []
  183. for i, n in enumerate(
  184. sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if type(x) == type({}) else "",
  185. reverse=True)):
  186. if type(n) == type(""):
  187. try:
  188. n = json_loads(n)
  189. except Exception as e:
  190. continue
  191. if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
  192. for c in flds:
  193. if not n.get(c) or str(n[c]) == '0':
  194. fea[c].append("")
  195. continue
  196. if c == "corporation_name":
  197. n[c] = corporations.corpNorm(n[c], False)
  198. if corporations.is_good(n[c]):
  199. if i == 0:
  200. goodcorp = True
  201. else:
  202. goodcorp_ = True
  203. ct = corporations.corp_tag(n[c])
  204. if i == 0:
  205. corp_tags.extend(ct)
  206. elif ct and ct[0] != "软外":
  207. corp_tags.extend([f"{t}(曾)" for t in ct])
  208. fea[c].append(rmHtmlTag(str(n[c]).lower()))
  209. y, m, d = getYMD(n.get("start_time"))
  210. if not y or not m: continue
  211. st = "%s-%02d-%02d" % (y, int(m), int(d))
  212. latest_job_tm = st
  213. y, m, d = getYMD(n.get("end_time"))
  214. if (not y or not m) and i > 0: continue
  215. if not y or not m or int(y) > 2022: y, m, d = getYMD(str(n.get("updated_at", "")))
  216. if not y or not m: continue
  217. ed = "%s-%02d-%02d" % (y, int(m), int(d))
  218. try:
  219. duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
  220. except Exception as e:
  221. print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time"))
  222. if n.get("scale"):
  223. r = re.search(r"^([0-9]+)", str(n["scale"]))
  224. if r: scales.append(int(r.group(1)))
  225. if goodcorp:
  226. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  227. cv["tag_kwd"].append("好公司")
  228. if goodcorp_:
  229. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  230. cv["tag_kwd"].append("好公司(曾)")
  231. if corp_tags:
  232. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  233. cv["tag_kwd"].extend(corp_tags)
  234. cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]
  235. if latest_job_tm: cv["latest_job_dt"] = latest_job_tm
  236. if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
  237. if fea["position_name"]:
  238. cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
  239. cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
  240. cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))
  241. if fea["industry_name"]:
  242. cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
  243. cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
  244. cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))
  245. if fea["corporation_name"]:
  246. cv["corporation_name_kwd"] = fea["corporation_name"][0]
  247. cv["corp_nm_kwd"] = fea["corporation_name"]
  248. cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
  249. cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
  250. cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))
  251. if fea["responsibilities"]:
  252. cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
  253. cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))
  254. if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
  255. re.match(r"[^0-9]+$", str(i))]
  256. if fea["subordinates_count"]: cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
  257. if type(cv.get("corporation_id")) == type(1): cv["corporation_id"] = [str(cv["corporation_id"])]
  258. if not cv.get("corporation_id"): cv["corporation_id"] = []
  259. for i in cv.get("corporation_id", []):
  260. cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)
  261. if work_st_tm:
  262. try:
  263. if re.match(r"[0-9]{9,}", work_st_tm): work_st_tm = turnTm2Dt(work_st_tm)
  264. y, m, d = getYMD(work_st_tm)
  265. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  266. except Exception as e:
  267. print("EXCEPTION: ", e, work_st_tm, cv.get("work_exp_flt"))
  268. cv["job_num_int"] = 0
  269. if duas:
  270. cv["dua_flt"] = np.mean(duas)
  271. cv["cur_dua_int"] = duas[0]
  272. cv["job_num_int"] = len(duas)
  273. if scales: cv["scale_flt"] = np.max(scales)
  274. return cv
  275. def turnTm2Dt(b):
  276. if not b: return
  277. b = str(b).strip()
  278. if re.match(r"[0-9]{10,}", b): b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
  279. return b
  280. def getYMD(b):
  281. y, m, d = "", "", "01"
  282. if not b: return (y, m, d)
  283. b = turnTm2Dt(b)
  284. if re.match(r"[0-9]{4}", b): y = int(b[:4])
  285. r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
  286. if r: m = r.group(1)
  287. r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
  288. if r: d = r.group(1)
  289. if not d or int(d) == 0 or int(d) > 31: d = "1"
  290. if not m or int(m) > 12 or int(m) < 1: m = "1"
  291. return (y, m, d)
  292. def birth(cv):
  293. if not cv.get("birth"):
  294. cv["integerity_flt"] *= 0.9
  295. return cv
  296. y, m, d = getYMD(cv["birth"])
  297. if not m or not y: return cv
  298. b = "%s-%02d-%02d" % (y, int(m), int(d))
  299. cv["birth_dt"] = b
  300. cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))
  301. cv["age_int"] = datetime.datetime.now().year - int(y)
  302. return cv
  303. def parse(cv):
  304. for k in cv.keys():
  305. if cv[k] == '\\N': cv[k] = ''
  306. # cv = cv.asDict()
  307. tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
  308. "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
  309. "position_name", "school_name", "self_remark", "title_name"]
  310. small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
  311. kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
  312. "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
  313. "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
  314. num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
  315. "expect_salary_to", "salary_month"]
  316. is_fld = [
  317. ("is_fertility", "已育", "未育"),
  318. ("is_house", "有房", "没房"),
  319. ("is_management_experience", "有管理经验", "无管理经验"),
  320. ("is_marital", "已婚", "未婚"),
  321. ("is_oversea", "有海外经验", "无海外经验")
  322. ]
  323. rmkeys = []
  324. for k in cv.keys():
  325. if cv[k] is None: rmkeys.append(k)
  326. if (type(cv[k]) == type([]) or type(cv[k]) == type("")) and len(cv[k]) == 0: rmkeys.append(k)
  327. for k in rmkeys: del cv[k]
  328. integerity = 0.
  329. flds_num = 0.
  330. def hasValues(flds):
  331. nonlocal integerity, flds_num
  332. flds_num += len(flds)
  333. for f in flds:
  334. v = str(cv.get(f, ""))
  335. if len(v) > 0 and v != '0' and v != '[]': integerity += 1
  336. hasValues(tks_fld)
  337. hasValues(small_tks_fld)
  338. hasValues(kwd_fld)
  339. hasValues(num_fld)
  340. cv["integerity_flt"] = integerity / flds_num
  341. if cv.get("corporation_type"):
  342. for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
  343. (r"[//.· <\((]+.*", ""),
  344. (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
  345. (r".*(机关|事业).*", "机关"),
  346. (r".*(非盈利|Non-profit).*", "非盈利"),
  347. (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
  348. (r".*国有.*", "国企"),
  349. (r"[ ()\(\)人/·0-9-]+", ""),
  350. (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
  351. cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
  352. if len(cv["corporation_type"]) < 2: del cv["corporation_type"]
  353. if cv.get("political_status"):
  354. for p, r in [
  355. (r".*党员.*", "党员"),
  356. (r".*(无党派|公民).*", "群众"),
  357. (r".*团员.*", "团员")]:
  358. cv["political_status"] = re.sub(p, r, cv["political_status"])
  359. if not re.search(r"[党团群]", cv["political_status"]): del cv["political_status"]
  360. if cv.get("phone"): cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))
  361. keys = list(cv.keys())
  362. for k in keys:
  363. # deal with json objects
  364. if k.find("_obj") > 0:
  365. try:
  366. cv[k] = json_loads(cv[k])
  367. cv[k] = [a for _, a in cv[k].items()]
  368. nms = []
  369. for n in cv[k]:
  370. if type(n) != type({}) or "name" not in n or not n.get("name"): continue
  371. n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
  372. if not n["name"]: continue
  373. nms.append(n["name"])
  374. if nms:
  375. t = k[:-4]
  376. cv[f"{t}_kwd"] = nms
  377. cv[f"{t}_tks"] = huqie.qie(" ".join(nms))
  378. except Exception as e:
  379. print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
  380. cv[k] = []
  381. # tokenize fields
  382. if k in tks_fld:
  383. cv[f"{k}_tks"] = huqie.qie(cv[k])
  384. if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"])
  385. # keyword fields
  386. if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
  387. for n in re.split(r"[\t,,;;. ]",
  388. re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
  389. ) if n]
  390. if k in num_fld and cv.get(k): cv[f"{k}_int"] = cv[k]
  391. cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
  392. # for name field
  393. if cv.get("name"):
  394. nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
  395. nm = re.sub(r"[ \t ]+", " ", nm)
  396. if re.match(r"[a-zA-Z ]+$", nm):
  397. if len(nm.split(" ")) > 1:
  398. cv["name"] = nm
  399. else:
  400. nm = ""
  401. elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
  402. nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
  403. else:
  404. nm = ""
  405. cv["name"] = nm.strip()
  406. name = cv["name"]
  407. # name pingyin and its prefix
  408. cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
  409. cv["name_py_pref0_tks"] = ""
  410. cv["name_py_pref_tks"] = ""
  411. for py in PY.get_pinyins(nm[:20], ''):
  412. for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
  413. for py in PY.get_pinyins(nm[:20], ' '):
  414. py = py.split(" ")
  415. for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])
  416. cv["name_kwd"] = name
  417. cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
  418. cv["name_tks"] = (
  419. huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
  420. ) if name else ""
  421. else:
  422. cv["integerity_flt"] /= 2.
  423. if cv.get("phone"):
  424. r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
  425. if not r:
  426. cv["phone"] = ""
  427. else:
  428. cv["phone"] = r.group(1)
  429. # deal with date fields
  430. if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
  431. cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
  432. else:
  433. y, m, d = getYMD(str(cv.get("updated_at", "")))
  434. if not y: y = "2012"
  435. if not m: m = "01"
  436. if not d: d = "01"
  437. cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
  438. # long text tokenize
  439. if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"]))
  440. # for yes or no field
  441. fea = []
  442. for f, y, n in is_fld:
  443. if f not in cv: continue
  444. if cv[f] == '是': fea.append(y)
  445. if cv[f] == '否': fea.append(n)
  446. if fea: cv["tag_kwd"] = fea
  447. cv = forEdu(cv)
  448. cv = forProj(cv)
  449. cv = forWork(cv)
  450. cv = birth(cv)
  451. cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
  452. for i in range(len(cv["corp_proj_sch_deg_kwd"])):
  453. for j in cv.get("sch_rank_kwd", []): cv["corp_proj_sch_deg_kwd"][i] += "+" + j
  454. for i in range(len(cv["corp_proj_sch_deg_kwd"])):
  455. if cv.get("highest_degree_kwd"): cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]
  456. try:
  457. if not cv.get("work_exp_flt") and cv.get("work_start_time"):
  458. if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
  459. cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
  460. cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
  461. elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
  462. y, m, d = getYMD(str(cv["work_start_time"]))
  463. cv["work_start_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
  464. cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
  465. except Exception as e:
  466. print("【EXCEPTION】", e, "==>", cv.get("work_start_time"))
  467. if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
  468. keys = list(cv.keys())
  469. for k in keys:
  470. if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k): del cv[k]
  471. for k in cv.keys():
  472. if not re.search("_(kwd|id)$", k) or type(cv[k]) != type([]): continue
  473. cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
  474. keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
  475. for k in keys:
  476. if cv[k] <= 0: del cv[k]
  477. cv["tob_resume_id"] = str(cv["tob_resume_id"])
  478. cv["id"] = cv["tob_resume_id"]
  479. print("CCCCCCCCCCCCCCC")
  480. return dealWithInt64(cv)
  481. def dealWithInt64(d):
  482. if isinstance(d, dict):
  483. for n, v in d.items():
  484. d[n] = dealWithInt64(v)
  485. if isinstance(d, list):
  486. d = [dealWithInt64(t) for t in d]
  487. if isinstance(d, np.integer): d = int(d)
  488. return d