您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592
  1. # Licensed under the Apache License, Version 2.0 (the "License");
  2. # you may not use this file except in compliance with the License.
  3. # You may obtain a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. import re, copy, time, datetime, demjson3, \
  14. traceback, signal
  15. import numpy as np
  16. from deepdoc.parser.resume.entities import degrees, schools, corporations
  17. from rag.nlp import rag_tokenizer, surname
  18. from xpinyin import Pinyin
  19. from contextlib import contextmanager
  20. class TimeoutException(Exception): pass
  21. @contextmanager
  22. def time_limit(seconds):
  23. def signal_handler(signum, frame):
  24. raise TimeoutException("Timed out!")
  25. signal.signal(signal.SIGALRM, signal_handler)
  26. signal.alarm(seconds)
  27. try:
  28. yield
  29. finally:
  30. signal.alarm(0)
# Initialised to None; not referenced by any function visible in this file.
ENV = None
# Shared xpinyin converter; parse() calls PY.get_pinyins() to build the
# pinyin token fields for the candidate name.
PY = Pinyin()
  33. def rmHtmlTag(line):
  34. return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
  35. def highest_degree(dg):
  36. if not dg: return ""
  37. if type(dg) == type(""): dg = [dg]
  38. m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
  39. return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]
  40. def forEdu(cv):
  41. if not cv.get("education_obj"):
  42. cv["integerity_flt"] *= 0.8
  43. return cv
  44. first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
  45. edu_nst = []
  46. edu_end_dt = ""
  47. cv["school_rank_int"] = 1000000
  48. for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
  49. e = {}
  50. if n.get("end_time"):
  51. if n["end_time"] > edu_end_dt: edu_end_dt = n["end_time"]
  52. try:
  53. dt = n["end_time"]
  54. if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
  55. y, m, d = getYMD(dt)
  56. ed_dt.append(str(y))
  57. e["end_dt_kwd"] = str(y)
  58. except Exception as e:
  59. pass
  60. if n.get("start_time"):
  61. try:
  62. dt = n["start_time"]
  63. if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
  64. y, m, d = getYMD(dt)
  65. st_dt.append(str(y))
  66. e["start_dt_kwd"] = str(y)
  67. except Exception as e:
  68. pass
  69. r = schools.select(n.get("school_name", ""))
  70. if r:
  71. if str(r.get("type", "")) == "1": fea.append("211")
  72. if str(r.get("type", "")) == "2": fea.append("211")
  73. if str(r.get("is_abroad", "")) == "1": fea.append("留学")
  74. if str(r.get("is_double_first", "")) == "1": fea.append("双一流")
  75. if str(r.get("is_985", "")) == "1": fea.append("985")
  76. if str(r.get("is_world_known", "")) == "1": fea.append("海外知名")
  77. if r.get("rank") and cv["school_rank_int"] > r["rank"]: cv["school_rank_int"] = r["rank"]
  78. if n.get("school_name") and isinstance(n["school_name"], str):
  79. sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
  80. e["sch_nm_kwd"] = sch[-1]
  81. fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
  82. if n.get("discipline_name") and isinstance(n["discipline_name"], str):
  83. maj.append(n["discipline_name"])
  84. e["major_kwd"] = n["discipline_name"]
  85. if not n.get("degree") and "985" in fea and not first_fea: n["degree"] = "1"
  86. if n.get("degree"):
  87. d = degrees.get_name(n["degree"])
  88. if d: e["degree_kwd"] = d
  89. if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)",
  90. n.get(
  91. "school_name",
  92. ""))): d = "专升本"
  93. if d: deg.append(d)
  94. # for first degree
  95. if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
  96. fdeg = [d]
  97. if n.get("school_name"): fsch = [n["school_name"]]
  98. if n.get("discipline_name"): fmaj = [n["discipline_name"]]
  99. first_fea = copy.deepcopy(fea)
  100. edu_nst.append(e)
  101. cv["sch_rank_kwd"] = []
  102. if cv["school_rank_int"] <= 20 \
  103. or ("海外名校" in fea and cv["school_rank_int"] <= 200):
  104. cv["sch_rank_kwd"].append("顶尖学校")
  105. elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
  106. or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \
  107. cv["school_rank_int"] > 200):
  108. cv["sch_rank_kwd"].append("精英学校")
  109. elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
  110. or ("海外名校" in fea and cv["school_rank_int"] > 500):
  111. cv["sch_rank_kwd"].append("优质学校")
  112. else:
  113. cv["sch_rank_kwd"].append("一般学校")
  114. if edu_nst: cv["edu_nst"] = edu_nst
  115. if fea: cv["edu_fea_kwd"] = list(set(fea))
  116. if first_fea: cv["edu_first_fea_kwd"] = list(set(first_fea))
  117. if maj: cv["major_kwd"] = maj
  118. if fsch: cv["first_school_name_kwd"] = fsch
  119. if fdeg: cv["first_degree_kwd"] = fdeg
  120. if fmaj: cv["first_major_kwd"] = fmaj
  121. if st_dt: cv["edu_start_kwd"] = st_dt
  122. if ed_dt: cv["edu_end_kwd"] = ed_dt
  123. if ed_dt: cv["edu_end_int"] = max([int(t) for t in ed_dt])
  124. if deg:
  125. if "本科" in deg and "专科" in deg:
  126. deg.append("专升本")
  127. deg = [d for d in deg if d != '本科']
  128. cv["degree_kwd"] = deg
  129. cv["highest_degree_kwd"] = highest_degree(deg)
  130. if edu_end_dt:
  131. try:
  132. if re.match(r"[0-9]{9,}", edu_end_dt): edu_end_dt = turnTm2Dt(edu_end_dt)
  133. if edu_end_dt.strip("\n") == "至今": edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
  134. y, m, d = getYMD(edu_end_dt)
  135. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  136. except Exception as e:
  137. print("EXCEPTION: ", e, edu_end_dt, cv.get("work_exp_flt"))
  138. if sch:
  139. cv["school_name_kwd"] = sch
  140. if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
  141. or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
  142. or not cv.get("degree_kwd"):
  143. for c in sch:
  144. if schools.is_good(c):
  145. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  146. cv["tag_kwd"].append("好学校")
  147. cv["tag_kwd"].append("好学历")
  148. break
  149. if (len(cv.get("degree_kwd", [])) >= 1 and \
  150. "本科" in cv["degree_kwd"] and \
  151. any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
  152. or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
  153. or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
  154. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  155. if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
  156. if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
  157. if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
  158. if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
  159. if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
  160. return cv
  161. def forProj(cv):
  162. if not cv.get("project_obj"): return cv
  163. pro_nms, desc = [], []
  164. for i, n in enumerate(
  165. sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if type(x) == type({}) else "",
  166. reverse=True)):
  167. if n.get("name"): pro_nms.append(n["name"])
  168. if n.get("describe"): desc.append(str(n["describe"]))
  169. if n.get("responsibilities"): desc.append(str(n["responsibilities"]))
  170. if n.get("achivement"): desc.append(str(n["achivement"]))
  171. if pro_nms:
  172. # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
  173. cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
  174. if desc:
  175. cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
  176. cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
  177. return cv
  178. def json_loads(line):
  179. return demjson3.decode(re.sub(r": *(True|False)", r": '\1'", line))
  180. def forWork(cv):
  181. if not cv.get("work_obj"):
  182. cv["integerity_flt"] *= 0.7
  183. return cv
  184. flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
  185. "industry_name", "subordinates_count"]
  186. duas = []
  187. scales = []
  188. fea = {c: [] for c in flds}
  189. latest_job_tm = ""
  190. goodcorp = False
  191. goodcorp_ = False
  192. work_st_tm = ""
  193. corp_tags = []
  194. for i, n in enumerate(
  195. sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if type(x) == type({}) else "",
  196. reverse=True)):
  197. if type(n) == type(""):
  198. try:
  199. n = json_loads(n)
  200. except Exception as e:
  201. continue
  202. if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
  203. for c in flds:
  204. if not n.get(c) or str(n[c]) == '0':
  205. fea[c].append("")
  206. continue
  207. if c == "corporation_name":
  208. n[c] = corporations.corpNorm(n[c], False)
  209. if corporations.is_good(n[c]):
  210. if i == 0:
  211. goodcorp = True
  212. else:
  213. goodcorp_ = True
  214. ct = corporations.corp_tag(n[c])
  215. if i == 0:
  216. corp_tags.extend(ct)
  217. elif ct and ct[0] != "软外":
  218. corp_tags.extend([f"{t}(曾)" for t in ct])
  219. fea[c].append(rmHtmlTag(str(n[c]).lower()))
  220. y, m, d = getYMD(n.get("start_time"))
  221. if not y or not m: continue
  222. st = "%s-%02d-%02d" % (y, int(m), int(d))
  223. latest_job_tm = st
  224. y, m, d = getYMD(n.get("end_time"))
  225. if (not y or not m) and i > 0: continue
  226. if not y or not m or int(y) > 2022: y, m, d = getYMD(str(n.get("updated_at", "")))
  227. if not y or not m: continue
  228. ed = "%s-%02d-%02d" % (y, int(m), int(d))
  229. try:
  230. duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
  231. except Exception as e:
  232. print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time"))
  233. if n.get("scale"):
  234. r = re.search(r"^([0-9]+)", str(n["scale"]))
  235. if r: scales.append(int(r.group(1)))
  236. if goodcorp:
  237. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  238. cv["tag_kwd"].append("好公司")
  239. if goodcorp_:
  240. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  241. cv["tag_kwd"].append("好公司(曾)")
  242. if corp_tags:
  243. if "tag_kwd" not in cv: cv["tag_kwd"] = []
  244. cv["tag_kwd"].extend(corp_tags)
  245. cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]
  246. if latest_job_tm: cv["latest_job_dt"] = latest_job_tm
  247. if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
  248. if fea["position_name"]:
  249. cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
  250. cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
  251. cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
  252. if fea["industry_name"]:
  253. cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
  254. cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
  255. cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
  256. if fea["corporation_name"]:
  257. cv["corporation_name_kwd"] = fea["corporation_name"][0]
  258. cv["corp_nm_kwd"] = fea["corporation_name"]
  259. cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
  260. cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
  261. cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
  262. if fea["responsibilities"]:
  263. cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
  264. cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
  265. if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
  266. re.match(r"[^0-9]+$", str(i))]
  267. if fea["subordinates_count"]: cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
  268. if type(cv.get("corporation_id")) == type(1): cv["corporation_id"] = [str(cv["corporation_id"])]
  269. if not cv.get("corporation_id"): cv["corporation_id"] = []
  270. for i in cv.get("corporation_id", []):
  271. cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)
  272. if work_st_tm:
  273. try:
  274. if re.match(r"[0-9]{9,}", work_st_tm): work_st_tm = turnTm2Dt(work_st_tm)
  275. y, m, d = getYMD(work_st_tm)
  276. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  277. except Exception as e:
  278. print("EXCEPTION: ", e, work_st_tm, cv.get("work_exp_flt"))
  279. cv["job_num_int"] = 0
  280. if duas:
  281. cv["dua_flt"] = np.mean(duas)
  282. cv["cur_dua_int"] = duas[0]
  283. cv["job_num_int"] = len(duas)
  284. if scales: cv["scale_flt"] = np.max(scales)
  285. return cv
  286. def turnTm2Dt(b):
  287. if not b: return
  288. b = str(b).strip()
  289. if re.match(r"[0-9]{10,}", b): b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
  290. return b
  291. def getYMD(b):
  292. y, m, d = "", "", "01"
  293. if not b: return (y, m, d)
  294. b = turnTm2Dt(b)
  295. if re.match(r"[0-9]{4}", b): y = int(b[:4])
  296. r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
  297. if r: m = r.group(1)
  298. r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
  299. if r: d = r.group(1)
  300. if not d or int(d) == 0 or int(d) > 31: d = "1"
  301. if not m or int(m) > 12 or int(m) < 1: m = "1"
  302. return (y, m, d)
  303. def birth(cv):
  304. if not cv.get("birth"):
  305. cv["integerity_flt"] *= 0.9
  306. return cv
  307. y, m, d = getYMD(cv["birth"])
  308. if not m or not y: return cv
  309. b = "%s-%02d-%02d" % (y, int(m), int(d))
  310. cv["birth_dt"] = b
  311. cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))
  312. cv["age_int"] = datetime.datetime.now().year - int(y)
  313. return cv
  314. def parse(cv):
  315. for k in cv.keys():
  316. if cv[k] == '\\N': cv[k] = ''
  317. # cv = cv.asDict()
  318. tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
  319. "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
  320. "position_name", "school_name", "self_remark", "title_name"]
  321. small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
  322. kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
  323. "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
  324. "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
  325. num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
  326. "expect_salary_to", "salary_month"]
  327. is_fld = [
  328. ("is_fertility", "已育", "未育"),
  329. ("is_house", "有房", "没房"),
  330. ("is_management_experience", "有管理经验", "无管理经验"),
  331. ("is_marital", "已婚", "未婚"),
  332. ("is_oversea", "有海外经验", "无海外经验")
  333. ]
  334. rmkeys = []
  335. for k in cv.keys():
  336. if cv[k] is None: rmkeys.append(k)
  337. if (type(cv[k]) == type([]) or type(cv[k]) == type("")) and len(cv[k]) == 0: rmkeys.append(k)
  338. for k in rmkeys: del cv[k]
  339. integerity = 0.
  340. flds_num = 0.
  341. def hasValues(flds):
  342. nonlocal integerity, flds_num
  343. flds_num += len(flds)
  344. for f in flds:
  345. v = str(cv.get(f, ""))
  346. if len(v) > 0 and v != '0' and v != '[]': integerity += 1
  347. hasValues(tks_fld)
  348. hasValues(small_tks_fld)
  349. hasValues(kwd_fld)
  350. hasValues(num_fld)
  351. cv["integerity_flt"] = integerity / flds_num
  352. if cv.get("corporation_type"):
  353. for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
  354. (r"[//.· <\((]+.*", ""),
  355. (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
  356. (r".*(机关|事业).*", "机关"),
  357. (r".*(非盈利|Non-profit).*", "非盈利"),
  358. (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
  359. (r".*国有.*", "国企"),
  360. (r"[ ()\(\)人/·0-9-]+", ""),
  361. (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
  362. cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
  363. if len(cv["corporation_type"]) < 2: del cv["corporation_type"]
  364. if cv.get("political_status"):
  365. for p, r in [
  366. (r".*党员.*", "党员"),
  367. (r".*(无党派|公民).*", "群众"),
  368. (r".*团员.*", "团员")]:
  369. cv["political_status"] = re.sub(p, r, cv["political_status"])
  370. if not re.search(r"[党团群]", cv["political_status"]): del cv["political_status"]
  371. if cv.get("phone"): cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))
  372. keys = list(cv.keys())
  373. for k in keys:
  374. # deal with json objects
  375. if k.find("_obj") > 0:
  376. try:
  377. cv[k] = json_loads(cv[k])
  378. cv[k] = [a for _, a in cv[k].items()]
  379. nms = []
  380. for n in cv[k]:
  381. if type(n) != type({}) or "name" not in n or not n.get("name"): continue
  382. n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
  383. if not n["name"]: continue
  384. nms.append(n["name"])
  385. if nms:
  386. t = k[:-4]
  387. cv[f"{t}_kwd"] = nms
  388. cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
  389. except Exception as e:
  390. print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
  391. cv[k] = []
  392. # tokenize fields
  393. if k in tks_fld:
  394. cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
  395. if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
  396. # keyword fields
  397. if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
  398. for n in re.split(r"[\t,,;;. ]",
  399. re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
  400. ) if n]
  401. if k in num_fld and cv.get(k): cv[f"{k}_int"] = cv[k]
  402. cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
  403. # for name field
  404. if cv.get("name"):
  405. nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
  406. nm = re.sub(r"[ \t ]+", " ", nm)
  407. if re.match(r"[a-zA-Z ]+$", nm):
  408. if len(nm.split(" ")) > 1:
  409. cv["name"] = nm
  410. else:
  411. nm = ""
  412. elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
  413. nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
  414. else:
  415. nm = ""
  416. cv["name"] = nm.strip()
  417. name = cv["name"]
  418. # name pingyin and its prefix
  419. cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
  420. cv["name_py_pref0_tks"] = ""
  421. cv["name_py_pref_tks"] = ""
  422. for py in PY.get_pinyins(nm[:20], ''):
  423. for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
  424. for py in PY.get_pinyins(nm[:20], ' '):
  425. py = py.split(" ")
  426. for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])
  427. cv["name_kwd"] = name
  428. cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
  429. cv["name_tks"] = (
  430. rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
  431. ) if name else ""
  432. else:
  433. cv["integerity_flt"] /= 2.
  434. if cv.get("phone"):
  435. r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
  436. if not r:
  437. cv["phone"] = ""
  438. else:
  439. cv["phone"] = r.group(1)
  440. # deal with date fields
  441. if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
  442. cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
  443. else:
  444. y, m, d = getYMD(str(cv.get("updated_at", "")))
  445. if not y: y = "2012"
  446. if not m: m = "01"
  447. if not d: d = "01"
  448. cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
  449. # long text tokenize
  450. if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
  451. # for yes or no field
  452. fea = []
  453. for f, y, n in is_fld:
  454. if f not in cv: continue
  455. if cv[f] == '是': fea.append(y)
  456. if cv[f] == '否': fea.append(n)
  457. if fea: cv["tag_kwd"] = fea
  458. cv = forEdu(cv)
  459. cv = forProj(cv)
  460. cv = forWork(cv)
  461. cv = birth(cv)
  462. cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
  463. for i in range(len(cv["corp_proj_sch_deg_kwd"])):
  464. for j in cv.get("sch_rank_kwd", []): cv["corp_proj_sch_deg_kwd"][i] += "+" + j
  465. for i in range(len(cv["corp_proj_sch_deg_kwd"])):
  466. if cv.get("highest_degree_kwd"): cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]
  467. try:
  468. if not cv.get("work_exp_flt") and cv.get("work_start_time"):
  469. if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
  470. cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
  471. cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
  472. elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
  473. y, m, d = getYMD(str(cv["work_start_time"]))
  474. cv["work_start_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
  475. cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
  476. except Exception as e:
  477. print("【EXCEPTION】", e, "==>", cv.get("work_start_time"))
  478. if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
  479. keys = list(cv.keys())
  480. for k in keys:
  481. if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k): del cv[k]
  482. for k in cv.keys():
  483. if not re.search("_(kwd|id)$", k) or type(cv[k]) != type([]): continue
  484. cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
  485. keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
  486. for k in keys:
  487. if cv[k] <= 0: del cv[k]
  488. cv["tob_resume_id"] = str(cv["tob_resume_id"])
  489. cv["id"] = cv["tob_resume_id"]
  490. print("CCCCCCCCCCCCCCC")
  491. return dealWithInt64(cv)
  492. def dealWithInt64(d):
  493. if isinstance(d, dict):
  494. for n, v in d.items():
  495. d[n] = dealWithInt64(v)
  496. if isinstance(d, list):
  497. d = [dealWithInt64(t) for t in d]
  498. if isinstance(d, np.integer): d = int(d)
  499. return d