Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

step_two.py 25KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import logging
  17. import re
  18. import copy
  19. import time
  20. import datetime
  21. import demjson3
  22. import traceback
  23. import signal
  24. import numpy as np
  25. from deepdoc.parser.resume.entities import degrees, schools, corporations
  26. from rag.nlp import rag_tokenizer, surname
  27. from xpinyin import Pinyin
  28. from contextlib import contextmanager
class TimeoutException(Exception):
    """Raised by the time_limit() context manager when the wrapped block exceeds its time budget."""
    pass
  31. @contextmanager
  32. def time_limit(seconds):
  33. def signal_handler(signum, frame):
  34. raise TimeoutException("Timed out!")
  35. signal.signal(signal.SIGALRM, signal_handler)
  36. signal.alarm(seconds)
  37. try:
  38. yield
  39. finally:
  40. signal.alarm(0)
# Environment marker; appears unused within this module — presumably set by callers. TODO confirm.
ENV = None
# Shared pinyin converter used when indexing candidate names in parse().
PY = Pinyin()
  43. def rmHtmlTag(line):
  44. return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, count=100000, flags=re.IGNORECASE)
  45. def highest_degree(dg):
  46. if not dg:
  47. return ""
  48. if isinstance(dg, str):
  49. dg = [dg]
  50. m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
  51. return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]
  52. def forEdu(cv):
  53. if not cv.get("education_obj"):
  54. cv["integerity_flt"] *= 0.8
  55. return cv
  56. first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
  57. edu_nst = []
  58. edu_end_dt = ""
  59. cv["school_rank_int"] = 1000000
  60. for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
  61. e = {}
  62. if n.get("end_time"):
  63. if n["end_time"] > edu_end_dt:
  64. edu_end_dt = n["end_time"]
  65. try:
  66. dt = n["end_time"]
  67. if re.match(r"[0-9]{9,}", dt):
  68. dt = turnTm2Dt(dt)
  69. y, m, d = getYMD(dt)
  70. ed_dt.append(str(y))
  71. e["end_dt_kwd"] = str(y)
  72. except Exception as e:
  73. pass
  74. if n.get("start_time"):
  75. try:
  76. dt = n["start_time"]
  77. if re.match(r"[0-9]{9,}", dt):
  78. dt = turnTm2Dt(dt)
  79. y, m, d = getYMD(dt)
  80. st_dt.append(str(y))
  81. e["start_dt_kwd"] = str(y)
  82. except Exception:
  83. pass
  84. r = schools.select(n.get("school_name", ""))
  85. if r:
  86. if str(r.get("type", "")) == "1":
  87. fea.append("211")
  88. if str(r.get("type", "")) == "2":
  89. fea.append("211")
  90. if str(r.get("is_abroad", "")) == "1":
  91. fea.append("留学")
  92. if str(r.get("is_double_first", "")) == "1":
  93. fea.append("双一流")
  94. if str(r.get("is_985", "")) == "1":
  95. fea.append("985")
  96. if str(r.get("is_world_known", "")) == "1":
  97. fea.append("海外知名")
  98. if r.get("rank") and cv["school_rank_int"] > r["rank"]:
  99. cv["school_rank_int"] = r["rank"]
  100. if n.get("school_name") and isinstance(n["school_name"], str):
  101. sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
  102. e["sch_nm_kwd"] = sch[-1]
  103. fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])
  104. if n.get("discipline_name") and isinstance(n["discipline_name"], str):
  105. maj.append(n["discipline_name"])
  106. e["major_kwd"] = n["discipline_name"]
  107. if not n.get("degree") and "985" in fea and not first_fea:
  108. n["degree"] = "1"
  109. if n.get("degree"):
  110. d = degrees.get_name(n["degree"])
  111. if d:
  112. e["degree_kwd"] = d
  113. if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name",""))):
  114. d = "专升本"
  115. if d:
  116. deg.append(d)
  117. # for first degree
  118. if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
  119. fdeg = [d]
  120. if n.get("school_name"):
  121. fsch = [n["school_name"]]
  122. if n.get("discipline_name"):
  123. fmaj = [n["discipline_name"]]
  124. first_fea = copy.deepcopy(fea)
  125. edu_nst.append(e)
  126. cv["sch_rank_kwd"] = []
  127. if cv["school_rank_int"] <= 20 \
  128. or ("海外名校" in fea and cv["school_rank_int"] <= 200):
  129. cv["sch_rank_kwd"].append("顶尖学校")
  130. elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
  131. or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \
  132. cv["school_rank_int"] > 200):
  133. cv["sch_rank_kwd"].append("精英学校")
  134. elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
  135. or ("海外名校" in fea and cv["school_rank_int"] > 500):
  136. cv["sch_rank_kwd"].append("优质学校")
  137. else:
  138. cv["sch_rank_kwd"].append("一般学校")
  139. if edu_nst:
  140. cv["edu_nst"] = edu_nst
  141. if fea:
  142. cv["edu_fea_kwd"] = list(set(fea))
  143. if first_fea:
  144. cv["edu_first_fea_kwd"] = list(set(first_fea))
  145. if maj:
  146. cv["major_kwd"] = maj
  147. if fsch:
  148. cv["first_school_name_kwd"] = fsch
  149. if fdeg:
  150. cv["first_degree_kwd"] = fdeg
  151. if fmaj:
  152. cv["first_major_kwd"] = fmaj
  153. if st_dt:
  154. cv["edu_start_kwd"] = st_dt
  155. if ed_dt:
  156. cv["edu_end_kwd"] = ed_dt
  157. if ed_dt:
  158. cv["edu_end_int"] = max([int(t) for t in ed_dt])
  159. if deg:
  160. if "本科" in deg and "专科" in deg:
  161. deg.append("专升本")
  162. deg = [d for d in deg if d != '本科']
  163. cv["degree_kwd"] = deg
  164. cv["highest_degree_kwd"] = highest_degree(deg)
  165. if edu_end_dt:
  166. try:
  167. if re.match(r"[0-9]{9,}", edu_end_dt):
  168. edu_end_dt = turnTm2Dt(edu_end_dt)
  169. if edu_end_dt.strip("\n") == "至今":
  170. edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
  171. y, m, d = getYMD(edu_end_dt)
  172. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  173. except Exception as e:
  174. logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
  175. if sch:
  176. cv["school_name_kwd"] = sch
  177. if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
  178. or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
  179. or not cv.get("degree_kwd"):
  180. for c in sch:
  181. if schools.is_good(c):
  182. if "tag_kwd" not in cv:
  183. cv["tag_kwd"] = []
  184. cv["tag_kwd"].append("好学校")
  185. cv["tag_kwd"].append("好学历")
  186. break
  187. if (len(cv.get("degree_kwd", [])) >= 1 and \
  188. "本科" in cv["degree_kwd"] and \
  189. any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
  190. or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
  191. or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
  192. if "tag_kwd" not in cv:
  193. cv["tag_kwd"] = []
  194. if "好学历" not in cv["tag_kwd"]:
  195. cv["tag_kwd"].append("好学历")
  196. if cv.get("major_kwd"):
  197. cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
  198. if cv.get("school_name_kwd"):
  199. cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
  200. if cv.get("first_school_name_kwd"):
  201. cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
  202. if cv.get("first_major_kwd"):
  203. cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
  204. return cv
  205. def forProj(cv):
  206. if not cv.get("project_obj"):
  207. return cv
  208. pro_nms, desc = [], []
  209. for i, n in enumerate(
  210. sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
  211. reverse=True)):
  212. if n.get("name"):
  213. pro_nms.append(n["name"])
  214. if n.get("describe"):
  215. desc.append(str(n["describe"]))
  216. if n.get("responsibilities"):
  217. desc.append(str(n["responsibilities"]))
  218. if n.get("achivement"):
  219. desc.append(str(n["achivement"]))
  220. if pro_nms:
  221. # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
  222. cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
  223. if desc:
  224. cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
  225. cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
  226. return cv
def json_loads(line):
    # demjson3 tolerates the loosely-formatted JSON found in raw resume dumps.
    # Bare Python-style booleans (True/False) are first quoted so the decoder
    # keeps them as the strings 'True'/'False' instead of failing on them.
    return demjson3.decode(re.sub(r": *(True|False)", r": '\1'", line))
  229. def forWork(cv):
  230. if not cv.get("work_obj"):
  231. cv["integerity_flt"] *= 0.7
  232. return cv
  233. flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
  234. "industry_name", "subordinates_count"]
  235. duas = []
  236. scales = []
  237. fea = {c: [] for c in flds}
  238. latest_job_tm = ""
  239. goodcorp = False
  240. goodcorp_ = False
  241. work_st_tm = ""
  242. corp_tags = []
  243. for i, n in enumerate(
  244. sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
  245. reverse=True)):
  246. if isinstance(n, str):
  247. try:
  248. n = json_loads(n)
  249. except Exception:
  250. continue
  251. if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
  252. work_st_tm = n["start_time"]
  253. for c in flds:
  254. if not n.get(c) or str(n[c]) == '0':
  255. fea[c].append("")
  256. continue
  257. if c == "corporation_name":
  258. n[c] = corporations.corpNorm(n[c], False)
  259. if corporations.is_good(n[c]):
  260. if i == 0:
  261. goodcorp = True
  262. else:
  263. goodcorp_ = True
  264. ct = corporations.corp_tag(n[c])
  265. if i == 0:
  266. corp_tags.extend(ct)
  267. elif ct and ct[0] != "软外":
  268. corp_tags.extend([f"{t}(曾)" for t in ct])
  269. fea[c].append(rmHtmlTag(str(n[c]).lower()))
  270. y, m, d = getYMD(n.get("start_time"))
  271. if not y or not m:
  272. continue
  273. st = "%s-%02d-%02d" % (y, int(m), int(d))
  274. latest_job_tm = st
  275. y, m, d = getYMD(n.get("end_time"))
  276. if (not y or not m) and i > 0:
  277. continue
  278. if not y or not m or int(y) > 2022:
  279. y, m, d = getYMD(str(n.get("updated_at", "")))
  280. if not y or not m:
  281. continue
  282. ed = "%s-%02d-%02d" % (y, int(m), int(d))
  283. try:
  284. duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
  285. except Exception:
  286. logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
  287. if n.get("scale"):
  288. r = re.search(r"^([0-9]+)", str(n["scale"]))
  289. if r:
  290. scales.append(int(r.group(1)))
  291. if goodcorp:
  292. if "tag_kwd" not in cv:
  293. cv["tag_kwd"] = []
  294. cv["tag_kwd"].append("好公司")
  295. if goodcorp_:
  296. if "tag_kwd" not in cv:
  297. cv["tag_kwd"] = []
  298. cv["tag_kwd"].append("好公司(曾)")
  299. if corp_tags:
  300. if "tag_kwd" not in cv:
  301. cv["tag_kwd"] = []
  302. cv["tag_kwd"].extend(corp_tags)
  303. cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]
  304. if latest_job_tm:
  305. cv["latest_job_dt"] = latest_job_tm
  306. if fea["corporation_id"]:
  307. cv["corporation_id"] = fea["corporation_id"]
  308. if fea["position_name"]:
  309. cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
  310. cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
  311. cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
  312. if fea["industry_name"]:
  313. cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
  314. cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
  315. cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
  316. if fea["corporation_name"]:
  317. cv["corporation_name_kwd"] = fea["corporation_name"][0]
  318. cv["corp_nm_kwd"] = fea["corporation_name"]
  319. cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
  320. cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
  321. cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
  322. if fea["responsibilities"]:
  323. cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
  324. cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
  325. if fea["subordinates_count"]:
  326. fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
  327. re.match(r"[^0-9]+$", str(i))]
  328. if fea["subordinates_count"]:
  329. cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
  330. if isinstance(cv.get("corporation_id"), int):
  331. cv["corporation_id"] = [str(cv["corporation_id"])]
  332. if not cv.get("corporation_id"):
  333. cv["corporation_id"] = []
  334. for i in cv.get("corporation_id", []):
  335. cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)
  336. if work_st_tm:
  337. try:
  338. if re.match(r"[0-9]{9,}", work_st_tm):
  339. work_st_tm = turnTm2Dt(work_st_tm)
  340. y, m, d = getYMD(work_st_tm)
  341. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  342. except Exception as e:
  343. logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
  344. cv["job_num_int"] = 0
  345. if duas:
  346. cv["dua_flt"] = np.mean(duas)
  347. cv["cur_dua_int"] = duas[0]
  348. cv["job_num_int"] = len(duas)
  349. if scales:
  350. cv["scale_flt"] = np.max(scales)
  351. return cv
  352. def turnTm2Dt(b):
  353. if not b:
  354. return
  355. b = str(b).strip()
  356. if re.match(r"[0-9]{10,}", b):
  357. b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
  358. return b
  359. def getYMD(b):
  360. y, m, d = "", "", "01"
  361. if not b:
  362. return (y, m, d)
  363. b = turnTm2Dt(b)
  364. if re.match(r"[0-9]{4}", b):
  365. y = int(b[:4])
  366. r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
  367. if r:
  368. m = r.group(1)
  369. r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
  370. if r:
  371. d = r.group(1)
  372. if not d or int(d) == 0 or int(d) > 31:
  373. d = "1"
  374. if not m or int(m) > 12 or int(m) < 1:
  375. m = "1"
  376. return (y, m, d)
  377. def birth(cv):
  378. if not cv.get("birth"):
  379. cv["integerity_flt"] *= 0.9
  380. return cv
  381. y, m, d = getYMD(cv["birth"])
  382. if not m or not y:
  383. return cv
  384. b = "%s-%02d-%02d" % (y, int(m), int(d))
  385. cv["birth_dt"] = b
  386. cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))
  387. cv["age_int"] = datetime.datetime.now().year - int(y)
  388. return cv
def parse(cv):
    """Normalize a raw resume dict into an indexable document.

    Pipeline: blank out dump NULLs, drop empty keys, score field completeness
    (integerity_flt), clean corporation_type / political_status / phone,
    expand "*_obj" JSON blobs and tokenizable fields into *_tks / *_sm_tks /
    *_kwd / *_int variants, normalize the candidate name (plus pinyin
    prefixes), derive date fields, run the forEdu/forProj/forWork/birth
    sub-parsers, estimate work experience, and finally keep only keys whose
    suffix marks them as index fields.  Returns the cleaned dict with numpy
    ints converted to Python ints.
    """
    # '\\N' is the dump's NULL marker; blank it out.
    for k in cv.keys():
        if cv[k] == '\\N':
            cv[k] = ''
    # cv = cv.asDict()
    # Field groups: full tokenization, fine-grained tokenization, keyword
    # extraction, and numeric copy, respectively.
    tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
               "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
               "position_name", "school_name", "self_remark", "title_name"]
    small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
    kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
               "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
               "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
    num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
               "expect_salary_to", "salary_month"]

    # Yes/no fields: (source key, tag when '是'/yes, tag when '否'/no).
    is_fld = [
        ("is_fertility", "已育", "未育"),
        ("is_house", "有房", "没房"),
        ("is_management_experience", "有管理经验", "无管理经验"),
        ("is_marital", "已婚", "未婚"),
        ("is_oversea", "有海外经验", "无海外经验")
    ]

    # Remove None and empty list/str values up front.
    rmkeys = []
    for k in cv.keys():
        if cv[k] is None:
            rmkeys.append(k)
        if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
            rmkeys.append(k)
    for k in rmkeys:
        del cv[k]

    # Completeness score: fraction of the field groups above that carry a
    # non-trivial value.
    integerity = 0.
    flds_num = 0.

    def hasValues(flds):
        # Count how many of `flds` are populated with something meaningful.
        nonlocal integerity, flds_num
        flds_num += len(flds)
        for f in flds:
            v = str(cv.get(f, ""))
            if len(v) > 0 and v != '0' and v != '[]':
                integerity += 1

    hasValues(tks_fld)
    hasValues(small_tks_fld)
    hasValues(kwd_fld)
    hasValues(num_fld)
    cv["integerity_flt"] = integerity / flds_num

    # Collapse free-text company type into a small set of categories; the
    # substitutions are applied in order, so later ones see earlier results.
    if cv.get("corporation_type"):
        for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
                     (r"[//.· <\((]+.*", ""),
                     (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
                     (r".*(机关|事业).*", "机关"),
                     (r".*(非盈利|Non-profit).*", "非盈利"),
                     (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
                     (r".*国有.*", "国企"),
                     (r"[ ()\(\)人/·0-9-]+", ""),
                     (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE)
        if len(cv["corporation_type"]) < 2:
            del cv["corporation_type"]

    # Normalize political status to one of 党员/群众/团员; drop unrecognized values.
    if cv.get("political_status"):
        for p, r in [
            (r".*党员.*", "党员"),
            (r".*(无党派|公民).*", "群众"),
            (r".*团员.*", "团员")]:
            cv["political_status"] = re.sub(p, r, cv["political_status"])
        if not re.search(r"[党团群]", cv["political_status"]):
            del cv["political_status"]

    # Strip non-digits and a leading +86 country code from the phone number.
    if cv.get("phone"):
        cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))

    keys = list(cv.keys())
    for k in keys:
        # deal with json objects
        if k.find("_obj") > 0:
            try:
                cv[k] = json_loads(cv[k])
                cv[k] = [a for _, a in cv[k].items()]
                nms = []
                for n in cv[k]:
                    if not isinstance(n, dict) or "name" not in n or not n.get("name"):
                        continue
                    n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
                    if not n["name"]:
                        continue
                    nms.append(n["name"])
                if nms:
                    t = k[:-4]
                    cv[f"{t}_kwd"] = nms
                    cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
            except Exception:
                logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
                cv[k] = []

        # tokenize fields
        if k in tks_fld:
            cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
        if k in small_tks_fld:
            cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])

        # keyword fields
        if k in kwd_fld:
            cv[f"{k}_kwd"] = [n.lower()
                              for n in re.split(r"[\t,,;;. ]",
                                                re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
                                                ) if n]

        if k in num_fld and cv.get(k):
            cv[f"{k}_int"] = cv[k]

    cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
    # for name field
    if cv.get("name"):
        # Cut everything after separators/parens, then squeeze whitespace.
        nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
        nm = re.sub(r"[ \t ]+", " ", nm)
        if re.match(r"[a-zA-Z ]+$", nm):
            # Pure-Latin names are kept only when they have at least two words.
            if len(nm.split()) > 1:
                cv["name"] = nm
            else:
                nm = ""
        elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
            # Looks like a Chinese name (known surname): keep at most 5 chars,
            # dropping any trailing Latin text.
            nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
        else:
            nm = ""
        cv["name"] = nm.strip()
        name = cv["name"]

        # name pingyin and its prefix
        cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
        cv["name_py_pref0_tks"] = ""
        cv["name_py_pref_tks"] = ""
        # All letter-level prefixes of the joined pinyin (length >= 2).
        for py in PY.get_pinyins(nm[:20], ''):
            for i in range(2, len(py) + 1):
                cv["name_py_pref_tks"] += " " + py[:i]
        # All syllable-level prefixes of the space-separated pinyin.
        for py in PY.get_pinyins(nm[:20], ' '):
            py = py.split()
            for i in range(1, len(py) + 1):
                cv["name_py_pref0_tks"] += " " + "".join(py[:i])

        cv["name_kwd"] = name
        cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
        # Non-Latin names additionally get per-character tokens.
        cv["name_tks"] = (
            rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
        ) if name else ""
    else:
        cv["integerity_flt"] /= 2.

    # Keep only a valid 11-digit mainland mobile number.
    if cv.get("phone"):
        r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
        if not r:
            cv["phone"] = ""
        else:
            cv["phone"] = r.group(1)

    # deal with date fields
    if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
        cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
    else:
        y, m, d = getYMD(str(cv.get("updated_at", "")))
        if not y:
            y = "2012"
        if not m:
            m = "01"
        if not d:
            d = "01"
        cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
    # long text tokenize
    if cv.get("responsibilities"):
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))

    # for yes or no field
    fea = []
    for f, y, n in is_fld:
        if f not in cv:
            continue
        if cv[f] == '是':
            fea.append(y)
        if cv[f] == '否':
            fea.append(n)

    if fea:
        cv["tag_kwd"] = fea

    cv = forEdu(cv)
    cv = forProj(cv)
    cv = forWork(cv)
    cv = birth(cv)

    # Combined "company tag + school tier + highest degree" keyword.
    cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        for j in cv.get("sch_rank_kwd", []):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + j
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        if cv.get("highest_degree_kwd"):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]

    # Fallback work-experience estimate from an explicit start time
    # (epoch-milliseconds or "YYYY-..." string).
    try:
        if not cv.get("work_exp_flt") and cv.get("work_start_time"):
            if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
                cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
                cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
            elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
                y, m, d = getYMD(str(cv["work_start_time"]))
                cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
                cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
    except Exception as e:
        logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
    if "work_exp_flt" not in cv and cv.get("work_experience", 0):
        # work_experience is presumably in months — TODO confirm against source schema.
        cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

    # Keep only keys whose suffix marks them as index fields.
    keys = list(cv.keys())
    for k in keys:
        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
            del cv[k]
    # Deduplicate keyword/id lists and strip a trailing "市" (city suffix).
    for k in cv.keys():
        if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
            continue
        cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))

    # Drop non-positive feature fields.
    keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
    for k in keys:
        if cv[k] <= 0:
            del cv[k]

    cv["tob_resume_id"] = str(cv["tob_resume_id"])
    cv["id"] = cv["tob_resume_id"]
    logging.debug("CCCCCCCCCCCCCCC")

    return dealWithInt64(cv)
  596. def dealWithInt64(d):
  597. if isinstance(d, dict):
  598. for n, v in d.items():
  599. d[n] = dealWithInt64(v)
  600. if isinstance(d, list):
  601. d = [dealWithInt64(t) for t in d]
  602. if isinstance(d, np.integer):
  603. d = int(d)
  604. return d