# step_two.py — resume parsing pipeline, step two (field extraction & normalization)
  1. # Licensed under the Apache License, Version 2.0 (the "License");
  2. # you may not use this file except in compliance with the License.
  3. # You may obtain a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. import logging
  14. import re
  15. import copy
  16. import time
  17. import datetime
  18. import demjson3
  19. import traceback
  20. import signal
  21. import numpy as np
  22. from deepdoc.parser.resume.entities import degrees, schools, corporations
  23. from rag.nlp import rag_tokenizer, surname
  24. from xpinyin import Pinyin
  25. from contextlib import contextmanager
  26. class TimeoutException(Exception):
  27. pass
  28. @contextmanager
  29. def time_limit(seconds):
  30. def signal_handler(signum, frame):
  31. raise TimeoutException("Timed out!")
  32. signal.signal(signal.SIGALRM, signal_handler)
  33. signal.alarm(seconds)
  34. try:
  35. yield
  36. finally:
  37. signal.alarm(0)
# Module-level configuration handle; NOTE(review): ENV appears unused in this
# chunk — it may be assigned elsewhere. Confirm before removing.
ENV = None
# Shared pinyin converter used by parse() to romanize candidate names.
PY = Pinyin()
  40. def rmHtmlTag(line):
  41. return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE)
  42. def highest_degree(dg):
  43. if not dg:
  44. return ""
  45. if isinstance(dg, str):
  46. dg = [dg]
  47. m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
  48. return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]
  49. def forEdu(cv):
  50. if not cv.get("education_obj"):
  51. cv["integerity_flt"] *= 0.8
  52. return cv
  53. first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
  54. edu_nst = []
  55. edu_end_dt = ""
  56. cv["school_rank_int"] = 1000000
  57. for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
  58. e = {}
  59. if n.get("end_time"):
  60. if n["end_time"] > edu_end_dt:
  61. edu_end_dt = n["end_time"]
  62. try:
  63. dt = n["end_time"]
  64. if re.match(r"[0-9]{9,}", dt):
  65. dt = turnTm2Dt(dt)
  66. y, m, d = getYMD(dt)
  67. ed_dt.append(str(y))
  68. e["end_dt_kwd"] = str(y)
  69. except Exception as e:
  70. pass
  71. if n.get("start_time"):
  72. try:
  73. dt = n["start_time"]
  74. if re.match(r"[0-9]{9,}", dt):
  75. dt = turnTm2Dt(dt)
  76. y, m, d = getYMD(dt)
  77. st_dt.append(str(y))
  78. e["start_dt_kwd"] = str(y)
  79. except Exception:
  80. pass
  81. r = schools.select(n.get("school_name", ""))
  82. if r:
  83. if str(r.get("type", "")) == "1":
  84. fea.append("211")
  85. if str(r.get("type", "")) == "2":
  86. fea.append("211")
  87. if str(r.get("is_abroad", "")) == "1":
  88. fea.append("留学")
  89. if str(r.get("is_double_first", "")) == "1":
  90. fea.append("双一流")
  91. if str(r.get("is_985", "")) == "1":
  92. fea.append("985")
  93. if str(r.get("is_world_known", "")) == "1":
  94. fea.append("海外知名")
  95. if r.get("rank") and cv["school_rank_int"] > r["rank"]:
  96. cv["school_rank_int"] = r["rank"]
  97. if n.get("school_name") and isinstance(n["school_name"], str):
  98. sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
  99. e["sch_nm_kwd"] = sch[-1]
  100. fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split()[-1])
  101. if n.get("discipline_name") and isinstance(n["discipline_name"], str):
  102. maj.append(n["discipline_name"])
  103. e["major_kwd"] = n["discipline_name"]
  104. if not n.get("degree") and "985" in fea and not first_fea:
  105. n["degree"] = "1"
  106. if n.get("degree"):
  107. d = degrees.get_name(n["degree"])
  108. if d:
  109. e["degree_kwd"] = d
  110. if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name",""))):
  111. d = "专升本"
  112. if d:
  113. deg.append(d)
  114. # for first degree
  115. if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
  116. fdeg = [d]
  117. if n.get("school_name"):
  118. fsch = [n["school_name"]]
  119. if n.get("discipline_name"):
  120. fmaj = [n["discipline_name"]]
  121. first_fea = copy.deepcopy(fea)
  122. edu_nst.append(e)
  123. cv["sch_rank_kwd"] = []
  124. if cv["school_rank_int"] <= 20 \
  125. or ("海外名校" in fea and cv["school_rank_int"] <= 200):
  126. cv["sch_rank_kwd"].append("顶尖学校")
  127. elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
  128. or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \
  129. cv["school_rank_int"] > 200):
  130. cv["sch_rank_kwd"].append("精英学校")
  131. elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
  132. or ("海外名校" in fea and cv["school_rank_int"] > 500):
  133. cv["sch_rank_kwd"].append("优质学校")
  134. else:
  135. cv["sch_rank_kwd"].append("一般学校")
  136. if edu_nst:
  137. cv["edu_nst"] = edu_nst
  138. if fea:
  139. cv["edu_fea_kwd"] = list(set(fea))
  140. if first_fea:
  141. cv["edu_first_fea_kwd"] = list(set(first_fea))
  142. if maj:
  143. cv["major_kwd"] = maj
  144. if fsch:
  145. cv["first_school_name_kwd"] = fsch
  146. if fdeg:
  147. cv["first_degree_kwd"] = fdeg
  148. if fmaj:
  149. cv["first_major_kwd"] = fmaj
  150. if st_dt:
  151. cv["edu_start_kwd"] = st_dt
  152. if ed_dt:
  153. cv["edu_end_kwd"] = ed_dt
  154. if ed_dt:
  155. cv["edu_end_int"] = max([int(t) for t in ed_dt])
  156. if deg:
  157. if "本科" in deg and "专科" in deg:
  158. deg.append("专升本")
  159. deg = [d for d in deg if d != '本科']
  160. cv["degree_kwd"] = deg
  161. cv["highest_degree_kwd"] = highest_degree(deg)
  162. if edu_end_dt:
  163. try:
  164. if re.match(r"[0-9]{9,}", edu_end_dt):
  165. edu_end_dt = turnTm2Dt(edu_end_dt)
  166. if edu_end_dt.strip("\n") == "至今":
  167. edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
  168. y, m, d = getYMD(edu_end_dt)
  169. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  170. except Exception as e:
  171. logging.exception("forEdu {} {} {}".format(e, edu_end_dt, cv.get("work_exp_flt")))
  172. if sch:
  173. cv["school_name_kwd"] = sch
  174. if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
  175. or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
  176. or not cv.get("degree_kwd"):
  177. for c in sch:
  178. if schools.is_good(c):
  179. if "tag_kwd" not in cv:
  180. cv["tag_kwd"] = []
  181. cv["tag_kwd"].append("好学校")
  182. cv["tag_kwd"].append("好学历")
  183. break
  184. if (len(cv.get("degree_kwd", [])) >= 1 and \
  185. "本科" in cv["degree_kwd"] and \
  186. any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
  187. or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
  188. or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
  189. if "tag_kwd" not in cv:
  190. cv["tag_kwd"] = []
  191. if "好学历" not in cv["tag_kwd"]:
  192. cv["tag_kwd"].append("好学历")
  193. if cv.get("major_kwd"):
  194. cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
  195. if cv.get("school_name_kwd"):
  196. cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
  197. if cv.get("first_school_name_kwd"):
  198. cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
  199. if cv.get("first_major_kwd"):
  200. cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
  201. return cv
  202. def forProj(cv):
  203. if not cv.get("project_obj"):
  204. return cv
  205. pro_nms, desc = [], []
  206. for i, n in enumerate(
  207. sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
  208. reverse=True)):
  209. if n.get("name"):
  210. pro_nms.append(n["name"])
  211. if n.get("describe"):
  212. desc.append(str(n["describe"]))
  213. if n.get("responsibilities"):
  214. desc.append(str(n["responsibilities"]))
  215. if n.get("achivement"):
  216. desc.append(str(n["achivement"]))
  217. if pro_nms:
  218. # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
  219. cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
  220. if desc:
  221. cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
  222. cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
  223. return cv
  224. def json_loads(line):
  225. return demjson3.decode(re.sub(r": *(True|False)", r": '\1'", line))
  226. def forWork(cv):
  227. if not cv.get("work_obj"):
  228. cv["integerity_flt"] *= 0.7
  229. return cv
  230. flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
  231. "industry_name", "subordinates_count"]
  232. duas = []
  233. scales = []
  234. fea = {c: [] for c in flds}
  235. latest_job_tm = ""
  236. goodcorp = False
  237. goodcorp_ = False
  238. work_st_tm = ""
  239. corp_tags = []
  240. for i, n in enumerate(
  241. sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
  242. reverse=True)):
  243. if isinstance(n, str):
  244. try:
  245. n = json_loads(n)
  246. except Exception:
  247. continue
  248. if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
  249. work_st_tm = n["start_time"]
  250. for c in flds:
  251. if not n.get(c) or str(n[c]) == '0':
  252. fea[c].append("")
  253. continue
  254. if c == "corporation_name":
  255. n[c] = corporations.corpNorm(n[c], False)
  256. if corporations.is_good(n[c]):
  257. if i == 0:
  258. goodcorp = True
  259. else:
  260. goodcorp_ = True
  261. ct = corporations.corp_tag(n[c])
  262. if i == 0:
  263. corp_tags.extend(ct)
  264. elif ct and ct[0] != "软外":
  265. corp_tags.extend([f"{t}(曾)" for t in ct])
  266. fea[c].append(rmHtmlTag(str(n[c]).lower()))
  267. y, m, d = getYMD(n.get("start_time"))
  268. if not y or not m:
  269. continue
  270. st = "%s-%02d-%02d" % (y, int(m), int(d))
  271. latest_job_tm = st
  272. y, m, d = getYMD(n.get("end_time"))
  273. if (not y or not m) and i > 0:
  274. continue
  275. if not y or not m or int(y) > 2022:
  276. y, m, d = getYMD(str(n.get("updated_at", "")))
  277. if not y or not m:
  278. continue
  279. ed = "%s-%02d-%02d" % (y, int(m), int(d))
  280. try:
  281. duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
  282. except Exception:
  283. logging.exception("forWork {} {}".format(n.get("start_time"), n.get("end_time")))
  284. if n.get("scale"):
  285. r = re.search(r"^([0-9]+)", str(n["scale"]))
  286. if r:
  287. scales.append(int(r.group(1)))
  288. if goodcorp:
  289. if "tag_kwd" not in cv:
  290. cv["tag_kwd"] = []
  291. cv["tag_kwd"].append("好公司")
  292. if goodcorp_:
  293. if "tag_kwd" not in cv:
  294. cv["tag_kwd"] = []
  295. cv["tag_kwd"].append("好公司(曾)")
  296. if corp_tags:
  297. if "tag_kwd" not in cv:
  298. cv["tag_kwd"] = []
  299. cv["tag_kwd"].extend(corp_tags)
  300. cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]
  301. if latest_job_tm:
  302. cv["latest_job_dt"] = latest_job_tm
  303. if fea["corporation_id"]:
  304. cv["corporation_id"] = fea["corporation_id"]
  305. if fea["position_name"]:
  306. cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
  307. cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
  308. cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
  309. if fea["industry_name"]:
  310. cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
  311. cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
  312. cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
  313. if fea["corporation_name"]:
  314. cv["corporation_name_kwd"] = fea["corporation_name"][0]
  315. cv["corp_nm_kwd"] = fea["corporation_name"]
  316. cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
  317. cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
  318. cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
  319. if fea["responsibilities"]:
  320. cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
  321. cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
  322. if fea["subordinates_count"]:
  323. fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
  324. re.match(r"[^0-9]+$", str(i))]
  325. if fea["subordinates_count"]:
  326. cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
  327. if isinstance(cv.get("corporation_id"), int):
  328. cv["corporation_id"] = [str(cv["corporation_id"])]
  329. if not cv.get("corporation_id"):
  330. cv["corporation_id"] = []
  331. for i in cv.get("corporation_id", []):
  332. cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)
  333. if work_st_tm:
  334. try:
  335. if re.match(r"[0-9]{9,}", work_st_tm):
  336. work_st_tm = turnTm2Dt(work_st_tm)
  337. y, m, d = getYMD(work_st_tm)
  338. cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
  339. except Exception as e:
  340. logging.exception("forWork {} {} {}".format(e, work_st_tm, cv.get("work_exp_flt")))
  341. cv["job_num_int"] = 0
  342. if duas:
  343. cv["dua_flt"] = np.mean(duas)
  344. cv["cur_dua_int"] = duas[0]
  345. cv["job_num_int"] = len(duas)
  346. if scales:
  347. cv["scale_flt"] = np.max(scales)
  348. return cv
  349. def turnTm2Dt(b):
  350. if not b:
  351. return
  352. b = str(b).strip()
  353. if re.match(r"[0-9]{10,}", b):
  354. b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
  355. return b
  356. def getYMD(b):
  357. y, m, d = "", "", "01"
  358. if not b:
  359. return (y, m, d)
  360. b = turnTm2Dt(b)
  361. if re.match(r"[0-9]{4}", b):
  362. y = int(b[:4])
  363. r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
  364. if r:
  365. m = r.group(1)
  366. r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
  367. if r:
  368. d = r.group(1)
  369. if not d or int(d) == 0 or int(d) > 31:
  370. d = "1"
  371. if not m or int(m) > 12 or int(m) < 1:
  372. m = "1"
  373. return (y, m, d)
  374. def birth(cv):
  375. if not cv.get("birth"):
  376. cv["integerity_flt"] *= 0.9
  377. return cv
  378. y, m, d = getYMD(cv["birth"])
  379. if not m or not y:
  380. return cv
  381. b = "%s-%02d-%02d" % (y, int(m), int(d))
  382. cv["birth_dt"] = b
  383. cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))
  384. cv["age_int"] = datetime.datetime.now().year - int(y)
  385. return cv
def parse(cv: dict) -> dict:
    """Normalize a raw resume dict into indexed output fields.

    Pipeline: scrub placeholder values -> score completeness -> normalize
    enum-like fields -> tokenize/keyword-ize text fields -> process the name
    (incl. pinyin forms) -> derive education/project/work/birth fields ->
    keep only suffix-coded output fields (_tks, _kwd, _int, _flt, ...).
    Mutates *cv* and returns it with numpy integers cast to Python ints.
    """
    # "\N" is a NULL placeholder (e.g. from a DB export) — blank it out.
    for k in cv.keys():
        if cv[k] == '\\N':
            cv[k] = ''
    # cv = cv.asDict()
    # Field groups driving the generic processing loop below:
    tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
               "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
               "position_name", "school_name", "self_remark", "title_name"]  # coarse tokenization
    small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]  # extra fine tokenization
    kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
               "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
               "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]  # keyword lists
    num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
               "expect_salary_to", "salary_month"]  # numeric passthrough

    # Yes/no fields mapped to (field, tag-if-是, tag-if-否).
    is_fld = [
        ("is_fertility", "已育", "未育"),
        ("is_house", "有房", "没房"),
        ("is_management_experience", "有管理经验", "无管理经验"),
        ("is_marital", "已婚", "未婚"),
        ("is_oversea", "有海外经验", "无海外经验")
    ]

    # Drop None values and empty lists/strings up front.
    rmkeys = []
    for k in cv.keys():
        if cv[k] is None:
            rmkeys.append(k)
        if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
            rmkeys.append(k)
    for k in rmkeys:
        del cv[k]

    # Completeness score: fraction of known fields carrying a real value.
    integerity = 0.
    flds_num = 0.

    def hasValues(flds):
        # Count fields in *flds* whose value is non-empty, not '0', not '[]'.
        nonlocal integerity, flds_num
        flds_num += len(flds)
        for f in flds:
            v = str(cv.get(f, ""))
            if len(v) > 0 and v != '0' and v != '[]':
                integerity += 1

    hasValues(tks_fld)
    hasValues(small_tks_fld)
    hasValues(kwd_fld)
    hasValues(num_fld)
    cv["integerity_flt"] = integerity / flds_num

    # Collapse free-form company-type strings into a small enum (民营/机关/...).
    if cv.get("corporation_type"):
        for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
                     (r"[//.· <\((]+.*", ""),
                     (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
                     (r".*(机关|事业).*", "机关"),
                     (r".*(非盈利|Non-profit).*", "非盈利"),
                     (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
                     (r".*国有.*", "国企"),
                     (r"[ ()\(\)人/·0-9-]+", ""),
                     (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
            # NOTE(review): positional count/flags args to re.sub are
            # deprecated since Python 3.13 — consider keyword arguments.
            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
        if len(cv["corporation_type"]) < 2:
            del cv["corporation_type"]

    # Normalize political status to 党员/群众/团员, else drop it.
    if cv.get("political_status"):
        for p, r in [
            (r".*党员.*", "党员"),
            (r".*(无党派|公民).*", "群众"),
            (r".*团员.*", "团员")]:
            cv["political_status"] = re.sub(p, r, cv["political_status"])
        if not re.search(r"[党团群]", cv["political_status"]):
            del cv["political_status"]

    # Keep digits only and strip a leading 86 country code from the phone.
    if cv.get("phone"):
        cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))

    keys = list(cv.keys())
    for k in keys:
        # deal with json objects
        if k.find("_obj") > 0:
            try:
                cv[k] = json_loads(cv[k])
                # assumes the decoded value is a dict of entries — a non-dict
                # would raise here and be handled below. TODO confirm schema.
                cv[k] = [a for _, a in cv[k].items()]
                nms = []
                for n in cv[k]:
                    if not isinstance(n, dict) or "name" not in n or not n.get("name"):
                        continue
                    n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
                    if not n["name"]:
                        continue
                    nms.append(n["name"])
                if nms:
                    t = k[:-4]  # strip the "_obj" suffix
                    cv[f"{t}_kwd"] = nms
                    cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
            except Exception:
                logging.exception("parse {} {}".format(str(traceback.format_exc()), cv[k]))
                cv[k] = []

        # tokenize fields
        if k in tks_fld:
            cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
            if k in small_tks_fld:
                cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])

        # keyword fields
        if k in kwd_fld:
            cv[f"{k}_kwd"] = [n.lower()
                              for n in re.split(r"[\t,,;;. ]",
                                                re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
                                                ) if n]
        if k in num_fld and cv.get(k):
            cv[f"{k}_int"] = cv[k]

    cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")

    # for name field
    if cv.get("name"):
        # Cut at separators (newline, dash, paren, plus) and squeeze spaces.
        nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
        nm = re.sub(r"[ \t ]+", " ", nm)
        if re.match(r"[a-zA-Z ]+$", nm):
            # Pure-Latin names must have at least two words to be kept.
            if len(nm.split()) > 1:
                cv["name"] = nm
            else:
                nm = ""
        elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
            # Starts with a Chinese surname: keep at most the first 5 chars
            # and drop any trailing Latin text.
            nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
        else:
            nm = ""
        cv["name"] = nm.strip()
        name = cv["name"]

        # name pingyin and its prefix
        cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
        cv["name_py_pref0_tks"] = ""
        cv["name_py_pref_tks"] = ""
        for py in PY.get_pinyins(nm[:20], ''):
            for i in range(2, len(py) + 1):
                cv["name_py_pref_tks"] += " " + py[:i]
        for py in PY.get_pinyins(nm[:20], ' '):
            py = py.split()
            for i in range(1, len(py) + 1):
                cv["name_py_pref0_tks"] += " " + "".join(py[:i])

        cv["name_kwd"] = name
        cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
        cv["name_tks"] = (
            rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
        ) if name else ""
    else:
        # Missing name halves the completeness score.
        cv["integerity_flt"] /= 2.

    # Keep only a valid 11-digit mainland mobile number.
    if cv.get("phone"):
        r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
        if not r:
            cv["phone"] = ""
        else:
            cv["phone"] = r.group(1)

    # deal with date fields
    if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
        cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
    else:
        y, m, d = getYMD(str(cv.get("updated_at", "")))
        if not y:
            y = "2012"  # fallback year when updated_at is absent/unparsable
        if not m:
            m = "01"
        if not d:
            d = "01"
        cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))

    # long text tokenize
    if cv.get("responsibilities"):
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))

    # for yes or no field
    fea = []
    for f, y, n in is_fld:
        if f not in cv:
            continue
        if cv[f] == '是':
            fea.append(y)
        if cv[f] == '否':
            fea.append(n)

    if fea:
        cv["tag_kwd"] = fea

    # Derived sections.
    cv = forEdu(cv)
    cv = forProj(cv)
    cv = forWork(cv)
    cv = birth(cv)

    # Cross features: corp tag + school rank + highest degree, joined by "+".
    cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        for j in cv.get("sch_rank_kwd", []):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + j
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        if cv.get("highest_degree_kwd"):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]

    # Fallback work-experience estimate from work_start_time.
    try:
        if not cv.get("work_exp_flt") and cv.get("work_start_time"):
            if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
                cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
                # presumably work_start_time is epoch milliseconds here — verify.
                cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
            elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
                y, m, d = getYMD(str(cv["work_start_time"]))
                cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
                cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
    except Exception as e:
        logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
    if "work_exp_flt" not in cv and cv.get("work_experience", 0):
        cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

    # Keep only output fields (recognized by suffix); drop raw inputs.
    keys = list(cv.keys())
    for k in keys:
        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
            del cv[k]
    # De-duplicate keyword/id lists, stripping a trailing 市 (city suffix).
    for k in cv.keys():
        if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
            continue
        cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
    # Remove non-positive *_fea fields.
    keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
    for k in keys:
        if cv[k] <= 0:
            del cv[k]

    # assumes tob_resume_id is always present — confirm with callers.
    cv["tob_resume_id"] = str(cv["tob_resume_id"])
    cv["id"] = cv["tob_resume_id"]
    logging.debug("CCCCCCCCCCCCCCC")

    return dealWithInt64(cv)
  593. def dealWithInt64(d):
  594. if isinstance(d, dict):
  595. for n, v in d.items():
  596. d[n] = dealWithInt64(v)
  597. if isinstance(d, list):
  598. d = [dealWithInt64(t) for t in d]
  599. if isinstance(d, np.integer):
  600. d = int(d)
  601. return d