您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

step_one.py 6.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import json
  17. from deepdoc.parser.resume.entities import degrees, regions, industries
  18. FIELDS = [
  19. "address STRING",
  20. "annual_salary int",
  21. "annual_salary_from int",
  22. "annual_salary_to int",
  23. "birth STRING",
  24. "card STRING",
  25. "certificate_obj string",
  26. "city STRING",
  27. "corporation_id int",
  28. "corporation_name STRING",
  29. "corporation_type STRING",
  30. "degree STRING",
  31. "discipline_name STRING",
  32. "education_obj string",
  33. "email STRING",
  34. "expect_annual_salary int",
  35. "expect_city_names string",
  36. "expect_industry_name STRING",
  37. "expect_position_name STRING",
  38. "expect_salary_from int",
  39. "expect_salary_to int",
  40. "expect_type STRING",
  41. "gender STRING",
  42. "industry_name STRING",
  43. "industry_names STRING",
  44. "is_deleted STRING",
  45. "is_fertility STRING",
  46. "is_house STRING",
  47. "is_management_experience STRING",
  48. "is_marital STRING",
  49. "is_oversea STRING",
  50. "language_obj string",
  51. "name STRING",
  52. "nation STRING",
  53. "phone STRING",
  54. "political_status STRING",
  55. "position_name STRING",
  56. "project_obj string",
  57. "responsibilities string",
  58. "salary_month int",
  59. "scale STRING",
  60. "school_name STRING",
  61. "self_remark string",
  62. "skill_obj string",
  63. "title_name STRING",
  64. "tob_resume_id STRING",
  65. "updated_at Timestamp",
  66. "wechat STRING",
  67. "work_obj string",
  68. "work_experience int",
  69. "work_start_time BIGINT"
  70. ]
  71. def refactor(df):
  72. def deal_obj(obj, k, kk):
  73. if not isinstance(obj, type({})):
  74. return ""
  75. obj = obj.get(k, {})
  76. if not isinstance(obj, type({})):
  77. return ""
  78. return obj.get(kk, "")
  79. def loadjson(line):
  80. try:
  81. return json.loads(line)
  82. except Exception:
  83. pass
  84. return {}
  85. df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
  86. df.fillna("", inplace=True)
  87. clms = ["tob_resume_id", "updated_at"]
  88. def extract(nms, cc=None):
  89. nonlocal clms
  90. clms.extend(nms)
  91. for c in nms:
  92. if cc:
  93. df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
  94. else:
  95. df[c] = df["obj"].map(
  96. lambda x: json.dumps(
  97. x.get(
  98. c,
  99. {}),
  100. ensure_ascii=False) if isinstance(
  101. x,
  102. type(
  103. {})) and (
  104. isinstance(
  105. x.get(c),
  106. type(
  107. {})) or not x.get(c)) else str(x).replace(
  108. "None",
  109. ""))
  110. extract(["education", "work", "certificate", "project", "language",
  111. "skill"])
  112. extract(["wechat", "phone", "is_deleted",
  113. "name", "tel", "email"], "contact")
  114. extract(["nation", "expect_industry_name", "salary_month",
  115. "industry_ids", "is_house", "birth", "annual_salary_from",
  116. "annual_salary_to", "card",
  117. "expect_salary_to", "expect_salary_from",
  118. "expect_position_name", "gender", "city",
  119. "is_fertility", "expect_city_names",
  120. "political_status", "title_name", "expect_annual_salary",
  121. "industry_name", "address", "position_name", "school_name",
  122. "corporation_id",
  123. "is_oversea", "responsibilities",
  124. "work_start_time", "degree", "management_experience",
  125. "expect_type", "corporation_type", "scale", "corporation_name",
  126. "self_remark", "annual_salary", "work_experience",
  127. "discipline_name", "marital", "updated_at"], "basic")
  128. df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
  129. df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
  130. df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
  131. str(x).split(",")]))
  132. clms.append("industry_names")
  133. def arr2str(a):
  134. if not a:
  135. return ""
  136. if isinstance(a, list):
  137. a = " ".join([str(i) for i in a])
  138. return str(a).replace(",", " ")
  139. df["expect_industry_name"] = df["expect_industry_name"].map(
  140. lambda x: arr2str(x))
  141. df["gender"] = df["gender"].map(
  142. lambda x: "男" if x == 'M' else (
  143. "女" if x == 'F' else ""))
  144. for c in ["is_fertility", "is_oversea", "is_house",
  145. "management_experience", "marital"]:
  146. df[c] = df[c].map(
  147. lambda x: '是' if x == 'Y' else (
  148. '否' if x == 'N' else ""))
  149. df["is_management_experience"] = df["management_experience"]
  150. df["is_marital"] = df["marital"]
  151. clms.extend(["is_management_experience", "is_marital"])
  152. df.fillna("", inplace=True)
  153. for i in range(len(df)):
  154. if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
  155. df.loc[i, "phone"] = df.loc[i, "tel"].strip()
  156. for n in ["industry_ids", "management_experience", "marital", "tel"]:
  157. for i in range(len(clms)):
  158. if clms[i] == n:
  159. del clms[i]
  160. break
  161. clms = list(set(clms))
  162. df = df.reindex(sorted(clms), axis=1)
  163. #print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
  164. for c in clms:
  165. df[c] = df[c].map(
  166. lambda s: str(s).replace(
  167. "\t",
  168. " ").replace(
  169. "\n",
  170. "\\n").replace(
  171. "\r",
  172. "\\n"))
  173. # print(df.values.tolist())
  174. return dict(zip([n.split()[0] for n in FIELDS], df.values.tolist()[0]))