| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186 |
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
-
- import json
- from deepdoc.parser.resume.entities import degrees, regions, industries
-
# Output schema of refactor(): one "<name> <type>" entry per output field.
# refactor() uses only the name part (text before the first space) and pairs
# the names positionally with its alphabetically sorted result columns, so the
# order of this list is significant.
FIELDS = [
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    "education_obj string",
    "email STRING",
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    "industry_name STRING",
    "industry_names STRING",
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT",
]
-
def refactor(df):
    """Flatten the resume JSON held in ``df`` into one flat field dict.

    The ``resume_content`` column is parsed as JSON; values are then pulled
    out of the top level of the parsed object and out of its ``contact`` and
    ``basic`` sub-objects into individual columns, a few of them are
    normalised (degree, address, industries, gender, Y/N flags, phone), and
    every value is finally rendered as a single-line string.

    Args:
        df: pandas DataFrame with at least the columns ``resume_content``
            (JSON text) and ``tob_resume_id``.  It is modified in place
            (an ``obj`` column and the extracted columns are added).

    Returns:
        dict mapping the name part of each ``FIELDS`` entry to the value in
        the FIRST row of the result.  Names and values are paired by
        position against the alphabetically sorted result columns (so e.g.
        ``work_obj`` receives the ``work`` column).

    Raises:
        IndexError: if ``df`` has no rows (the first row is taken
            unconditionally).
    """

    def deal_obj(obj, k, kk):
        """Return ``obj[k][kk]``, or "" when either level is not a dict."""
        if not isinstance(obj, dict):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, dict):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        """Parse ``line`` as JSON; fall back to an empty dict on any error."""
        try:
            return json.loads(line)
        except Exception:
            # Deliberate best effort: an unparsable resume yields empty fields
            # instead of aborting the whole conversion.
            return {}

    def dump_section(obj, key):
        """Serialise the top-level section ``key`` of ``obj`` as JSON text."""
        if isinstance(obj, dict) and (
                isinstance(obj.get(key), dict) or not obj.get(key)):
            return json.dumps(obj.get(key, {}), ensure_ascii=False)
        # NOTE(review): for a non-empty, non-dict section (e.g. a list) this
        # stringifies the WHOLE parsed object rather than the section itself.
        # Kept as-is to preserve existing output; confirm whether
        # str(obj.get(key)) was intended.
        return str(obj).replace("None", "")

    df["obj"] = df["resume_content"].map(loadjson)
    df.fillna("", inplace=True)

    # Names of the columns that make up the result; grown by extract().
    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        """Create one column per name in ``nms``.

        With ``cc`` the value is read from the ``cc`` sub-object of the
        parsed resume; without it the top-level section of that name is
        dumped as JSON text.
        """
        clms.extend(nms)
        for c in nms:
            # The lambdas are consumed immediately by map(), so capturing the
            # loop variable is safe here.
            if cc:
                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
            else:
                df[c] = df["obj"].map(lambda x: dump_section(x, c))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    # Translate coded values through the project lookup tables.
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(
        lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(
        lambda x: " ".join(
            [" ".join(industries.get_names(i)) for i in str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        """Render a list or comma-separated value as space-separated text."""
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(arr2str)
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else ("女" if x == 'F' else ""))
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else ('否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)

    # Use "tel" as a fallback when "phone" is blank.  Done positionally via
    # zip (not df.loc[i]) so it works for any index, and values are coerced
    # with str() because the JSON may carry numbers instead of strings.
    df["phone"] = [
        str(tel).strip()
        if not str(phone).strip() and str(tel).strip()
        else phone
        for phone, tel in zip(df["phone"], df["tel"])
    ]

    # Drop the helper columns and de-duplicate ("updated_at" is listed twice).
    helper_columns = {"industry_ids", "management_experience", "marital",
                      "tel"}
    clms = sorted(set(clms) - helper_columns)

    df = df.reindex(clms, axis=1)
    for c in clms:
        # Keep every value on one line: tabs become spaces, and both LF and CR
        # become a literal backslash-n.
        df[c] = df[c].map(
            lambda s: str(s).replace(
                "\t", " ").replace(
                "\n", "\\n").replace(
                "\r", "\\n"))

    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
|