zxsipola123456's picture
Upload 769 files
ab2ded1 verified
raw
history blame
No virus
6.37 kB
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
from deepdoc.parser.resume.entities import degrees, regions, industries
FIELDS = [
"address STRING",
"annual_salary int",
"annual_salary_from int",
"annual_salary_to int",
"birth STRING",
"card STRING",
"certificate_obj string",
"city STRING",
"corporation_id int",
"corporation_name STRING",
"corporation_type STRING",
"degree STRING",
"discipline_name STRING",
"education_obj string",
"email STRING",
"expect_annual_salary int",
"expect_city_names string",
"expect_industry_name STRING",
"expect_position_name STRING",
"expect_salary_from int",
"expect_salary_to int",
"expect_type STRING",
"gender STRING",
"industry_name STRING",
"industry_names STRING",
"is_deleted STRING",
"is_fertility STRING",
"is_house STRING",
"is_management_experience STRING",
"is_marital STRING",
"is_oversea STRING",
"language_obj string",
"name STRING",
"nation STRING",
"phone STRING",
"political_status STRING",
"position_name STRING",
"project_obj string",
"responsibilities string",
"salary_month int",
"scale STRING",
"school_name STRING",
"self_remark string",
"skill_obj string",
"title_name STRING",
"tob_resume_id STRING",
"updated_at Timestamp",
"wechat STRING",
"work_obj string",
"work_experience int",
"work_start_time BIGINT"
]
def refactor(df):
def deal_obj(obj, k, kk):
if not isinstance(obj, type({})):
return ""
obj = obj.get(k, {})
if not isinstance(obj, type({})):
return ""
return obj.get(kk, "")
def loadjson(line):
try:
return json.loads(line)
except Exception as e:
pass
return {}
df["obj"] = df["resume_content"].map(lambda x: loadjson(x))
df.fillna("", inplace=True)
clms = ["tob_resume_id", "updated_at"]
def extract(nms, cc=None):
nonlocal clms
clms.extend(nms)
for c in nms:
if cc:
df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
else:
df[c] = df["obj"].map(
lambda x: json.dumps(
x.get(
c,
{}),
ensure_ascii=False) if isinstance(
x,
type(
{})) and (
isinstance(
x.get(c),
type(
{})) or not x.get(c)) else str(x).replace(
"None",
""))
extract(["education", "work", "certificate", "project", "language",
"skill"])
extract(["wechat", "phone", "is_deleted",
"name", "tel", "email"], "contact")
extract(["nation", "expect_industry_name", "salary_month",
"industry_ids", "is_house", "birth", "annual_salary_from",
"annual_salary_to", "card",
"expect_salary_to", "expect_salary_from",
"expect_position_name", "gender", "city",
"is_fertility", "expect_city_names",
"political_status", "title_name", "expect_annual_salary",
"industry_name", "address", "position_name", "school_name",
"corporation_id",
"is_oversea", "responsibilities",
"work_start_time", "degree", "management_experience",
"expect_type", "corporation_type", "scale", "corporation_name",
"self_remark", "annual_salary", "work_experience",
"discipline_name", "marital", "updated_at"], "basic")
df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
df["industry_names"] = df["industry_ids"].map(lambda x: " ".join([" ".join(industries.get_names(i)) for i in
str(x).split(",")]))
clms.append("industry_names")
def arr2str(a):
if not a:
return ""
if isinstance(a, list):
a = " ".join([str(i) for i in a])
return str(a).replace(",", " ")
df["expect_industry_name"] = df["expect_industry_name"].map(
lambda x: arr2str(x))
df["gender"] = df["gender"].map(
lambda x: "男" if x == 'M' else (
"女" if x == 'F' else ""))
for c in ["is_fertility", "is_oversea", "is_house",
"management_experience", "marital"]:
df[c] = df[c].map(
lambda x: '是' if x == 'Y' else (
'否' if x == 'N' else ""))
df["is_management_experience"] = df["management_experience"]
df["is_marital"] = df["marital"]
clms.extend(["is_management_experience", "is_marital"])
df.fillna("", inplace=True)
for i in range(len(df)):
if not df.loc[i, "phone"].strip() and df.loc[i, "tel"].strip():
df.loc[i, "phone"] = df.loc[i, "tel"].strip()
for n in ["industry_ids", "management_experience", "marital", "tel"]:
for i in range(len(clms)):
if clms[i] == n:
del clms[i]
break
clms = list(set(clms))
df = df.reindex(sorted(clms), axis=1)
#print(json.dumps(list(df.columns.values)), "LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL")
for c in clms:
df[c] = df[c].map(
lambda s: str(s).replace(
"\t",
" ").replace(
"\n",
"\\n").replace(
"\r",
"\\n"))
# print(df.values.tolist())
return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))