# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json

from deepdoc.parser.resume.entities import degrees, regions, industries

# Output schema of `refactor`. Each entry is "<field name> <type>"; only the
# field name (the text before the first space) is used in this module.
# NOTE(review): the type tokens look like SQL/Hive column types - confirm
# with whatever consumes them.
# The ORDER matters: `refactor` pairs these names positionally with the
# alphabetically sorted extracted columns (see the end of `refactor`).
FIELDS = [
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    "education_obj string",
    "email STRING",
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    "industry_name STRING",
    "industry_names STRING",
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT"
]


def refactor(df):
    """Flatten the JSON in ``df["resume_content"]`` into one flat field dict.

    The JSON of every row is parsed, its top-level sections and the entries
    of its ``contact`` / ``basic`` sub-objects are spread into columns, a few
    columns are normalised (degree, address, industries, gender, Y/N flags,
    phone), and every value is turned into a single-line string.

    Args:
        df: pandas DataFrame with a ``resume_content`` column holding JSON
            text. It is modified in place: an ``obj`` column and one column
            per extracted field are added, and NaN/None cells are replaced
            by "". Presumably it also carries ``tob_resume_id`` - a missing
            column ends up as the string "nan" in the result.

    Returns:
        dict mapping the name part of each ``FIELDS`` entry to the string
        value found in the FIRST row of the frame; further rows are
        processed but not returned.

    Raises:
        IndexError: if ``df`` has no rows.
    """

    def deal_obj(obj, k, kk):
        # obj[k][kk], or "" as soon as one level is not a dict / is missing.
        if not isinstance(obj, dict):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, dict):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        # Best effort: anything json cannot parse becomes an empty object.
        # TypeError covers non-text cells (NaN, None); ValueError covers
        # malformed JSON (JSONDecodeError) and undecodable bytes;
        # RecursionError covers pathologically nested input.
        try:
            return json.loads(line)
        except (TypeError, ValueError, RecursionError):
            return {}

    def dump_section(obj, key):
        # Serialise a top-level section as JSON when it is a dict or empty.
        if isinstance(obj, dict) and (
                isinstance(obj.get(key), dict) or not obj.get(key)):
            return json.dumps(obj.get(key, {}), ensure_ascii=False)
        # NOTE(review): this fallback stringifies the WHOLE parsed object,
        # not obj[key]; kept unchanged because the intent cannot be
        # confirmed from this file.
        return str(obj).replace("None", "")

    df["obj"] = df["resume_content"].map(loadjson)
    df.fillna("", inplace=True)

    # Names of the columns that make up the result.
    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        # Create one column per name in `nms`, read either from the
        # sub-object `cc` or, without `cc`, from the top level of the JSON.
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x, c=c: deal_obj(x, cc, c))
            else:
                df[c] = df["obj"].map(lambda x, c=c: dump_section(x, c))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card", "expect_salary_to",
             "expect_salary_from", "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names", "political_status",
             "title_name", "expect_annual_salary", "industry_name",
             "address", "position_name", "school_name", "corporation_id",
             "is_oversea", "responsibilities", "work_start_time", "degree",
             "management_experience", "expect_type", "corporation_type",
             "scale", "corporation_name", "self_remark", "annual_salary",
             "work_experience", "discipline_name", "marital", "updated_at"],
            "basic")

    # Translate raw values into display names via the entity tables.
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(
        lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(
        lambda x: " ".join(
            [" ".join(industries.get_names(i)) for i in str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        # Lists are joined with spaces; commas in the text become spaces.
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(arr2str)
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else ("女" if x == 'F' else ""))
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else ('否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)

    # Use `tel` when `phone` is blank. Values are compared as text because
    # the JSON may carry numbers, and a positional boolean mask is used so
    # the frame's index labels do not matter.
    phone = df["phone"].map(lambda v: str(v).strip())
    tel = df["tel"].map(lambda v: str(v).strip())
    use_tel = ((phone == "") & (tel != "")).to_numpy()
    if use_tel.any():
        df.loc[use_tel, "phone"] = tel.to_numpy()[use_tel]

    # Drop the helper columns that only fed derived ones, de-duplicate
    # ("updated_at" is registered twice) and fix the order alphabetically.
    helper_cols = {"industry_ids", "management_experience", "marital", "tel"}
    clms = sorted(set(clms) - helper_cols)
    df = df.reindex(clms, axis=1)

    # Keep every value on one line: tabs become spaces, line breaks become
    # the two characters "\n".
    for c in clms:
        df[c] = df[c].map(
            lambda s: str(s).replace(
                "\t", " ").replace(
                "\n", "\\n").replace(
                "\r", "\\n"))

    # FIELDS lists its names in the same order as the sorted column names
    # (e.g. "education" pairs with "education_obj"), so pairing them by
    # position yields the output mapping.
    return dict(zip([n.split(" ")[0] for n in FIELDS], df.iloc[0].tolist()))