# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 09:45:10 2021

@author: Kishore
"""

######### Importing modules ############
import numpy as np
from sklearn.preprocessing import LabelEncoder

# NOTE(review): not referenced anywhere in this module; presumably read by
# code that imports it -- confirm before removing.
limit_number_of_class=10


########### Returns a cleaned data ###################
def data_clean(dataset, cols):
    """Clean a DataFrame and report every transformation that was applied.

    The steps run in this order:

    1. Drop the columns named in ``cols``.
    2. Drop every remaining column whose count of distinct non-null values
       equals the number of rows (treated as ID-like columns).
    3. Drop duplicate rows (the original index labels are kept).
    4. Fill missing values: the column mean for numeric columns, the first
       mode for ``object`` columns.
    5. Replace every ``object`` column with integer codes produced by
       ``LabelEncoder``.

    Neither ``dataset`` nor ``cols`` is modified; all work happens on new
    objects.

    Args:
        dataset: pandas DataFrame to clean.
        cols: iterable of column names to drop up front.

    Returns:
        A ``(cleaned, clean_dict)`` tuple. ``cleaned`` is the resulting
        DataFrame. ``clean_dict`` has these keys:

        * ``'dropped_features'``: ``cols`` followed by the auto-dropped columns.
        * ``'auto_drop'``: only the columns removed by step 2.
        * ``'missing_values'``: ``{column: fill value}`` for every numeric
          and ``object`` column.
        * ``'categorical_to_numeric'``: one dict per encoded column, shaped
          ``{'<column>_encoded': {original value: integer code}}``.

    NOTE(review): an ``object`` column without a single non-missing value has
    an empty mode, so the ``[0]`` lookup in step 4 is expected to fail --
    presumably such columns never reach this function; confirm.

    Example:
        cleaned, report = data_clean(df, ['Loan_ID', 'Gender', 'Education'])
    """
    print("data cleaning started")

    # Work on copies so the caller's list and DataFrame stay untouched.
    dropped = list(cols)
    frame = dataset.drop(columns=dropped)

    #### auto dropping ID columns ####
    # A column with as many distinct values as there are rows carries no
    # repeated information and is treated as an identifier.
    row_count = len(frame)
    auto_drop = [
        col for col in frame.columns if frame[col].nunique() == row_count
    ]
    frame = frame.drop(columns=auto_drop)
    dropped.extend(auto_drop)

    clean_dict = {
        'dropped_features': dropped,
        'auto_drop': auto_drop,
    }

    # Dropping duplicates
    frame = frame.drop_duplicates()

    ############# Handling Missing values ######################################
    num_data_col = frame.select_dtypes(include=np.number).columns.tolist()
    cat_data_col = frame.select_dtypes(include=['object']).columns.tolist()

    missing_data = {}
    # Assign the filled column back instead of using fillna(inplace=True) on a
    # column selection: the in-place form operates on a temporary object and
    # is not guaranteed to update the frame on recent pandas versions.
    for col in num_data_col:
        fill_value = frame[col].mean()
        frame[col] = frame[col].fillna(fill_value)
        missing_data[col] = fill_value

    # filling missing values for categorical data
    for col in cat_data_col:
        fill_value = frame[col].mode()[0]
        frame[col] = frame[col].fillna(fill_value)
        missing_data[col] = fill_value

    clean_dict['missing_values'] = missing_data

    ############### Converting categorical to numeric values ##################
    labelencoder = LabelEncoder()
    lst_cn = []
    for col in cat_data_col:
        new_col = col + '_encoded'
        codes = labelencoder.fit_transform(frame[col])

        # Build the value -> code lookup in a scratch frame so that a real
        # column which happens to be called '<col>_encoded' is never
        # overwritten or dropped from the result.
        lookup = frame[[col]].copy()
        lookup[new_col] = codes
        lst_cn.append(lookup.drop_duplicates(col).set_index(col).to_dict())

        frame[col] = codes

    clean_dict['categorical_to_numeric'] = lst_cn

    return frame, clean_dict