Spaces:
Build error
Build error
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 09:45:10 2021
@author: Kishore
"""
######### Importing modules ############
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Module-level limit constant.
# NOTE(review): presumably the maximum number of classes handled downstream;
# it is not referenced in the visible code, so confirm against callers.
limit_number_of_class = 10
########### Returns a cleaned data ################### | |
def data_clean(dataset, cols):
    """Clean a pandas DataFrame and report what was done.

    Steps, in order:

    1. Drop the columns named in ``cols``.
    2. Drop every remaining column whose number of distinct non-null
       values equals the row count (treated as an identifier column).
    3. Drop duplicate rows.
    4. Fill missing values: column mean for numeric columns, most
       frequent value for ``object`` columns.
    5. Replace each ``object`` column with the integer codes produced by
       ``LabelEncoder``.

    Side effects (kept for backward compatibility): steps 1 and 2 are
    applied to the caller's ``dataset`` in place, and the automatically
    dropped column names are appended to the caller's ``cols`` list.
    Steps 3-5 operate on a new DataFrame, which is the one returned.

    Args:
        dataset: pandas DataFrame to clean.
        cols: list of column labels to drop up front.

    Returns:
        A ``(dataset, clean_dict)`` tuple. ``clean_dict`` has the keys
        ``'dropped_features'`` (the ``cols`` list, including auto-dropped
        columns), ``'auto_drop'`` (auto-dropped columns only),
        ``'missing_values'`` (column -> fill value used) and
        ``'categorical_to_numeric'`` (one
        ``{'<column>_encoded': {original value: code}}`` dict per
        encoded column).

    Raises:
        KeyError: if a label in ``cols`` is not a column of ``dataset``.
    """
    print("data cleaning started")
    clean_dict = {}

    # Drop the columns the caller asked to remove.
    dataset.drop(columns=cols, inplace=True)

    #### auto dropping ID columns ####
    # A column with one distinct value per row is treated as an identifier.
    row_count = len(dataset)
    auto_drop = [
        col for col in dataset.columns
        if dataset[col].nunique() == row_count
    ]
    if auto_drop:
        dataset.drop(columns=auto_drop, inplace=True)
        cols.extend(auto_drop)

    clean_dict['dropped_features'] = cols
    clean_dict['auto_drop'] = auto_drop

    # Returns a new frame; the caller's object keeps its duplicate rows.
    dataset = dataset.drop_duplicates()

    ############# Handling Missing values #############
    num_data_col = dataset.select_dtypes(include=np.number).columns.tolist()
    cat_data_col = dataset.select_dtypes(include=['object']).columns.tolist()

    missing_data = {}
    for col in num_data_col:
        missing_data[col] = dataset[col].mean()
    for col in cat_data_col:
        modes = dataset[col].mode()
        # An all-null column has no mode; leave it unfilled instead of
        # failing on ``mode()[0]``.
        if not modes.empty:
            missing_data[col] = modes.iloc[0]

    if missing_data:
        # One non-inplace fill for every column. The chained form
        # ``dataset[col].fillna(..., inplace=True)`` acts on a temporary
        # Series and is not guaranteed to write back to the frame.
        dataset = dataset.fillna(value=missing_data)
    clean_dict['missing_values'] = missing_data

    ############### Converting categorical to numeric values ###############
    lst_cn = []
    for col in cat_data_col:
        values = dataset[col]
        # fit_transform refits from scratch, so a per-column encoder is
        # equivalent to reusing a single instance.
        codes = LabelEncoder().fit_transform(values)
        # Record each original value once, in order of first appearance.
        first_seen = ~values.duplicated().to_numpy()
        mapping = dict(zip(values[first_seen].tolist(),
                           codes[first_seen].tolist()))
        # Overwrite the column directly; no temporary '<col>_encoded'
        # column that could collide with a real one.
        dataset[col] = codes
        lst_cn.append({col + '_encoded': mapping})
    clean_dict['categorical_to_numeric'] = lst_cn

    return dataset, clean_dict
# print(data.head()) | |
# dict_param={'dropcols': ['Loan_ID', 'Gender', 'Education'], 'path':"../dataset/Loan_Approval_prediction/train.csv" } | |
# df=data_clean(dict_param) | |
# print(df.head()) | |