AiNext / autoclean.py
AICOE-Datamatics's picture
Initial code
247c8df
raw
history blame
No virus
3.61 kB
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 09:45:10 2021
@author: Kishore
"""
######### Importing modules ############
import numpy as np
from sklearn.preprocessing import LabelEncoder
limit_number_of_class=10
########### Returns the cleaned data and a report of the cleaning steps ###################
def data_clean(dataset, cols):
    """Clean a DataFrame for modelling and report every step that was applied.

    Steps, in order:

    1. Drop the columns named in ``cols``.
    2. Drop every remaining column whose values are all distinct (treated
       as an identifier column).
    3. Drop duplicated rows.
    4. Fill missing values: column mean for numeric columns, column mode
       for ``object`` columns.
    5. Replace each ``object`` column by the integer codes produced by
       ``LabelEncoder``.

    Args:
        dataset: pandas DataFrame to clean. Steps 1-2 drop its columns in
            place; steps 3-5 run on the new frame produced by
            ``drop_duplicates``, so always use the returned frame.
        cols: list of column names to drop. The names of the auto-dropped
            identifier columns are appended to this list in place.

    Returns:
        A ``(dataset, clean_dict)`` tuple. ``clean_dict`` has the keys
        ``'dropped_features'`` (the ``cols`` list, including auto-dropped
        names), ``'auto_drop'`` (only the auto-dropped names),
        ``'missing_values'`` (column name -> value used for filling) and
        ``'categorical_to_numeric'`` (a list with one
        ``{'<col>_encoded': {original value: code}}`` dict per encoded
        column).
    """
    print("data cleaning started")
    clean_dict = {}

    # Drop the columns the caller asked to remove.
    dataset.drop(cols, axis=1, inplace=True)

    # Auto-drop identifier-like columns, i.e. columns with one distinct value
    # per row. With fewer than two rows every column trivially satisfies that
    # test, so the heuristic is skipped instead of dropping all the data.
    # NOTE(review): this also drops continuous numeric features whose values
    # happen to be all distinct -- confirm that is intended.
    auto_drop = []
    if len(dataset) > 1:
        auto_drop = [
            col for col in dataset.columns
            if col not in cols and len(dataset[col]) == dataset[col].nunique()
        ]
        dataset.drop(auto_drop, axis=1, inplace=True)
        cols.extend(auto_drop)
    clean_dict['dropped_features'] = cols
    clean_dict['auto_drop'] = auto_drop

    # Dropping duplicate rows; from here on `dataset` is a new frame.
    dataset = dataset.drop_duplicates()

    # Handling missing values. Columns are re-assigned explicitly because
    # `dataset[col].fillna(..., inplace=True)` is chained assignment, which
    # does not update the frame when pandas Copy-on-Write is active.
    num_data_col = dataset.select_dtypes(include=np.number).columns.tolist()
    cat_data_col = dataset.select_dtypes(include=['object']).columns.tolist()
    missing_data = {}
    for col in num_data_col:
        fill_value = dataset[col].mean()
        dataset[col] = dataset[col].fillna(fill_value)
        missing_data[col] = fill_value
    for col in cat_data_col:
        fill_value = dataset[col].mode()[0]
        dataset[col] = dataset[col].fillna(fill_value)
        missing_data[col] = fill_value
    clean_dict['missing_values'] = missing_data

    # Converting categorical values to numeric codes and recording, for each
    # column, which original value was mapped to which code.
    lst_cn = []
    for col in cat_data_col:
        new_col = col + '_encoded'
        codes = LabelEncoder().fit_transform(dataset[col])
        # The mapping is built on a two-column side frame so that an existing
        # column that happens to be called '<col>_encoded' is never touched.
        mapping = (
            dataset[[col]]
            .assign(**{new_col: codes})
            .drop_duplicates(col)
            .set_index(col)
        )
        lst_cn.append(mapping.to_dict())
        dataset[col] = codes
    clean_dict['categorical_to_numeric'] = lst_cn

    return dataset, clean_dict
# Example usage:
# import pandas as pd
# data = pd.read_csv("../dataset/Loan_Approval_prediction/train.csv", header=0)
# print(data.head())
# df, clean_dict = data_clean(data, ['Loan_ID', 'Gender', 'Education'])
# print(df.head())