# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 09:45:10 2021

@author: Kishore
"""

######### Importing modules ############
import numpy as np
import pandas as pd  # used by the example drivers at the bottom of this file
from sklearn.preprocessing import LabelEncoder

limit_number_of_class = 10  # note: currently unused in this module


########### Returns cleaned data and a summary of the cleaning steps ###########
def data_clean(dataset, cols):
    """Clean `dataset` and return (cleaned_dataset, clean_dict).

    `cols` lists columns to drop up front; auto-detected ID columns are
    appended to it as a side effect and recorded in clean_dict.
    """
    print("data cleaning started")
    clean_dict = {}

    # Drop the caller-specified columns.
    dataset.drop(cols, axis=1, inplace=True)

    #### Auto-drop ID-like columns (a distinct value on every row) ####
    auto_drop = []
    for col in list(dataset.columns):  # iterate over a copy: we drop columns while looping
        if dataset[col].nunique() == len(dataset[col]):
            dataset.drop(col, axis=1, inplace=True)
            auto_drop.append(col)
            cols.append(col)
    ####################################
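    # e.g. a 'Loan_ID' column that is unique per row (an assumed example from
    # this project's loan dataset) would be auto-dropped here.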

    clean_dict['dropped_features'] = cols
    clean_dict['auto_drop'] = auto_drop

    # Drop exact duplicate rows.
    dataset = dataset.drop_duplicates()

    ############# Handling missing values ######################################
    num_data_col = dataset.select_dtypes(include=np.number).columns.tolist()
    cat_data_col = dataset.select_dtypes(include=['object']).columns.tolist()
    missing_data = {}
    # Numeric columns: fill with the column mean and record the fill value.
    for col in num_data_col:
        fill_value = dataset[col].mean()
        dataset[col] = dataset[col].fillna(fill_value)
        missing_data[col] = fill_value
    # Categorical columns: fill with the mode and record the fill value.
    for col in cat_data_col:
        fill_value = dataset[col].mode()[0]
        dataset[col] = dataset[col].fillna(fill_value)
        missing_data[col] = fill_value
    clean_dict['missing_values'] = missing_data
    ###############################################################################

    ############### Converting categorical to numeric values ##################
    labelencoder = LabelEncoder()
    lst_cn = []
    for col in cat_data_col:
        new_col = col + '_encoded'
        dataset[new_col] = labelencoder.fit_transform(dataset[col])
        # Capture a {original_label: code} mapping before overwriting the column.
        d1 = dataset.drop_duplicates(col).set_index(col)
        dataset[col] = dataset[new_col]
        dataset.drop([new_col], axis=1, inplace=True)
        # Keep only the encoded column; use keyword axis= (the positional
        # form was removed in pandas 2.0).
        d1.drop(d1.columns.difference([new_col]), axis=1, inplace=True)
        lst_cn.append(d1.to_dict())
    clean_dict['categorical_to_numeric'] = lst_cn
    ##################################################################
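    # Illustrative shape (assuming a 'Gender' column as in the sample driver;
    # LabelEncoder assigns codes in sorted label order):
    #   clean_dict['categorical_to_numeric'] == [{'Gender_encoded': {'Female': 0, 'Male': 1}}, ...]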

    return dataset, clean_dict


# Example with the project's dataset (path kept from the original driver):
# data = pd.read_csv("../dataset/Loan_Approval_prediction/train.csv", header=0)
# df, info = data_clean(data, ['Loan_ID', 'Gender', 'Education'])
# print(df.head())
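
# A minimal, self-contained smoke test on synthetic data. The column names
# below are assumptions chosen to exercise each cleaning step, not columns
# guaranteed to exist in the real dataset.
if __name__ == "__main__":
    demo = pd.DataFrame({
        'Loan_ID': ['LP001', 'LP002', 'LP003', 'LP004'],    # unique per row -> auto-dropped
        'Gender': ['Male', 'Female', None, 'Male'],         # categorical, one missing value
        'ApplicantIncome': [5000.0, 3000.0, None, 4000.0],  # numeric, one missing value
        'Loan_Status': ['Y', 'N', 'Y', 'Y'],
    })
    cleaned, info = data_clean(demo, [])
    print(cleaned)
    print(info)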