# AiNext / model_pipeline_steps.py
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 10:00:30 2021
@author: Kishore
"""
################## Importing Modules ###########################################
import math

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import eli5
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (classification_report, confusion_matrix,
                             mean_squared_error, r2_score, roc_auc_score,
                             roc_curve)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier
#####################################################################
############# Identifying the problem type (Classification/Regression) in Predictive Analytics ##########
def get_problem_type1(clean_data, dependent_variable):
    """Infer the problem type from the dependent variable.

    Heuristic: an integer-typed target with at most ten distinct values
    is treated as classification; anything else as regression.
    """
    limit_number_of_classes = 10
    print("problem analysis")
    if (clean_data.dtypes[dependent_variable] in ('int32', 'int64')
            and clean_data[dependent_variable].nunique() <= limit_number_of_classes):
        return "classification"
    else:
        return "regression"
#########################################################################################################
######################### Model Building For Predictive Analytics ############################
def model_build(clean_data, dependent_variable, problem_type, balance_data, steps_dict):
    """Train candidate pipelines, select the best one and collect diagnostics."""
    print("Model build started")
    d = {}
    lst = []
    ######## Split features and target ##########
    train_data = clean_data.drop(dependent_variable, axis=1)
    target_data = clean_data[dependent_variable]
    if problem_type == "classification":
        data_dict = {}
        ###### Handling class imbalance ####################
        if balance_data == "Auto":
            d = {}
            d["Before Handling Imbalanced Dataset"] = target_data.value_counts()
            oversample = SMOTE()
            train_data, target_data = oversample.fit_resample(train_data, target_data)
            d["After Handling Imbalanced Dataset"] = target_data.value_counts()
            data_dict["Handling Imbalanced Dataset"] = d
        elif balance_data == "False":
            data_dict["Cannot Handle Imbalanced Dataset, It is set to False"] = ""
        X_train, X_test, y_train, y_test = train_test_split(train_data, target_data,
                                                            test_size=0.3, random_state=0)
        pipeline_dt = Pipeline([('scaler_dt', StandardScaler()),
                                ('dt_classifier', DecisionTreeClassifier())])
        pipeline_randomforest = Pipeline([('scaler_rf', StandardScaler()),
                                          ('rf_classifier', RandomForestClassifier())])
        pipeline_xgboost = Pipeline([('scaler_xgb', StandardScaler()),
                                     ('xg_classifier', XGBClassifier())])
        ############## List of candidate pipelines #####################
        pipelines = [pipeline_dt, pipeline_randomforest, pipeline_xgboost]
        best_accuracy = 0.0
        best_classifier = 0
        best_pipeline = None
        ################## Dictionary of pipelines and classifier types for ease of reference ############
        pipe_dict = {0: 'Decision_Tree', 1: 'RandomForest', 2: 'XGBoost_Classifier'}
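        # Each pipeline pairs a StandardScaler with its classifier, so scaling
        # is refit on the training data of every fit/cross-validation call.
        # Pipeline.score returns mean accuracy for classifiers.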
        ########## Fit the pipelines ##################
        for pipe in pipelines:
            pipe.fit(X_train, y_train)
        models_info = {}
        for i, model in enumerate(pipelines):
            score = model.score(X_test, y_test)
            val = "{} Test Accuracy: {}".format(pipe_dict[i], score)
            lst.append(val)
            models_info[pipe_dict[i]] = score
            print(val)
        df_models_info = pd.DataFrame(models_info.items(), columns=["Models", "Accuracy"])
        for i, model in enumerate(pipelines):
            if models_info[pipe_dict[i]] > best_accuracy:
                best_accuracy = models_info[pipe_dict[i]]
                best_pipeline = model
                best_classifier = i
        # Explain the winning model's feature weights and parse the eli5 HTML
        # report into a DataFrame.
        html_object = eli5.show_weights(best_pipeline, feature_names=X_train.columns.tolist())
        result = pd.read_html(html_object.data)[0]
        data_dict['Model Interpretation'] = result
        val1 = 'Classifier with best accuracy: {}'.format(pipe_dict[best_classifier])
        lst.append(val1)
        print(val1)
y_pred = best_pipeline.predict(X_test)
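        # sklearn's confusion_matrix puts true labels on rows and predicted
        # labels on columns.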
cn = confusion_matrix(y_test, y_pred)
data_dict['Model details'] = lst
fig = px.histogram(df_models_info, x="Models", y="Accuracy", color="Models")
fig.update_layout(yaxis_title="Accuracy")
data_dict['model_comparison'] = fig
        data_dict['Best model'] = lst[-1].split(':')[1].strip()
data_dict['Best pipeline'] = best_pipeline
data_dict['Confusion Matrix'] = cn
        # Cross-validate only on moderately sized training sets (runtime guard).
        if len(X_train) <= 100000:
            cv = cross_val_score(best_pipeline, X_train, y_train, cv=5, scoring='accuracy')
            data_dict['Cross Validation'] = cv
        report = classification_report(y_test, y_pred)
        data_dict['Classification Report'] = report
        y_scores = best_pipeline.predict_proba(X_test)
        # One-hot encode the labels in order to plot them per class
        y_onehot = pd.get_dummies(y_test)
# Create an empty figure, and iteratively add new lines
# every time we compute a new class
fig = go.Figure()
fig.add_shape(
type='line', line=dict(dash='dash'),
x0=0, x1=1, y0=0, y1=1
)
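        # One-vs-rest ROC: for each class, column i of y_onehot marks whether
        # the true label is that class, and column i of predict_proba holds
        # the model's score for it.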
        for i in range(y_scores.shape[1]):
            y_true = y_onehot.iloc[:, i]
            y_score = y_scores[:, i]
            fpr, tpr, _ = roc_curve(y_true, y_score)
            auc_score = roc_auc_score(y_true, y_score)
            # Map the numeric class back to its original label using the
            # categorical-to-numeric encodings recorded in steps_dict.
            class_name = ""
            for encoding in steps_dict['categorical_to_numeric']:
                for key, value in encoding.items():
                    if key.split('_encoded')[0] == dependent_variable:
                        for label, encoded_value in value.items():
                            if encoded_value == y_onehot.columns[i]:
                                class_name = label
                                break
            name = f"{class_name} (AUC={auc_score:.2f})"
            fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))
fig.update_layout(
xaxis_title='False Positive Rate',
yaxis_title='True Positive Rate',
yaxis=dict(scaleanchor="x", scaleratio=1),
xaxis=dict(constrain='domain'),
width=700, height=500
)
data_dict['ROC Curve'] = fig
print("model completed")
return data_dict
    elif problem_type == "regression":
        data_dict = {}
        X_train, X_test, y_train, y_test = train_test_split(train_data, target_data,
                                                            test_size=0.3, random_state=0)
        pipeline_linear = Pipeline([('scaler_lin', StandardScaler()), ('linear_regressor', LinearRegression())])
        pipeline_dt = Pipeline([('scaler_dt', StandardScaler()), ('dt_regressor', DecisionTreeRegressor())])
        pipeline_randomforest = Pipeline([('scaler_rf', StandardScaler()), ('rf_regressor', RandomForestRegressor())])
        pipeline_svm = Pipeline([('scaler_svr', StandardScaler()), ('svr', SVR(kernel='linear'))])
        pipeline_regression = [pipeline_linear, pipeline_dt, pipeline_randomforest, pipeline_svm]
        # For regressors, score() returns R^2, which can be negative, so start
        # the running best from -inf rather than 0.
        best_score = float('-inf')
        best_regressor = 0
        best_pipeline = None
        ################## Dictionary of pipelines and regressor types for ease of reference ############
        pipe_dict = {0: 'Linear_Regression', 1: 'Decision_Tree', 2: 'RandomForest', 3: 'SVM'}
        for pipe in pipeline_regression:
            pipe.fit(X_train, y_train)
        models_info = {}
        for i, model in enumerate(pipeline_regression):
            score = model.score(X_test, y_test)
            val = "{} Test R2 Score: {}".format(pipe_dict[i], score)
            lst.append(val)
            models_info[pipe_dict[i]] = score
            print(val)
        df_models_info = pd.DataFrame(models_info.items(), columns=["Models", "R2 Score"])
        for i, model in enumerate(pipeline_regression):
            if models_info[pipe_dict[i]] > best_score:
                best_score = models_info[pipe_dict[i]]
                best_pipeline = model
                best_regressor = i
        # Explain the winning model's feature weights and parse the eli5 HTML
        # report into a DataFrame.
        html_object = eli5.show_weights(best_pipeline, feature_names=X_train.columns.tolist())
        result = pd.read_html(html_object.data)[0]
        data_dict['Model Interpretation'] = result
        val1 = 'Regressor with best R2 score: {}'.format(pipe_dict[best_regressor])
        lst.append(val1)
        print(val1)
        data_dict['Model details'] = lst
        fig = px.histogram(df_models_info, x="Models", y="R2 Score", color="Models")
        fig.update_layout(yaxis_title="R2 Score")
        data_dict['model_comparison'] = fig
        data_dict['Best model'] = lst[-1].split(':')[1].strip()
        data_dict['Best pipeline'] = best_pipeline
        y_pred = best_pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        statement_mse = "MEAN SQUARED ERROR : " + str(mse)
        statement_rmse = "ROOT MEAN SQUARED ERROR : " + str(rmse)
        statement_r2 = "R2 Score : " + str(r2)
        data_dict['MEAN SQUARED ERROR'] = statement_mse
        data_dict['ROOT MEAN SQUARED ERROR'] = statement_rmse
        data_dict['R2 Score'] = statement_r2
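        # For reference: MSE = (1/n) * sum_i (y_i - yhat_i)^2, RMSE = sqrt(MSE),
        # and R^2 = 1 - SS_res / SS_tot with SS_res = sum_i (y_i - yhat_i)^2
        # and SS_tot = sum_i (y_i - mean(y))^2.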
        # cross_val_score uses the estimator's default scorer (R^2) here.
        cv = cross_val_score(best_pipeline, X_train, y_train, cv=5)
        data_dict['Cross Validation'] = cv
        # Scatter actual vs. predicted target values over the test samples.
        fig = go.Figure([
            go.Scatter(y=y_test, name='Actual', mode='markers'),
            go.Scatter(y=y_pred, name='Predicted', mode='markers')
        ])
        fig.update_layout(
            title=str(lst[-1].split(':')[1].strip()),
            xaxis_title="Sample index",
            yaxis_title="Target values")
        data_dict['Regression graph'] = fig
return data_dict
    else:
        # Unknown problem type: return the empty default dictionary.
        return d
###############################################################################################
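############################ Example Usage ###################################
# A minimal sketch of how these steps might be wired together. The toy
# DataFrame, column names, and the empty `steps_dict` below are assumptions
# for illustration only; the real pipeline supplies its own cleaned data and
# encoding metadata.
if __name__ == "__main__":
    toy_data = pd.DataFrame({
        "feature_a": [1.0, 2.0, 3.0, 4.0] * 25,
        "feature_b": [0.5, 1.5, 2.5, 3.5] * 25,
        "target": [0, 1, 0, 1] * 25,  # integer labels -> classification
    })
    problem = get_problem_type1(toy_data, "target")
    results = model_build(toy_data, "target", problem,
                          balance_data="False",
                          steps_dict={"categorical_to_numeric": []})
    print("Problem type:", problem)
    print("Best model:", results.get("Best model"))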