Spaces:
Build error
Build error
# -*- coding: utf-8 -*- | |
""" | |
Created on Mon Jul 12 10:00:30 2021 | |
@author: Kishore | |
""" | |
################## Importing Modules ########################################### | |
from sklearn.metrics import confusion_matrix, classification_report | |
from sklearn.model_selection import cross_val_score | |
from sklearn.model_selection import train_test_split | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.metrics import mean_squared_error | |
from sklearn.metrics import r2_score | |
import math | |
import pandas as pd | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.linear_model import LinearRegression | |
from sklearn.tree import DecisionTreeRegressor | |
from sklearn.pipeline import Pipeline | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.svm import SVR | |
from xgboost import XGBClassifier | |
from imblearn.over_sampling import SMOTE | |
from sklearn.metrics import roc_curve, auc,roc_auc_score | |
import plotly.express as px | |
import plotly.graph_objects as go | |
import eli5 | |
##################################################################### | |
############# Identifying the problem type (Classification/Regression) in Predictive Analytics ########## | |
def get_problem_type1(clean_data, dependent_variable, limit_number_of_class=10):
    """Decide whether the target column calls for classification or regression.

    The target is treated as a class label when its dtype is an integer type
    and it holds at most ``limit_number_of_class`` distinct values. Every
    other target (floats, strings, high-cardinality integers) is reported as
    regression.

    Args:
        clean_data: DataFrame that contains the target column.
        dependent_variable: Name of the target column in ``clean_data``.
        limit_number_of_class: Largest number of distinct target values that
            is still considered a classification problem.

    Returns:
        The string ``"classification"`` or ``"regression"``.

    Raises:
        KeyError: If ``dependent_variable`` is not a column of ``clean_data``.
    """
    print("problem analysis")
    target = clean_data[dependent_variable]
    # Accept every integer width (int8/int16/unsigned/nullable), not only
    # int32 and int64: label-encoded or downcast targets are often stored in
    # a narrower dtype and must not silently fall through to regression.
    is_integer_target = pd.api.types.is_integer_dtype(target.dtype)
    if is_integer_target and target.nunique() <= limit_number_of_class:
        return "classification"
    return "regression"
######################################################################################################### | |
######################### Model Building For Predictive Analytics ############################
def model_build(clean_data, dependent_variable, problem_type, balance_data, steps_dict):
    """Train several candidate models and report on the best one.

    The target column is split off from ``clean_data``, a 70/30 train/test
    split is made (``random_state=0``), every candidate pipeline is fitted,
    and the pipeline with the highest hold-out score is summarised.

    Args:
        clean_data: DataFrame holding the feature columns and the target.
        dependent_variable: Name of the target column.
        problem_type: ``"classification"`` or ``"regression"``; any other
            value yields an empty result.
        balance_data: Only used for classification. ``"Auto"`` oversamples
            with SMOTE; the string ``"False"`` records that balancing was
            skipped; any other value does nothing.
        steps_dict: Only used for classification, to label the ROC curves.
            Its ``'categorical_to_numeric'`` entry is iterated as a sequence
            of ``{"<column>_encoded...": {label: code}}`` dicts (shape
            inferred from how it is read here; verify against the caller).

    Returns:
        A dict of report entries (tables, figures, metrics and the fitted
        best pipeline), or an empty dict for an unsupported ``problem_type``.

    Raises:
        KeyError: If ``dependent_variable`` is not a column of ``clean_data``.
    """
    print("Model build started")
    train_data = clean_data.drop(dependent_variable, axis=1)
    target_data = clean_data[dependent_variable]

    if problem_type == "classification":
        return _build_classification(
            train_data, target_data, dependent_variable, balance_data, steps_dict
        )
    if problem_type == "regression":
        return _build_regression(train_data, target_data)
    return {}


def _rank_pipelines(pipelines, names, X_train, X_test, y_train, y_test, details):
    """Fit all pipelines, score each once on the test split and pick the best.

    One ``"<name> Test Accuracy: <score>"`` line per model is appended to
    ``details`` and printed. The label says "Accuracy" for both problem
    types, although the value is whatever the estimator's ``score`` returns.

    Returns:
        ``(comparison, best_index)`` where ``comparison`` is a DataFrame with
        columns ``Models`` and ``Accuracy`` and ``best_index`` is the position
        of the highest-scoring pipeline (the first one on ties).
    """
    for pipe in pipelines:
        pipe.fit(X_train, y_train)

    # Score each fitted model exactly once and reuse the value everywhere.
    scores = [pipe.score(X_test, y_test) for pipe in pipelines]
    for name, score in zip(names, scores):
        line = "{} Test Accuracy: {}".format(name, score)
        details.append(line)
        print(line)

    comparison = pd.DataFrame(list(zip(names, scores)), columns=["Models", "Accuracy"])

    # Start from -inf rather than 0.0 so that a model is always selected,
    # even when every score is zero or negative (e.g. all-negative R2).
    best_index = 0
    best_score = float("-inf")
    for index, score in enumerate(scores):
        if score > best_score:
            best_score = score
            best_index = index
    return comparison, best_index


def _explain_pipeline(pipeline, feature_names):
    """Return eli5's weight table for ``pipeline`` as a DataFrame."""
    from io import StringIO

    html_object = eli5.show_weights(pipeline, feature_names=feature_names)
    # Recent pandas releases deprecate passing literal HTML to read_html;
    # a file-like wrapper is accepted by old and new versions alike.
    return pd.read_html(StringIO(html_object.data))[0]


def _comparison_figure(comparison):
    """Build the per-model score bar chart from the comparison table."""
    fig = px.histogram(comparison, x="Models", y="Accuracy", color="Models")
    fig.update_layout(yaxis_title="Accuracy")
    return fig


def _lookup_class_name(steps_dict, dependent_variable, encoded_class):
    """Find the original label that was encoded as ``encoded_class``.

    Scans every mapping in ``steps_dict['categorical_to_numeric']`` whose key
    (up to ``'_encoded'``) equals ``dependent_variable``. Within a mapping the
    first matching label wins; a later mapping overrides an earlier one.

    Returns:
        The matching label, or ``None`` when no mapping contains the value.
    """
    class_name = None
    for mapping in steps_dict['categorical_to_numeric']:
        for key, value in mapping.items():
            if key.split('_encoded')[0] != dependent_variable:
                continue
            for label, code in value.items():
                if code == encoded_class:
                    class_name = label
                    break
    return class_name


def _roc_figure(best_pipeline, X_test, y_test, dependent_variable, steps_dict):
    """Draw one-vs-rest ROC curves for every class of the best classifier."""
    y_scores = best_pipeline.predict_proba(X_test)

    fig = go.Figure()
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    # Iterate the classifier's own classes so that column ``column`` of
    # ``predict_proba`` is always paired with the indicator of the same class,
    # even when a class is missing from the test split.
    for column, encoded_class in enumerate(best_pipeline.classes_):
        y_true = (y_test == encoded_class).astype(int)
        if y_true.nunique() < 2:
            # ROC/AUC is undefined without both positives and negatives.
            continue
        y_score = y_scores[:, column]
        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc_score = roc_auc_score(y_true, y_score)

        class_name = _lookup_class_name(steps_dict, dependent_variable, encoded_class)
        if class_name is None:
            # No label mapping for this target: show the encoded value
            # instead of leaving the legend entry blank.
            class_name = encoded_class
        name = f"{class_name} (AUC={auc_score:.2f})"
        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

    fig.update_layout(
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=700, height=500
    )
    return fig


def _build_classification(train_data, target_data, dependent_variable, balance_data, steps_dict):
    """Fit the classifier candidates and assemble the classification report."""
    data_dict = {}
    details = []

    # NOTE(review): SMOTE runs before the train/test split, so synthetic
    # samples derived from test rows can end up in the training set. Kept
    # as-is because changing it alters every reported metric; confirm intent.
    if balance_data == "Auto":
        balance_info = {"Before Handling Imbalanced Dataset": target_data.value_counts()}
        train_data, target_data = SMOTE().fit_resample(train_data, target_data)
        balance_info["After Handling Imbalanced Dataset"] = target_data.value_counts()
        data_dict["Handling Imbalanced Dataset"] = balance_info
    elif balance_data == "False":
        data_dict["Cannot Handle Imbalanced Dataset,It is set to False"] = ""

    X_train, X_test, y_train, y_test = train_test_split(
        train_data, target_data, test_size=0.3, random_state=0
    )

    names = ['Decision_Tree', 'RandomForest', 'XGBoost_Classifier']
    pipelines = [
        Pipeline([('scalar2', StandardScaler()),
                  ('dt_classifier', DecisionTreeClassifier())]),
        Pipeline([('scalar3', StandardScaler()),
                  ('rf_classifier', RandomForestClassifier())]),
        Pipeline([('scalar4', StandardScaler()),
                  ('xg_classifier', XGBClassifier())]),
    ]

    comparison, best_index = _rank_pipelines(
        pipelines, names, X_train, X_test, y_train, y_test, details
    )
    best_pipeline = pipelines[best_index]
    best_name = names[best_index]

    data_dict['Model Interpretation'] = _explain_pipeline(
        best_pipeline, X_train.columns.tolist()
    )

    best_line = 'Classifier with best accuracy:{}'.format(best_name)
    details.append(best_line)
    print(best_line)

    y_pred = best_pipeline.predict(X_test)
    data_dict['Model details'] = details
    data_dict['model_comparison'] = _comparison_figure(comparison)
    data_dict['Best model'] = best_name
    data_dict['Best pipeline'] = best_pipeline
    data_dict['Confusion Matrix'] = confusion_matrix(y_test, y_pred)
    # Cross-validation refits the model five times; skip it on large data.
    if len(X_train) <= 100000:
        data_dict['Cross Validation'] = cross_val_score(
            best_pipeline, X_train, y_train, cv=5, scoring='accuracy'
        )
    data_dict['Classification Report'] = classification_report(y_test, y_pred)
    data_dict['ROC Curve'] = _roc_figure(
        best_pipeline, X_test, y_test, dependent_variable, steps_dict
    )
    print("model completed")
    return data_dict


def _build_regression(train_data, target_data):
    """Fit the regressor candidates and assemble the regression report."""
    data_dict = {}
    details = []

    X_train, X_test, y_train, y_test = train_test_split(
        train_data, target_data, test_size=0.3, random_state=0
    )

    names = ['Linear_Regression', 'Decision_Tree', 'RandomForest', 'SVM']
    pipelines = [
        Pipeline([('scalar1', StandardScaler()),
                  ('linear_cdt_regressor', LinearRegression())]),
        Pipeline([('scalar2', StandardScaler()),
                  ('dt_regressor', DecisionTreeRegressor())]),
        Pipeline([('scalar3', StandardScaler()),
                  ('rf_regressor', RandomForestRegressor())]),
        Pipeline([('scalar4', StandardScaler()),
                  ('svr', SVR(kernel='linear'))]),
    ]

    comparison, best_index = _rank_pipelines(
        pipelines, names, X_train, X_test, y_train, y_test, details
    )
    best_pipeline = pipelines[best_index]
    best_name = names[best_index]

    data_dict['Model Interpretation'] = _explain_pipeline(
        best_pipeline, X_train.columns.tolist()
    )

    best_line = 'Regressor with best accuracy:{}'.format(best_name)
    details.append(best_line)
    print(best_line)

    data_dict['Model details'] = details
    data_dict['model_comparison'] = _comparison_figure(comparison)
    data_dict['Best model'] = best_name
    data_dict['Best pipeline'] = best_pipeline

    y_pred = best_pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    data_dict['MEAN SQUARED ERROR'] = "MEAN SQUARED ERROR : " + str(mse)
    data_dict['ROOT MEAN SQUARED ERROR'] = "ROOT MEAN SQUARED ERROR : " + str(rmse)
    data_dict['R2 Score'] = "R2 Score : " + str(r2)
    data_dict['Cross Validation'] = cross_val_score(best_pipeline, X_train, y_train, cv=5)

    fig = go.Figure([
        go.Scatter(y=y_test, name='Actual', mode='markers'),
        go.Scatter(y=y_pred, name='Predicted', mode='markers')
    ])
    fig.update_layout(
        title=best_name,
        xaxis_title="Count",
        yaxis_title="Target values")
    data_dict['Regression graph'] = fig
    return data_dict
############################################################################################### | |