# -*- coding: utf-8 -*-
"""
Created on Mon Jul 12 10:00:30 2021

@author: Kishore
"""

################## Importing Modules ###########################################
import math

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import eli5

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
#####################################################################


############# Identifying the problem type (Classification/Regression) in Predictive Analytics ##########
def get_problem_type1(clean_data, dependent_variable):
    limit_number_of_class = 10
    print("problem analysis")
    if (clean_data.dtypes[dependent_variable] == 'int32'
            or clean_data.dtypes[dependent_variable] == 'int64') \
            and (clean_data[dependent_variable].nunique() <= limit_number_of_class):
        return "classification"
    else:
        return "regression"
#########################################################################################################
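# Example of the decision rule above (a minimal sketch; the toy DataFrame and column
# names are hypothetical and not part of this project):
#
#     toy = pd.DataFrame({"age": [25.0, 32.5, 47.1], "churn": [0, 1, 0]})
#     get_problem_type1(toy, "churn")   # -> "classification" (integer target, few unique values)
#     get_problem_type1(toy, "age")     # -> "regression"     (float target)
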
######################### Model Building For Predictive Analytics ############################
def model_build(clean_data, dependent_variable, problem_type, balance_data, steps_dict):
    print("Model build started")
    d = {}
    lst = []

    ######## Split features and target ##########
    train_data = clean_data.drop(dependent_variable, axis=1)
    target_data = clean_data[dependent_variable]

    if problem_type == "classification":
        data_dict = {}

        ###### Handling Imbalanced Dataset ####################
        if balance_data == "Auto":
            d = {}
            d["Before Handling Imbalanced Dataset"] = target_data.value_counts()
            oversample = SMOTE()
            train_data, target_data = oversample.fit_resample(train_data, target_data)
            d["After Handling Imbalanced Dataset"] = target_data.value_counts()
            data_dict["Handling Imbalanced Dataset"] = d
        elif balance_data == "False":
            data_dict["Cannot Handle Imbalanced Dataset, it is set to False"] = ""

        X_train, X_test, y_train, y_test = train_test_split(train_data, target_data,
                                                            test_size=0.3, random_state=0)

        ###### Candidate models ####################
        # pipeline_lr = Pipeline([('scalar1', StandardScaler()),
        #                         ('lr_classifier', LogisticRegression(random_state=0))])
        pipeline_dt = Pipeline([('scalar2', StandardScaler()),
                                ('dt_classifier', DecisionTreeClassifier())])
        pipeline_randomforest = Pipeline([('scalar3', StandardScaler()),
                                          ('rf_classifier', RandomForestClassifier())])
        pipeline_xgboost = Pipeline([('scalar4', StandardScaler()),
                                     ('xg_classifier', XGBClassifier())])

        ############## Let's make the list of pipelines #####################
        pipelines = [pipeline_dt, pipeline_randomforest, pipeline_xgboost]

        best_accuracy = 0.0
        best_classifier = 0
        best_pipeline = ""

        ################## Dictionary of pipelines and classifier types for ease of reference ############
        pipe_dict = {0: 'Decision_Tree', 1: 'RandomForest', 2: 'XGBoost_Classifier'}

        ########## Fit the pipelines ##################
        for pipe in pipelines:
            pipe.fit(X_train, y_train)

        ########## Evaluate each pipeline on the hold-out set ##################
        models_info = {}
        for i, model in enumerate(pipelines):
            val = "{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test, y_test))
            lst.append(val)
            models_info[pipe_dict[i]] = model.score(X_test, y_test)
            print("{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test, y_test)))
        df_models_info = pd.DataFrame(models_info.items(), columns=["Models", "Accuracy"])

        ########## Keep the pipeline with the highest test accuracy ##################
        for i, model in enumerate(pipelines):
            if model.score(X_test, y_test) > best_accuracy:
                best_accuracy = model.score(X_test, y_test)
                best_pipeline = model
                best_classifier = i

        # Feature weights of the winning model, rendered by eli5 and parsed back into a DataFrame
        html_object = eli5.show_weights(best_pipeline, feature_names=X_train.columns.tolist())
        result = pd.read_html(html_object.data)[0]
        data_dict['Model Interpretation'] = result

        val1 = 'Classifier with best accuracy:{}'.format(pipe_dict[best_classifier])
        lst.append(val1)
        print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

        y_pred = best_pipeline.predict(X_test)
        cn = confusion_matrix(y_test, y_pred)
        data_dict['Model details'] = lst

        fig = px.histogram(df_models_info, x="Models", y="Accuracy", color="Models")
        fig.update_layout(yaxis_title="Accuracy")
        data_dict['model_comparison'] = fig
        data_dict['Best model'] = lst[-1].split(':')[1]
        data_dict['Best pipeline'] = best_pipeline
        data_dict['Confusion Matrix'] = cn

        # Cross-validate only on reasonably sized training sets to keep runtime bounded
        if len(X_train) <= 100000:
            cv = cross_val_score(best_pipeline, X_train, y_train, cv=5, scoring='accuracy')
            data_dict['Cross Validation'] = cv

        report = classification_report(y_test, y_pred)
        data_dict['Classification Report'] = report

        ########## One-vs-rest ROC curve for every class ##################
        y_scores = best_pipeline.predict_proba(X_test)
        # One-hot encode the labels in order to plot them
        y_onehot = pd.get_dummies(y_test, columns=best_pipeline.classes_)

        # Create an empty figure, and iteratively add a new line every time we compute a new class
        fig = go.Figure()
        fig.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)
        for i in range(y_scores.shape[1]):
            y_true = y_onehot.iloc[:, i]
            y_score = y_scores[:, i]
            fpr, tpr, _ = roc_curve(y_true, y_score)
            auc_score = roc_auc_score(y_true, y_score)

            # Recover the original class label from the encoding map stored in steps_dict
            class_name = ""
            for data1 in steps_dict['categorical_to_numeric']:
                for key, value in data1.items():
                    col_name = key.split('_encoded')[0]
                    if col_name == dependent_variable:
                        for j, v in value.items():
                            if v == y_onehot.columns[i]:
                                class_name = j
                                break
            name = f"{class_name} (AUC={auc_score:.2f})"
            fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

        fig.update_layout(
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            yaxis=dict(scaleanchor="x", scaleratio=1),
            xaxis=dict(constrain='domain'),
            width=700, height=500
        )
        data_dict['ROC Curve'] = fig

        print("model completed")
        return data_dict
    elif problem_type == "regression":
        data_dict = {}
        X_train, X_test, y_train, y_test = train_test_split(train_data, target_data,
                                                            test_size=0.3, random_state=0)

        ###### Candidate models ####################
        pipeline_linear = Pipeline([('scalar1', StandardScaler()),
                                    ('linear_cdt_regressor', LinearRegression())])
        # pipeline_lr = Pipeline([('scalar2', StandardScaler()), ('lr_regressor', LogisticRegression())])
        pipeline_dt = Pipeline([('scalar2', StandardScaler()),
                                ('dt_regressor', DecisionTreeRegressor())])
        pipeline_randomforest = Pipeline([('scalar3', StandardScaler()),
                                          ('rf_regressor', RandomForestRegressor())])
        pipeline_svm = Pipeline([('scalar4', StandardScaler()),
                                 ('svr', SVR(kernel='linear'))])
        pipeline_regression = [pipeline_linear, pipeline_dt, pipeline_randomforest, pipeline_svm]

        # R2 can be negative, so start the search below any possible score
        best_accuracy = float('-inf')
        best_regressor = 0
        best_pipeline = ""

        ################## Dictionary of pipelines and regressor types for ease of reference ############
        # pipe_dict = {0: 'Linear_Regression', 1: 'Logistic_Regression', 2: 'Decision_Tree', 3: 'RandomForest', 4: 'SVM'}
        pipe_dict = {0: 'Linear_Regression', 1: 'Decision_Tree', 2: 'RandomForest', 3: 'SVM'}

        ########## Fit the pipelines ##################
        for pipe in pipeline_regression:
            pipe.fit(X_train, y_train)

        ########## Evaluate each pipeline on the hold-out set ##################
        # Note: for regressors, Pipeline.score returns the R2 coefficient of determination
        models_info = {}
        for i, model in enumerate(pipeline_regression):
            val = "{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test, y_test))
            lst.append(val)
            models_info[pipe_dict[i]] = model.score(X_test, y_test)
            print("{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test, y_test)))
        df_models_info = pd.DataFrame(models_info.items(), columns=["Models", "Accuracy"])

        ########## Keep the pipeline with the highest test score ##################
        for i, model in enumerate(pipeline_regression):
            if model.score(X_test, y_test) > best_accuracy:
                best_accuracy = model.score(X_test, y_test)
                best_pipeline = model
                best_regressor = i

        # Feature weights of the winning model, rendered by eli5 and parsed back into a DataFrame
        html_object = eli5.show_weights(best_pipeline, feature_names=X_train.columns.tolist())
        result = pd.read_html(html_object.data)[0]
        data_dict['Model Interpretation'] = result

        val1 = 'Regressor with best accuracy:{}'.format(pipe_dict[best_regressor])
        lst.append(val1)
        print('Regressor with best accuracy:{}'.format(pipe_dict[best_regressor]))
        data_dict['Model details'] = lst

        fig = px.histogram(df_models_info, x="Models", y="Accuracy", color="Models")
        fig.update_layout(yaxis_title="Accuracy")
        data_dict['model_comparison'] = fig
        data_dict['Best model'] = lst[-1].split(':')[1]
        data_dict['Best pipeline'] = best_pipeline

        ########## Error metrics on the hold-out set ##################
        y_pred = best_pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        statement_mse = "MEAN SQUARED ERROR : " + str(mse)
        statement_rmse = "ROOT MEAN SQUARED ERROR : " + str(rmse)
        statement_r2 = "R2 Score : " + str(r2)
        data_dict['MEAN SQUARED ERROR'] = statement_mse
        data_dict['ROOT MEAN SQUARED ERROR'] = statement_rmse
        data_dict['R2 Score'] = statement_r2

        cv = cross_val_score(best_pipeline, X_train, y_train, cv=5)
        data_dict['Cross Validation'] = cv

        ########## Actual vs predicted scatter plot ##################
        fig = go.Figure([
            go.Scatter(y=y_test, name='Actual', mode='markers'),
            go.Scatter(y=y_pred, name='Predicted', mode='markers')
        ])
        fig.update_layout(title=str(lst[-1].split(':')[1]),
                          xaxis_title="Count",
                          yaxis_title="Target values")
        data_dict['Regression graph'] = fig
        return data_dict

    else:
        return d
###############################################################################################
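
# Usage sketch (not part of the original pipeline): the CSV path, column names and the
# structure of `steps_dict` below are hypothetical, inferred from how model_build reads
# steps_dict['categorical_to_numeric'] to map encoded target values back to class labels.
if __name__ == "__main__":
    # Assumes a pre-cleaned dataset whose target column "species" is already label-encoded
    clean_data = pd.read_csv("cleaned_dataset.csv")   # hypothetical file
    dependent_variable = "species"                    # hypothetical target column (integer codes)

    # Encoding map recorded upstream: {original_label: encoded_value} keyed by "<column>_encoded"
    steps_dict = {
        "categorical_to_numeric": [
            {"species_encoded": {"setosa": 0, "versicolor": 1, "virginica": 2}}
        ]
    }

    problem_type = get_problem_type1(clean_data, dependent_variable)
    results = model_build(clean_data, dependent_variable, problem_type,
                          balance_data="Auto", steps_dict=steps_dict)
    print(results.get("Model details"))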