# -*- coding: utf-8 -*-
"""Streamlit AutoML front-end ("AiNext").

Lets a user upload a CSV and run either a Supervised experiment
(Predictive Analytics or NLP text classification, auto-detected from the
column layout) or an UnSupervised experiment (KMeans clustering).  Heavy
work is delegated to project modules (autoclean, model_pipeline_steps,
DA_P1, NLP_text_classification, kmeans); this file is only the UI flow.

NOTE(review): this module was reconstructed from a whitespace-mangled
copy; several string literals (``hide_streamlit_style``, the
``html_string`` badges, the download ``href`` f-strings) appear to have
had their HTML content stripped during mangling — confirm against the
original file.
"""

#### Importing Modules ####
import base64
import pandas as pd
import streamlit as st
from autoclean import data_clean
from model_pipeline_steps import get_problem_type1, model_build
from PIL import Image
from DA_P1 import get_details, imbalnce_ratio, word_cloud, plotly_target, plot_ngram
import pickle
from NLP_text_classification import model_train, predict_text, predict_csv
from kmeans import k_means
from jinja2.ext import i18n  # NOTE(review): not referenced below — confirm it is needed

# Module-level scratch dict shared across Streamlit reruns within one script
# execution; holds cleaned data, the trained model and the cleaning steps so
# later UI sections (results, prediction) can reach them.
info = {}


#********* Caching return values to avoid recomputation on every Streamlit rerun ******#
@st.cache(allow_output_mutation=True)
def get_details_local(data):
    """Cached wrapper around DA_P1.get_details (dataset profiling/EDA)."""
    final_output = get_details(data)
    return final_output


@st.cache(allow_output_mutation=True)
def clean(dataset, drop_features):
    """Cached wrapper around autoclean.data_clean.

    Returns the cleaned dataframe and a dict describing the cleaning steps
    (auto-dropped columns, missing-value fills, categorical encodings, ...).
    """
    cleaned_data, steps_dict = data_clean(dataset, drop_features)
    return cleaned_data, steps_dict


@st.cache(allow_output_mutation=True)
def get_problem_type_local(cleaned_data, target_data):
    """Cached wrapper around get_problem_type1 (classification vs regression)."""
    p_type = get_problem_type1(cleaned_data, target_data)
    return p_type


@st.cache(allow_output_mutation=True)
def model_build_local(cleaned_data, target_data, p_type, balance_data, steps_dict):
    """Cached wrapper around model_build; returns the model-result dict."""
    model = model_build(cleaned_data, target_data, p_type, balance_data, steps_dict)
    return model


@st.cache(allow_output_mutation=True)
def model_train_local(dataset, input_feature, target_data, balance_data):
    """Cached wrapper around NLP model_train; returns the model-info dict."""
    model_info = model_train(dataset, input_feature, target_data, balance_data)
    return model_info


@st.cache(allow_output_mutation=True)
def word_cloud_local(dataset, input_col):
    """Cached wrapper around DA_P1.word_cloud for the chosen text column."""
    plt = word_cloud(dataset, input_col)
    return plt


@st.cache(allow_output_mutation=True)
def plotly_target_local(dataset, tg_col):
    """Cached wrapper around DA_P1.plotly_target (target distribution plot)."""
    plt = plotly_target(dataset, tg_col)
    return plt


@st.cache(allow_output_mutation=True)
def plot_ngram_local(dataset, tg_col):
    """Cached wrapper around DA_P1.plot_ngram (bigram frequency plot)."""
    plt = plot_ngram(dataset, tg_col)
    return plt
#******************************************************************#


def main():
    """Render the whole AiNext page; top-level exceptions land in st.header."""
    try:
        # setting tab title and icon
        st.set_page_config(page_title="AiNext", page_icon="image.png")
        # Hiding streamlit watermark
        # NOTE(review): the CSS payload of this style block appears to have
        # been lost in mangling — confirm against the original file.
        hide_streamlit_style = """ """
        st.markdown(hide_streamlit_style, unsafe_allow_html=True)
        # To do Navigation Menu
        st.markdown('', unsafe_allow_html=True)
        st.markdown(""" """, unsafe_allow_html=True)
        # Image in sidebar and link to mail
        image_loan = Image.open("AI.jpg")
        st.sidebar.image(image_loan, use_column_width=True)
        st.sidebar.markdown(
            """Mail us at - technology.coe@digital.datamatics.com""", unsafe_allow_html=True)
        # Upload CSV File
        st.header("Upload Input csv file")
        file_upload = st.file_uploader(" ", type=["csv"])
        if file_upload is not None:
            # Selecting Experiment type (Supervised or UnSupervised)
            st.subheader("Select the Experiment type")
            exp_type = st.selectbox(label=' ', options=['Select', 'Supervised', 'UnSupervised'])
            print(exp_type)  # debug trace to the server console
            # **************************** Supervised Section ********************************** #
            if exp_type == "Supervised":
                st.subheader("Supervised")
                # read Dataset
                dataset = pd.read_csv(file_upload)
                # read columns
                cols = dataset.columns.tolist()
                st.text(" ")
                # Selecting features to drop
                st.subheader("choose the features which you want to drop")
                drop_features = st.multiselect('', cols)
                # print(drop_features)
                # Selecting target feature (defaults to the last column)
                st.text(" ")
                st.subheader("Pick Your Target feature")
                target_data = st.selectbox(label=' ', options=cols, index=len(cols) - 1)
                # print(target_data)
                # **** Following code identifies whether the problem is NLP text
                # classification or Predictive Analytics: if exactly one input
                # column remains after drops + target and it is of object dtype,
                # it is treated as free text. **** #
                total_len = len(cols)
                drop_len = len(drop_features)
                problem_statement = ""
                input_feature_temp = ""
                st.sidebar.text(" ")
                sidebar_col1, sidebar_col2, sidebar_col3 = st.sidebar.beta_columns(3)
                if st.checkbox("Check Problem Type"):
                    if (target_data not in drop_features) and ((total_len - drop_len) == 2):
                        # Exactly one candidate input feature left.
                        temp_data = dataset.drop(drop_features, axis=1)
                        temp_data = temp_data.drop(target_data, axis=1)
                        temp_col = temp_data.columns.tolist()
                        print(temp_data.dtypes[temp_col[0]])
                        if temp_data.dtypes[temp_col[0]] == "object":
                            print("NLP text Classification")
                            # NOTE(review): badge HTML appears stripped — confirm.
                            html_string = ""
                            sidebar_col2.markdown(html_string, unsafe_allow_html=True)
                            problem_statement = "NLP text Classification"
                            input_feature_temp = temp_col[0]
                        else:
                            html_string = ""
                            sidebar_col2.markdown(html_string, unsafe_allow_html=True)
                            print("Predictive Analytics")
                            problem_statement = "Predictive Analytics"
                    elif (target_data not in drop_features) and ((total_len - drop_len) > 2):
                        # More than one input feature -> tabular prediction.
                        html_string = ""
                        sidebar_col2.markdown(html_string, unsafe_allow_html=True)
                        print("Predictive Analytics")
                        problem_statement = "Predictive Analytics"
                    elif (target_data in drop_features):
                        st.error("Selected Target column is also selected to drop.So Can't proceed")
                #******************************************************************************************#
                # *********************** Predictive Analytics Section *************************************#
                if problem_statement == "Predictive Analytics" and problem_statement != "":
                    # ************ Data Analysis Code goes here ********** #
                    final_output = get_details_local(dataset)
                    # print(dataset)
                    st.text(" ")
                    first = dataset.head(10)
                    # last = dataset.tail(10)
                    if st.button("Click here to Analyze Data"):
                        container = st.beta_expander("Data Analysis and visualization Details")
                        # c1,c2=container.beta_columns(2)
                        container.subheader("First 10 Rows")
                        container.write(first)
                        # c2.subheader("Last 10 Rows")
                        # c2.write(last)
                        container.text(" ")
                        # Dataset overview: statistics / variable types / reproduction info.
                        overview_con = container.beta_container()
                        overview_con.subheader("Overview of Dataset")
                        overview_con.text(" ")
                        ov_c1, ov_c2, ov_c3 = overview_con.beta_columns(3)
                        ov_c1.write("Statistics")
                        for key, value in final_output['overview']['data_statistics'].items():
                            temp = str(key) + ": " + str(value)
                            ov_c1.text(temp)
                        ov_c2.write("Variable Info")
                        for key, value in final_output['overview']['variable_type'].items():
                            temp = str(key) + ": " + str(value)
                            ov_c2.text(temp)
                        ov_c3.write("Reproduction")
                        for key, value in final_output['reproduction'].items():
                            temp = str(key) + ": " + str(value)
                            ov_c3.text(temp)
                        container.text(" ")
                        # Per-numeric-column stats spread over 4 columns; entries
                        # past position 24 are assumed to be plotly figures.
                        numeric_con = container.beta_container()
                        numeric_con.subheader("Numeric Variable Information")
                        numeric_con.text(" ")
                        for key, value in final_output['numerical_variable_info']['variable_info'].items():
                            numeric_con.text(" ")
                            temp_key = "Numeric Column:" + str(key)
                            numeric_con.write(temp_key)
                            num_c1, num_c2, num_c3, num_c4 = numeric_con.beta_columns(4)
                            i = 1
                            for key1, value1 in value.items():
                                temp = str(key1) + ": " + str(value1)
                                if (i <= 7):
                                    num_c1.text(temp)
                                elif (i > 7 and i <= 14):
                                    num_c2.text(temp)
                                elif (i > 14 and i <= 21):
                                    num_c3.text(temp)
                                elif i > 21 and i <= 24:
                                    num_c4.text(temp)
                                elif i > 24:
                                    numeric_con.plotly_chart(value1, config={'displaylogo': False})
                                i = i + 1
                        container.text(" ")
                        # Per-categorical-column stats; entries past position 16
                        # are assumed to be plotly figures.
                        categorical_con = container.beta_container()
                        categorical_con.subheader("Categorical Variable Information")
                        categorical_con.text(" ")
                        for key, value in final_output['categorical_variable_info']['variable_info'].items():
                            categorical_con.text(" ")
                            temp_key = "Categorical Column:" + str(key)
                            categorical_con.write(temp_key)
                            num_c1, num_c2, num_c3, num_c4 = categorical_con.beta_columns(4)
                            i = 1
                            for key1, value1 in value.items():
                                temp = str(key1) + ": " + str(value1)
                                if (i <= 5):
                                    num_c1.text(temp)
                                elif (i > 5 and i <= 10):
                                    num_c2.text(temp)
                                elif (i > 10 and i <= 15):
                                    num_c3.text(temp)
                                elif i > 15 and i <= 16:
                                    num_c4.text(temp)
                                elif i > 16:
                                    categorical_con.plotly_chart(value1, config={'displaylogo': False})
                                i = i + 1
                        container.text(" ")
                        container.text("Scatter chart Matrix")
                        container.plotly_chart(final_output['scatter_chart_matrix'],config = {'displaylogo': False})
                        container.text(" ")
                        container.text(" ")
                        # Correlation matrices, one plotly figure per entry.
                        corr_con = container.beta_container()
                        corr_con.subheader("Correlation Matrix Information")
                        corr_con.text(" ")
                        # corr_c1, corr_c2, corr_c3 = corr_con.beta_columns(3)
                        # j = 0
                        for key1, value1 in final_output['correlation_matrix_info'].items():
                            corr_con.text(" ")
                            corr_con.write(key1)
                            # col.pyplot(value)
                            corr_con.plotly_chart(value1, config={'displaylogo': False})
                            # col.plotly_chart(value1,use_container_width=True)
                            # j=j+1
                        container.text(" ")
                        # Missing-values plots laid out over a 2x2 grid; only the
                        # first four entries get a dedicated cell (k is not wrapped).
                        missing_con = container.beta_container()
                        missing_con.subheader("Missing Values Information")
                        missing_con.text(" ")
                        mis_c1, mis_c2 = missing_con.beta_columns(2)
                        mis_c3, mis_c4 = missing_con.beta_columns(2)
                        k = 0
                        for key, value in final_output['missing_values_info'].items():
                            corr_con.text(" ")
                            col = mis_c1
                            if k == 0:
                                col = mis_c1
                            elif k == 1:
                                col = mis_c2
                            elif k == 2:
                                col = mis_c3
                            elif k == 3:
                                col = mis_c4
                            col.write(key)
                            col.pyplot(value)
                            k = k + 1
                    # ********************************************************#
                    # ****** Option for handling Imbalanced Dataset ******#
                    st.text(" ")
                    ir_res = imbalnce_ratio(dataset, target_data)
                    ir_res = "Imbalance Ratio (" + ir_res + ")"
                    st.subheader("Select below option to Handle Imbalanced Dataset (optional)")
                    st.text(ir_res)
                    balance_data = st.selectbox(label=' ', options=["Auto", "False"])
                    #*******************************************************#
                    #********* Data Cleaning and Model Building code goes here *********#
                    st.text(" ")
                    if (st.checkbox('Start build model') is True) and (target_data not in drop_features):
                        st.text(" ")
                        cleaned_data, steps_dict = clean(dataset, drop_features)
                        sample_data = cleaned_data.head()
                        # Stash everything the result/prediction sections need.
                        info['clean_data'] = sample_data
                        info['auto_drop'] = steps_dict['auto_drop']
                        p_type = get_problem_type_local(cleaned_data, target_data)
                        statement_ptype = "Problem type :" + p_type
                        info['problem'] = statement_ptype
                        statement_target = "Target column: " + target_data
                        info['target_statement'] = statement_target
                        info['target'] = target_data
                        model = model_build_local(cleaned_data, target_data, p_type, balance_data, steps_dict)
                        info['model'] = model
                        info['step_dict'] = steps_dict
                    elif target_data in drop_features:
                        st.error("Selected Target column is also selected to drop.So Can't proceed")
                    #**********************************************************************************#
                    # print(info)
                    # ******************* Model Result ***********************#
                    if info:
                        for columns in info['auto_drop']:
                            txt = "automatically dropped column: " + columns
                            st.write(txt)
                        st.text(" ")
                        st.subheader("After Cleaning data")
                        st.write(info['clean_data'])
                        st.write(info['problem'])
                        st.write(info['target_statement'])
                        # print(info['model'])
                        # Render each entry of the model-result dict with the
                        # widget appropriate to its key.
                        for key, val in info['model'].items():
                            st.text(" ")
                            # if key == "Regression graph" :
                            #     st.write(key)
                            #     st.pyplot(val)
                            if key == "Best pipeline" or key == "step_dict":
                                pass
                            elif key == "ROC Curve" or key == "model_comparison" or key == "Regression graph":
                                st.write(key)
                                st.plotly_chart(val, config={'displaylogo': False})
                            elif key == "Classification Report":
                                st.write(key)
                                st.text(val)
                            elif key == "Handling Imbalanced Dataset":
                                st.write(key)
                                for key1, val1 in val.items():
                                    st.write(key1)
                                    st.text(val1)
                            else:
                                st.write(key)
                                st.write(val)
                        st.text(" ")
                        st.text(" ")
                        # ***************************************************************#
                        # ************************** Prediction **************************#
                        st.subheader("Upload csv file for Predictions : ")
                        file_upload1 = st.file_uploader(" ", type=["csv"])
                        print(file_upload1)
                        if file_upload1 is not None:
                            try:
                                test_data = pd.read_csv(file_upload1)
                                data = test_data.copy()
                                # Re-apply the training-time cleaning steps: drop,
                                # fill missing values, re-encode categoricals.
                                data.drop(info['step_dict']['dropped_features'], axis=1, inplace=True)
                                for col in data.columns:
                                    data[col].fillna(info['step_dict']['missing_values'][col], inplace=True)
                                # print(info['target'])
                                for data1 in info['step_dict']['categorical_to_numeric']:
                                    for key, value in data1.items():
                                        col_name = key.split('_encoded')[0]
                                        if col_name != info['target']:
                                            # print(col_name)
                                            # print(value)
                                            data[col_name].replace(value, inplace=True)
                                if info['target'] in data.columns:
                                    data.drop([info['target']], axis=1, inplace=True)
                                final_model = info['model']['Best pipeline']
                                # print(final_model)
                                predictions = final_model.predict(data)
                                # print(predictions)
                                print(len(test_data))
                                print(len(predictions))
                                predict_column_name = info['target'] + "_prediction"
                                test_data[predict_column_name] = predictions
                                # Map encoded target predictions back to the
                                # original labels by inverting the encoding dict.
                                for data1 in info['step_dict']['categorical_to_numeric']:
                                    for key, value in data1.items():
                                        col_name = key.split('_encoded')[0]
                                        if col_name == info['target']:
                                            # print(col_name)
                                            # print(value)
                                            d = {}
                                            for i, v in value.items():
                                                d[v] = i
                                            test_data[predict_column_name].replace(d, inplace=True)
                                # csv = test_data.to_csv(index=False)
                                # b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
                                # href = f'Download The Prediction Results CSV File (right-click and save as <some_name>.csv)'
                                # Offer the predictions and the pickled model as
                                # base64 download links.
                                # NOTE(review): the <a href=...> markup of these
                                # f-strings appears stripped — confirm.
                                csv = test_data.to_csv(index=False)
                                b64 = base64.b64encode(csv.encode()).decode()
                                href = f'Download Predicted file'
                                st.markdown(href, unsafe_allow_html=True)
                                output_model = pickle.dumps(final_model)
                                b64 = base64.b64encode(output_model).decode()
                                href = f'Download Best Model .pkl File '
                                st.markdown(href, unsafe_allow_html=True)
                            except Exception as e:
                                st.text(e)
                                st.error("Uploaded wrong data for prediction")
                # ***************************************************************************#
                # *********************** End of Predictive Analytics Section *************************************#
                # *********************** NLP text Classification Section *************************************#
                elif problem_statement == "NLP text Classification" and problem_statement != "":
                    try:
                        # ********* Data Analysis and visualization code ************** #
                        st.text(" ")
                        vis_con = st.beta_expander("Data Visualization")
                        st.text(" ")
                        vis_con.subheader("Select Input Feature")
                        select_col = ["Select"]
                        t_cols = select_col + cols
                        input_col = vis_con.selectbox(label=' ', options=t_cols)
                        st.set_option('deprecation.showPyplotGlobalUse', False)
                        res = word_cloud_local(dataset, input_col)
                        if res is not None:
                            vis_con.plotly_chart(res)
                        true_bigrams = plot_ngram_local(dataset, input_col)
                        if true_bigrams is not None:
                            vis_con.plotly_chart(true_bigrams, config={'displaylogo': False})
                        st.text(" ")
                        vis_con.subheader("Select target Feature")
                        tg_col = vis_con.selectbox(label=' ', options=t_cols)
                        plot_res = plotly_target_local(dataset, tg_col)
                        if plot_res is not None:
                            vis_con.plotly_chart(plot_res, config={'displaylogo': False})
                        #*****************************************************************************************#
                        # ****** Option for handling Imbalanced Dataset ****** #
                        input_feature = input_feature_temp
                        st.text(" ")
                        ir_res = imbalnce_ratio(dataset, target_data)
                        ir_res = "Imbalance Ratio (" + ir_res + ")"
                        st.subheader("Select below option to Handle Imbalanced Dataset (optional)")
                        st.text(ir_res)
                        balance_data = st.selectbox(label=' ', options=["Auto", "False"])
                        #***********************************************************#
                        # ********* Data Cleaning and Model Building code goes here *********#
                        st.text(" ")
                        if st.checkbox("Start Build model") and input_feature != target_data:
                            model_info = model_train_local(dataset, input_feature, target_data, balance_data)
                            #************ Model Result ***************#
                            for key, val in model_info.items():
                                st.text(" ")
                                if key == "Classification Report":
                                    st.write(key)
                                    st.text(val)
                                elif key == "model_comparison" or key == "ROC Curve":
                                    st.write(key)
                                    st.plotly_chart(val, config={'displaylogo': False})
                                elif key == "Handling Imbalanced Dataset":
                                    st.write(key)
                                    for key1, val1 in val.items():
                                        st.write(key1)
                                        st.text(val1)
                                elif key == "Best pipeline" or key == "tfidf_vector":
                                    pass
                                else:
                                    st.write(key)
                                    st.write(val)
                            #***********************************************************#
                            # ****************** Prediction ******************* #
                            # Two side-by-side expanders: free-text prediction and
                            # batch CSV prediction.
                            c1, c2 = st.beta_columns(2)
                            exp1 = c1.beta_expander("Prediction on text data")
                            exp2 = c2.beta_expander("Prediction on csv data")
                            form_predict = exp1.form("predict")
                            text_val = form_predict.text_area("Enter text for prediction")
                            if form_predict.form_submit_button("Predict") and text_val != "":
                                prediction = predict_text(text_val, model_info["Best pipeline"], model_info["tfidf_vector"])
                                prediction = "Result :" + str(prediction[0])
                                form_predict.write(prediction)
                            f_up = exp2.file_uploader("predict_csv", type=["csv"])
                            if f_up and exp2.button("Predict"):
                                df = pd.read_csv(f_up, encoding='ISO-8859-1')
                                df_copy = df.copy()
                                predictions = predict_csv(df_copy, model_info["Best pipeline"], model_info["tfidf_vector"], input_feature)
                                predict_column_name = target_data + "_prediction"
                                df[predict_column_name] = predictions
                                # NOTE(review): download-link markup appears
                                # stripped from these f-strings — confirm.
                                csv = df.to_csv(index=False)
                                b64 = base64.b64encode(csv.encode()).decode()
                                href = f'Download Predicted file'
                                exp2.markdown(href, unsafe_allow_html=True)
                                output_model = pickle.dumps(model_info["Best pipeline"])
                                b64 = base64.b64encode(output_model).decode()
                                href = f'Download Best Model .pkl File '
                                exp2.markdown(href, unsafe_allow_html=True)
                                print("completed")
                        elif target_data == input_feature:
                            st.error("Input feature and target data cannot be same")
                    except Exception as e:
                        st.error(e)
                        st.error("Something went wrong")
                # ****************************************************** #
                # *********************** End of NLP text Classification Section *************************************#
            # ************************* End of Supervised Section **************************************************#
            # **************************** UnSupervised Section (In Progress) ********************************** #
            elif exp_type == "UnSupervised":
                st.subheader("UnSupervised")
                # ************ Data Analysis Code goes here ********** #
                # NOTE(review): this analysis block duplicates the Supervised one
                # verbatim — a candidate for extraction into a helper.
                dataset = pd.read_csv(file_upload)
                final_output = get_details_local(dataset)
                cols = dataset.columns.tolist()
                # print(dataset)
                st.text(" ")
                first = dataset.head(10)
                # last = dataset.tail(10)
                if st.button("Click here to Analyze Data"):
                    container = st.beta_expander("Data Analysis and visualization Details")
                    # c1,c2=container.beta_columns(2)
                    container.subheader("First 10 Rows")
                    container.write(first)
                    # c2.subheader("Last 10 Rows")
                    # c2.write(last)
                    container.text(" ")
                    overview_con = container.beta_container()
                    overview_con.subheader("Overview of Dataset")
                    overview_con.text(" ")
                    ov_c1, ov_c2, ov_c3 = overview_con.beta_columns(3)
                    ov_c1.write("Statistics")
                    for key, value in final_output['overview']['data_statistics'].items():
                        temp = str(key) + ": " + str(value)
                        ov_c1.text(temp)
                    ov_c2.write("Variable Info")
                    for key, value in final_output['overview']['variable_type'].items():
                        temp = str(key) + ": " + str(value)
                        ov_c2.text(temp)
                    ov_c3.write("Reproduction")
                    for key, value in final_output['reproduction'].items():
                        temp = str(key) + ": " + str(value)
                        ov_c3.text(temp)
                    container.text(" ")
                    numeric_con = container.beta_container()
                    numeric_con.subheader("Numeric Variable Information")
                    numeric_con.text(" ")
                    for key, value in final_output['numerical_variable_info']['variable_info'].items():
                        numeric_con.text(" ")
                        temp_key = "Numeric Column:" + str(key)
                        numeric_con.write(temp_key)
                        num_c1, num_c2, num_c3, num_c4 = numeric_con.beta_columns(4)
                        i = 1
                        for key1, value1 in value.items():
                            temp = str(key1) + ": " + str(value1)
                            if (i <= 7):
                                num_c1.text(temp)
                            elif (i > 7 and i <= 14):
                                num_c2.text(temp)
                            elif (i > 14 and i <= 21):
                                num_c3.text(temp)
                            elif i > 21 and i <= 24:
                                num_c4.text(temp)
                            elif i > 24:
                                numeric_con.plotly_chart(value1, config={'displaylogo': False})
                            i = i + 1
                    container.text(" ")
                    categorical_con = container.beta_container()
                    categorical_con.subheader("Categorical Variable Information")
                    categorical_con.text(" ")
                    for key, value in final_output['categorical_variable_info']['variable_info'].items():
                        categorical_con.text(" ")
                        temp_key = "Categorical Column:" + str(key)
                        categorical_con.write(temp_key)
                        num_c1, num_c2, num_c3, num_c4 = categorical_con.beta_columns(4)
                        i = 1
                        for key1, value1 in value.items():
                            temp = str(key1) + ": " + str(value1)
                            if (i <= 5):
                                num_c1.text(temp)
                            elif (i > 5 and i <= 10):
                                num_c2.text(temp)
                            elif (i > 10 and i <= 15):
                                num_c3.text(temp)
                            elif i > 15 and i <= 16:
                                num_c4.text(temp)
                            elif i > 16:
                                categorical_con.plotly_chart(value1, config={'displaylogo': False})
                            i = i + 1
                    container.text(" ")
                    container.text("Scatter chart Matrix")
                    container.plotly_chart(final_output['scatter_chart_matrix'],config = {'displaylogo': False})
                    container.text(" ")
                    container.text(" ")
                    corr_con = container.beta_container()
                    corr_con.subheader("Correlation Matrix Information")
                    corr_con.text(" ")
                    # corr_c1, corr_c2, corr_c3 = corr_con.beta_columns(3)
                    # j = 0
                    for key1, value1 in final_output['correlation_matrix_info'].items():
                        corr_con.text(" ")
                        corr_con.write(key1)
                        # col.pyplot(value)
                        corr_con.plotly_chart(value1, config={'displaylogo': False})
                        # col.plotly_chart(value1,use_container_width=True)
                        # j=j+1
                    container.text(" ")
                    missing_con = container.beta_container()
                    missing_con.subheader("Missing Values Information")
                    missing_con.text(" ")
                    mis_c1, mis_c2 = missing_con.beta_columns(2)
                    mis_c3, mis_c4 = missing_con.beta_columns(2)
                    k = 0
                    for key, value in final_output['missing_values_info'].items():
                        corr_con.text(" ")
                        col = mis_c1
                        if k == 0:
                            col = mis_c1
                        elif k == 1:
                            col = mis_c2
                        elif k == 2:
                            col = mis_c3
                        elif k == 3:
                            col = mis_c4
                        col.write(key)
                        col.pyplot(value)
                        k = k + 1
                # ********************************************************#
                # *********** Selecting Model for clustering ***********#
                st.subheader("Select the Model")
                model = st.selectbox(label=' ', options=['Select', 'KMeans'])
                #********************************************************#
                # ******* Data cleaning and checking with elbow technique using Kmeans clustering *******#
                if model == "KMeans":
                    st.text(" ")
                    st.subheader("choose the features which you want to drop")
                    drop_features = st.multiselect('', cols)
                    st.text(" ")
                    cleaned_data, steps_dict = clean(dataset, drop_features)
                    sample_data = cleaned_data.head()
                    info['clean_data'] = sample_data
                    info['auto_drop'] = steps_dict['auto_drop']
                    val1 = k_means(dataset, cols, drop_features, sample_data)
                    st.write("Elbow-Curve")
                    st.plotly_chart(val1, config={'displaylogo': False})
                    # st.write("Silhouette-Score")
                    # st.plotly_chart(val2, config={'displaylogo': False})
                # ******************************************************************************* #
            else:
                # 'Select' placeholder (or any other value): nothing to do.
                pass
            # **************************** End of UnSupervised Section ********************************** #
    except Exception as e:
        # Last-resort handler: surface the exception on the page itself.
        st.header(e)


if __name__ == '__main__':
    main()