#!/usr/bin/env python
# coding: utf-8

# In[28]:

import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import missingno as msno
import statistics
import plotly
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
import nlplot


# #### Reading the Data set #########

# In[2]:


def get_details(data):
    try:
        correlation_matrix_info = {}
        missing_values_info = {}
        print("started")
        s_time = datetime.datetime.now()
        data_columns = data.columns.tolist()

        # ########## Types of variable ############

        # In[3]:

        num_data = data.select_dtypes(include=np.number)                        # numeric data
        num_data_col = data.select_dtypes(include=np.number).columns.tolist()   # numeric column names
        # print("numeric column", len(num_data_col))
        cat_data = data.select_dtypes(include=['object'])                       # categorical data
        cat_data_col = data.select_dtypes(include=['object']).columns.tolist()  # categorical column names
        # print("Categorical column", len(cat_data_col))
        bool_data = data.select_dtypes(include=["bool_"])                       # boolean data
        bool_data_col = data.select_dtypes(include=["bool_"]).columns.tolist()  # boolean column names
        # print("Boolean column", len(bool_data_col))
        unsupported_data = data.select_dtypes(exclude=["number", "bool_", "object_"])

        # ################################### No of columns #########################################

        # In[4]:

        column = data.columns
        col_length = len(column)   # number of variables
        row_length = len(data)     # number of observations
        # print("Number of variables ", col_length)
        # print("Number of observations ", row_length)
        total_cells = col_length * row_length

        # ################################ Missing cell and % #########################################

        # In[5]:

        missing_values = np.where(pd.isnull(data))
        no_of_missing_values = len(missing_values[0])                    # number of missing cells
        missing_value_per = (no_of_missing_values / total_cells) * 100   # missing cell %
        # print("no of missing cells ", no_of_missing_values)
        # print("missing cell(%) ", missing_value_per, "%")

        # ################################# duplicate rows and % #######################################

        # In[6]:

        duplicate = data[data.duplicated()]
        duplicate_rows = len(duplicate)
        dup_row_per = (duplicate_rows / row_length) * 100
        # print("Duplicate rows ", duplicate_rows)
        # print("Duplicate rows (%) ", dup_row_per, "%")

        # #################################### Memory usage ###############################################

        # In[7]:

        memory_usage = data.memory_usage(deep=True).sum()
        memory_usage_MB = memory_usage / 1024 ** 2
        # print("Total size in memory ", memory_usage_MB, "MiB")
        avg_memory_usage = data.memory_usage(deep=True).mean()
        avg_memory_usage_MB = avg_memory_usage / 1024 ** 2
        # print("Average record size in memory ", avg_memory_usage_MB, "MiB")

        print("Overview Completed")

        # ####################################### General Insights of Numeric Variable ##########################################

        # In[8]:

        num_variable = {}
        for col in num_data_col:
            val = {}
            distinct_val = data[col].nunique()
            val['distinct'] = int(distinct_val)
            total_count = len(data[col])
            distinct_per = (distinct_val / total_count) * 100
            val['distinct_percent'] = str(distinct_per) + "%"
            null = data[col].isnull().sum()
            val['missing'] = int(null)
            percent_missing = data[col].isnull().sum() * 100 / len(data[col])
            val['missing_percent'] = str(percent_missing) + "%"
            zeros_in_col = (data[col] == 0).sum()
            val['zeros'] = int(zeros_in_col)
            zero_percent = (zeros_in_col / total_count) * 100
            val['zero_percent'] = str(zero_percent) + "%"
            mean = data[col].mean()
            val['mean'] = float(mean)
            mini = data[col].min()
            val['minimum'] = str(mini)
            median = data[col].median()
            val['median'] = str(median)
            maxi = data[col].max()
            val['maximum'] = str(maxi)
            # infinite = df[col].isin([np.inf, -np.inf])
            infinite = np.isinf(data[col]).values.sum()
            val['infinite'] = int(infinite)
            infinite_percent = infinite * 100 / len(data[col])
            val['infinite_percent'] = str(infinite_percent) + "%"
            percent5 = np.percentile(data[col], 5)
            val['5th_percentile'] = str(percent5)
            percent95 = np.percentile(data[col], 95)
            val['95th_percentile'] = str(percent95)
            range1 = maxi - mini
            val['range'] = str(range1)
            q1 = np.percentile(data[col], 25)
            val['q1'] = str(q1)
            q3 = np.percentile(data[col], 75)
            val['q3'] = str(q3)
            iqr = q3 - q1
            val['iqr'] = str(iqr)
            standard_deviation = statistics.stdev(data[col])
            val['standard_deviation'] = str(standard_deviation)
            val['skewness'] = str(data[col].skew())
            val['kurtosis'] = str(data[col].kurtosis())
            val['sum'] = str(data[col].sum())
            val['variance'] = str(data[col].var())
            cv = standard_deviation / mean  # coefficient of variation
            # val['co-efficient_variance'] = str(cv)
            # monotonic if the column never decreases or never increases
            val['monotocity'] = str(
                all(data[col].iloc[i] <= data[col].iloc[i + 1] for i in range(len(data[col]) - 1))
                or all(data[col].iloc[i] >= data[col].iloc[i + 1] for i in range(len(data[col]) - 1))
            )
            # fig, ax = px.subplots(figsize=(10, 10))
            fig = px.histogram(data, x=col)
            fig.update_layout(bargap=0.2)
            # fig.update_layout(width=25, height=25)
            val['visual_path'] = fig
            out_fig = px.box(data, x=col)
            val['outlier_img'] = out_fig
            # st.plotly_chart(fig)
            # px.close(fig)
            num_variable[col] = val

        # ########################################################################################################################

        print("Numeric Variable Completed")

        # ####################################### General Insights of Categorical Variable ##########################################

        # In[9]:

        cat_variable = {}
        for col in cat_data_col:
            val = {}
            distinct_val = data[col].nunique()
            total_count = len(data[col])
            distinct_per = (distinct_val / total_count) * 100
            val['distinct'] = int(distinct_val)
            val['distinct_percent'] = str(round(distinct_per, 5)) + "%"
            missing_val = np.where(pd.isnull(data[col]))
            missing_val_count = len(missing_val[0])
            # kept separate from the dataset-level missing_value_per used later in data_statistics
            missing_value_per_col = (missing_val_count / total_count) * 100
            val['missing'] = int(missing_val_count)
            val['missing_percent'] = str(round(missing_value_per_col, 5)) + "%"
            memory_usage_col = data[col].memory_usage(deep=True)
            memory_usage_col_MB = memory_usage_col / 1024 ** 2
            val['memory'] = str(round(memory_usage_col_MB, 5)) + " MiB"
            measurer = np.vectorize(len)
            temp_df1 = data[col].dropna()
            length_result = measurer(temp_df1.values.astype(str))
            val['max_length'] = int(length_result.max())
            val['median_length'] = int(np.median(length_result))
            val['mean_length'] = float(length_result.mean())
            val['min_length'] = int(length_result.min())
            temp_df = pd.DataFrame(data[col].str.len())
            val['total_character'] = int(temp_df.sum())
            lst = []
            for i in data[col]:
                if type(i) == str:
                    l = list(set(i))
                    for j in l:
                        if j not in lst:
                            lst.append(j)
            val['distinct_character'] = int(len(lst))
            val['distinct_categories'] = ""
            val['distinct_blocks'] = "??"
            val['distinct_scripts'] = "??"
            val['unique'] = "??"
            val['unique_percent'] = "??"
            # fig = plt.figure()
            fig = px.histogram(data, y=col)
            # fig.update_layout(width=25, height=25)
            val['visual_path'] = fig
            # px.close(fig)
            cat_variable[col] = val

        # ####################################################################################################

        print("Categorical Variable Completed")

        # ##### Scatter Plot for dataset ##########
        sc_fig = px.scatter_matrix(data)
        # #########################################

        # ################# Correlation matrix Visualization #############################

        # ################## pearson #############################
        pearsoncorr = num_data.corr(method='pearson')
        fig = go.Figure(data=[
            go.Heatmap(
                z=pearsoncorr,
                x=pearsoncorr.columns,
                y=pearsoncorr.columns)
        ])
        correlation_matrix_info['pearsons'] = fig
        # ##########################################################

        # ################## spearman's #############################
        spearmancorr = num_data.corr(method='spearman')
        fig = go.Figure(data=[
            go.Heatmap(
                z=spearmancorr,
                x=spearmancorr.columns,
                y=spearmancorr.columns)
        ])
        correlation_matrix_info['spearmans'] = fig
        # ###########################################################

        # ################# kendall's #############################
        kendallcorr = num_data.corr(method='kendall')
        fig = go.Figure(data=[
            go.Heatmap(
                z=kendallcorr,
                x=kendallcorr.columns,
                y=kendallcorr.columns)
        ])
        correlation_matrix_info['kendall'] = fig
        # #######################################################

        # ######################################################################################################################

        # ############################################### Missing Values ####################################################

        # #################### Count ################
        fig1 = msno.bar(data, figsize=(20, 20), color="dodgerblue")
        fig_1 = fig1.get_figure()
        missing_values_info['count'] = fig_1
        plt.close(fig_1)
        # ###########################################

        # ################## Matrix ##################
        fig2 = msno.matrix(data, color=(0.27, 0.52, 1.0))
        fig_2 = fig2.get_figure()
        missing_values_info['matrix'] = fig_2
        plt.close()
        # #############################################

        # ################ heatmap ################
        fig3 = msno.heatmap(data)
        fig_3 = fig3.get_figure()
        missing_values_info['heatmap'] = fig_3
        plt.close()
        # #############################################

        # ############## dendrogram ##################
        fig4 = msno.dendrogram(data)
        fig_4 = fig4.get_figure()
        missing_values_info['dendrogram'] = fig_4
        plt.close()
        # ################################################

        # ###################################################################

        f_time = datetime.datetime.now()
        duration = f_time - s_time

        final_output = {}
        overview = {}
        reproduction = {}
        numerical_variable_info = {}
        categorical_variable_info = {}
        data_statistics = {}
        variable_type = {}

        data_statistics['number_of_variables'] = int(col_length)
        data_statistics['number_of_observations'] = int(row_length)
        data_statistics['no_of_missing_cells'] = int(no_of_missing_values)
        data_statistics['missing_cell_percent'] = str(round(missing_value_per, 5)) + "%"
        data_statistics['duplicate_rows'] = int(duplicate_rows)
        data_statistics['duplicate_rows_percent'] = str(round(dup_row_per, 5)) + "%"
        data_statistics['total_size_in_memory'] = str(round(memory_usage_MB, 5)) + "MiB"
        data_statistics['average_memory_Usage'] = str(round(avg_memory_usage_MB, 5)) + "MiB"

        variable_type['numeric_column'] = int(len(num_data_col))
        variable_type['categorical_column'] = int(len(cat_data_col))
        variable_type['boolean_column'] = int(len(bool_data_col))

        overview['data_statistics'] = data_statistics
        overview['variable_type'] = variable_type

        reproduction['analysis_started'] = str(s_time)
        reproduction['analysis_finished'] = str(f_time)
        reproduction['duration'] = str(duration)
        reproduction['software_version'] = "??"
        reproduction['download_configuration'] = "??"

        numerical_variable_info['variable_info'] = num_variable
        categorical_variable_info['variable_info'] = cat_variable

        # ################## Main Functions ######################################
        final_output['overview'] = overview
        final_output['reproduction'] = reproduction
        final_output['numerical_variable_info'] = numerical_variable_info
        final_output['categorical_variable_info'] = categorical_variable_info
        final_output['scatter_chart_matrix'] = sc_fig
        final_output['correlation_matrix_info'] = correlation_matrix_info
        final_output['missing_values_info'] = missing_values_info
        # #######################################################################

        return final_output

    except Exception as e:
        # exc_type, exc_obj, exc_tb = sys.exc_info()
        # fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        # any failure during profiling is swallowed and signalled with None
        return None


# ############### Returns the imbalance ratio of the dataset's target column ##################
def imbalnce_ratio(dataset, target):
    val = ""
    if dataset[target].nunique() <= 10:
        dt = dataset[target].value_counts()
        ln = len(dt)
        for i in range(0, ln):
            # class share scaled to 10, e.g. "7/ClassA : 3/ClassB"
            ir_cal = round(dt.iloc[i] / len(dataset) * 10, 1)
            category = "/" + str(dt.index[i])
            if ir_cal.is_integer():
                val = val + str(int(ir_cal))
                val = val + category
            else:
                val = val + str(ir_cal)
                val = val + category
            if i != (ln - 1):
                val = val + " : "
    return val
# ###################################################################


# ########### Returns a figure which visualizes the text feature as a word cloud ############
def word_cloud(dataset, column):
    if column == "Select":
        pass
    else:
        comment_words = ' '
        wc = WordCloud(stopwords=set(STOPWORDS), max_words=200, max_font_size=100)
        for val in dataset[column]:
            # typecast each val to string
            val = str(val)
            # split the value
            tokens = val.split()
            # convert each token into lowercase
            for i in range(len(tokens)):
                tokens[i] = tokens[i].lower()
            for words in tokens:
                comment_words = comment_words + words + ' '
        wc.generate(comment_words)

        word_list = []
        freq_list = []
        fontsize_list = []
        position_list = []
        orientation_list = []
        color_list = []
        for (word, freq), fontsize, position, orientation, color in wc.layout_:
            word_list.append(word)
            freq_list.append(freq)
            fontsize_list.append(fontsize)
            position_list.append(position)
            orientation_list.append(orientation)
            color_list.append(color)

        # get the positions
        x = []
        y = []
        for i in position_list:
            x.append(i[0])
            y.append(i[1])

        # get the relative occurrence frequencies
        new_freq_list = []
        for i in freq_list:
            new_freq_list.append(i * 100)

        trace = go.Scatter(x=x, y=y,
                           textfont=dict(size=new_freq_list, color=color_list),
                           hoverinfo='text',
                           hovertext=['{0} {1:.2f} %'.format(w, f) for w, f in zip(word_list, new_freq_list)],
                           mode='text',
                           text=word_list)
        layout = go.Layout({'xaxis': {'showgrid': False, 'showticklabels': False, 'zeroline': False},
                            'yaxis': {'showgrid': False, 'showticklabels': False, 'zeroline': False}})
        fig = go.Figure(data=[trace], layout=layout)
        return fig
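# Example usage (a minimal sketch; "train.csv" and the "Survived" / "Name" columns are
# illustrative assumptions, not part of this module):
#
#   df = pd.read_csv("train.csv")
#   print(imbalnce_ratio(df, "Survived"))   # e.g. "6.2/0 : 3.8/1"
#   cloud_fig = word_cloud(df, "Name")      # Plotly figure, or None when column == "Select"
#   if cloud_fig is not None:
#       cloud_fig.show()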
# ###############################################################################


# ########### Returns a figure which describes the target feature for NLP text classification ############
def plotly_target(dataset, column):
    if column == "Select":
        return None
    else:
        fig = px.histogram(dataset, y=column)
        fig.update_layout(bargap=0.2)
        return fig
# ############################################################################################################


# ############ Plotting n-gram for text feature in NLP Text Classification ###########################
def plot_ngram(dataset, input_col):
    if input_col == 'Select':
        return None
    else:
        train = dataset.copy()  # work on a copy so the caller's DataFrame is not mutated
        train[input_col] = train[input_col].apply(lambda x: x.lower())
        npt = nlplot.NLPlot(train, target_col=input_col)
        stopwords = npt.get_stopword(top_n=30, min_freq=0)
        fig = npt.bar_ngram(
            title='bi-gram',
            xaxis_label='word_count',
            yaxis_label='word',
            ngram=2,
            top_n=50,
            width=700,
            height=1100,
            stopwords=stopwords,
        )
        return fig
# #####################################################################################################
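

# ------------------------------------------------------------------------------------------
# Minimal smoke test (a sketch under the assumption that this module may be run directly;
# the synthetic DataFrame below is illustrative only and not part of the original pipeline).
if __name__ == "__main__":
    demo = pd.DataFrame({
        "age": [23, 35, 41, 35, None],
        "salary": [42000.0, None, 61000.0, 58000.0, 47000.0],
        "city": ["Pune", "Delhi", "Pune", "Delhi", "Mumbai"],
        "review": ["good product", "bad service", "good value", "average support", "good support"],
    })

    # full profiling report; get_details returns None if any step fails
    profile = get_details(demo)
    if profile is not None:
        print(profile["overview"]["data_statistics"])

    # class balance of a low-cardinality column, e.g. "4/Pune : 4/Delhi : 2/Mumbai"
    print(imbalnce_ratio(demo, "city"))

    # distribution of a candidate target column as a Plotly figure
    target_fig = plotly_target(demo, "city")
    if target_fig is not None:
        print("target histogram built")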