#!/usr/bin/env python
# coding: utf-8
# In[28]:
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import missingno as msno
import statistics
import plotly
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
import nlplot
# #### Profiling the Data set #########
# In[2]:
def get_details(data):
    """Profile a DataFrame: overview stats, per-variable insights, correlations,
    and missing-value visualizations, returned as one nested dict."""
    try:
        correlation_matrix_info = {}
        missing_values_info = {}
        print("started")
        s_time = datetime.datetime.now()
        data_columns = data.columns.tolist()
# #####################################
#
# ########## Types of variable ############
# In[3]:
        num_data = data.select_dtypes(include=np.number)     # numeric data
        num_data_col = num_data.columns.tolist()             # numeric column names
        cat_data = data.select_dtypes(include=['object'])    # categorical data
        cat_data_col = cat_data.columns.tolist()             # categorical column names
        bool_data = data.select_dtypes(include=['bool'])     # boolean data
        bool_data_col = bool_data.columns.tolist()           # boolean column names
        unsupported_data = data.select_dtypes(exclude=['number', 'bool', 'object'])
# ##########################################################################################
#
# ################################### No of columns #########################################
# In[4]:
        column = data.columns
        col_length = len(column)    # number of variables
        row_length = len(data)      # number of observations
        total_cells = col_length * row_length
# ############################################################################################
#
# ################################ Missing cell and % #########################################
# In[5]:
        missing_values = np.where(pd.isnull(data))
        no_of_missing_values = len(missing_values[0])                    # number of missing cells
        missing_value_per = (no_of_missing_values / total_cells) * 100   # missing cells (%)
# #############################################################################################
#
# ################################# duplicate rows and % #######################################
# In[6]:
        duplicate = data[data.duplicated()]
        duplicate_rows = len(duplicate)
        dup_row_per = (duplicate_rows / row_length) * 100   # duplicate rows (%)
# ###############################################################################################
#
# #################################### Memory usage ###############################################
# In[7]:
        memory_usage = data.memory_usage(deep=True).sum()
        memory_usage_MB = memory_usage / 1024 ** 2             # total size in memory (MiB)
        avg_memory_usage = data.memory_usage(deep=True).mean()
        avg_memory_usage_MB = avg_memory_usage / 1024 ** 2     # average per-column size in memory (MiB)
        # #################################################################################################
        print("Overview Completed")
# ####################################### General Insights of Numeric Variable ##########################################
#
# In[8]:
        num_variable = {}
        for col in num_data_col:
            val = {}
            distinct_val = data[col].nunique()
            val['distinct'] = int(distinct_val)
            total_count = len(data[col])
            distinct_per = (distinct_val / total_count) * 100
            val['distinct_percent'] = str(distinct_per) + "%"
            null = data[col].isnull().sum()
            val['missing'] = int(null)
            percent_missing = (null / total_count) * 100
            val['missing_percent'] = str(percent_missing) + "%"
            zeros_in_col = (data[col] == 0).sum()
            val['zeros'] = int(zeros_in_col)
            zero_percent = (zeros_in_col / total_count) * 100
            val['zero_percent'] = str(zero_percent) + "%"
            mean = data[col].mean()
            val['mean'] = float(mean)
            mini = data[col].min()
            val['minimum'] = str(mini)
            median = data[col].median()
            val['median'] = str(median)
            maxi = data[col].max()
            val['maximum'] = str(maxi)
            infinite = np.isinf(data[col]).values.sum()
            val['infinite'] = int(infinite)
            infinite_percent = infinite * 100 / total_count
            val['infinite_percent'] = str(infinite_percent) + "%"
            # nan-aware percentiles so missing values do not poison the quantiles
            percent5 = np.nanpercentile(data[col], 5)
            val['5th_percentile'] = str(percent5)
            percent95 = np.nanpercentile(data[col], 95)
            val['95th_percentile'] = str(percent95)
            range1 = maxi - mini
            val['range'] = str(range1)
            q1 = np.nanpercentile(data[col], 25)
            val['q1'] = str(q1)
            q3 = np.nanpercentile(data[col], 75)
            val['q3'] = str(q3)
            iqr = q3 - q1
            val['iqr'] = str(iqr)
            standard_deviation = data[col].std()   # nan-aware, unlike statistics.stdev
            val['standard_deviation'] = str(standard_deviation)
            val['skewness'] = str(data[col].skew())
            val['kurtosis'] = str(data[col].kurtosis())
            val['sum'] = str(data[col].sum())
            val['variance'] = str(data[col].var())
            # val['coefficient_of_variation'] = str(standard_deviation / mean)
            val['monotonicity'] = str(bool(data[col].is_monotonic_increasing
                                           or data[col].is_monotonic_decreasing))
            fig = px.histogram(data, x=col)
            fig.update_layout(bargap=0.2)
            val['visual_path'] = fig
            out_fig = px.box(data, x=col)
            val['outlier_img'] = out_fig
            num_variable[col] = val
        #########################################################################################################################
        print("Numeric Variable Completed")
####################################### General Insights of Categorical Variable ##########################################
# In[9]:
        cat_variable = {}
        for col in cat_data_col:
            val = {}
            distinct_val = data[col].nunique()
            total_count = len(data[col])
            distinct_per = (distinct_val / total_count) * 100
            val['distinct'] = int(distinct_val)
            val['distinct_percent'] = str(round(distinct_per, 5)) + "%"
            missing_val_count = int(data[col].isnull().sum())
            missing_value_per = (missing_val_count / total_count) * 100
            val['missing'] = missing_val_count
            val['missing_percent'] = str(round(missing_value_per, 5)) + "%"
            memory_usage_col = data[col].memory_usage(deep=True)
            memory_usage_col_MB = memory_usage_col / 1024 ** 2
            val['memory'] = str(round(memory_usage_col_MB, 5)) + " MiB"
            # string-length statistics over the non-null values
            length_result = data[col].dropna().astype(str).str.len()
            val['max_length'] = int(length_result.max())
            val['median_length'] = int(length_result.median())
            val['mean_length'] = float(length_result.mean())
            val['min_length'] = int(length_result.min())
            val['total_character'] = int(length_result.sum())
            # a set union is linear in total characters; the original list scan was quadratic
            distinct_chars = set()
            for item in data[col]:
                if isinstance(item, str):
                    distinct_chars.update(item)
            val['distinct_character'] = int(len(distinct_chars))
            val['distinct_categories'] = ""
            val['distinct_blocks'] = "??"
            val['distinct_scripts'] = "??"
            val['unique'] = "??"
            val['unique_percent'] = "??"
            fig = px.histogram(data, y=col)
            val['visual_path'] = fig
            cat_variable[col] = val
        # ####################################################################################################
        print("Categorical Variable Completed")
        # ################################ Scatter plot for the dataset #################################
        sc_fig = px.scatter_matrix(data)
        # ############################ Correlation matrix visualization #################################
        # Pearson
        pearsoncorr = num_data.corr(method='pearson')
        correlation_matrix_info['pearsons'] = go.Figure(data=[
            go.Heatmap(z=pearsoncorr,
                       x=pearsoncorr.columns,
                       y=pearsoncorr.columns)
        ])
        # Spearman
        spearmancorr = num_data.corr(method='spearman')
        correlation_matrix_info['spearmans'] = go.Figure(data=[
            go.Heatmap(z=spearmancorr,
                       x=spearmancorr.columns,
                       y=spearmancorr.columns)
        ])
        # Kendall
        kendallcorr = num_data.corr(method='kendall')
        correlation_matrix_info['kendall'] = go.Figure(data=[
            go.Heatmap(z=kendallcorr,
                       x=kendallcorr.columns,
                       y=kendallcorr.columns)
        ])
        # ################################################################################################
        # ############################################ Missing values ###################################
        # Count: msno.bar draws on its own figure, so capture it from the returned Axes
        fig1 = msno.bar(data, figsize=(20, 20), color="dodgerblue").get_figure()
        missing_values_info['count'] = fig1
        plt.close(fig1)
        # Matrix
        fig2 = msno.matrix(data, color=(0.27, 0.52, 1.0)).get_figure()
        missing_values_info['matrix'] = fig2
        plt.close(fig2)
        # Heatmap
        fig3 = msno.heatmap(data).get_figure()
        missing_values_info['heatmap'] = fig3
        plt.close(fig3)
        # Dendrogram
        fig4 = msno.dendrogram(data).get_figure()
        missing_values_info['dendrogram'] = fig4
        plt.close(fig4)
        # ################################################################################################
###################################################################
        f_time = datetime.datetime.now()
        duration = f_time - s_time
        # assemble the final report
        final_output = {}
        overview = {}
        reproduction = {}
        numerical_variable_info = {}
        categorical_variable_info = {}
        data_statistics = {}
        variable_type = {}
        data_statistics['number_of_variables'] = int(col_length)
        data_statistics['number_of_observations'] = int(row_length)
        data_statistics['no_of_missing_cells'] = int(no_of_missing_values)
        data_statistics['missing_cell_percent'] = str(round(missing_value_per, 5)) + "%"
        data_statistics['duplicate_rows'] = int(duplicate_rows)
        data_statistics['duplicate_rows_percent'] = str(round(dup_row_per, 5)) + "%"
        data_statistics['total_size_in_memory'] = str(round(memory_usage_MB, 5)) + " MiB"
        data_statistics['average_memory_usage'] = str(round(avg_memory_usage_MB, 5)) + " MiB"
        variable_type['numeric_column'] = int(len(num_data_col))
        variable_type['categorical_column'] = int(len(cat_data_col))
        variable_type['boolean_column'] = int(len(bool_data_col))
        overview['data_statistics'] = data_statistics
        overview['variable_type'] = variable_type
        reproduction['analysis_started'] = str(s_time)
        reproduction['analysis_finished'] = str(f_time)
        reproduction['duration'] = str(duration)
        reproduction['software_version'] = "??"
        reproduction['download_configuration'] = "??"
        numerical_variable_info['variable_info'] = num_variable
        categorical_variable_info['variable_info'] = cat_variable
        final_output['overview'] = overview
        final_output['reproduction'] = reproduction
        final_output['numerical_variable_info'] = numerical_variable_info
        final_output['categorical_variable_info'] = categorical_variable_info
        final_output['scatter_chart_matrix'] = sc_fig
        final_output['correlation_matrix_info'] = correlation_matrix_info
        final_output['missing_values_info'] = missing_values_info
        return final_output
    except Exception as e:
        # surface the failure instead of swallowing it silently
        print("get_details failed:", e)
        return None
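# Hypothetical usage sketch (illustrative, not part of the original module): profile a
# small synthetic DataFrame with get_details and print the top-level report keys.
# Assumes the stack imported above (plotly, missingno, etc.) is installed; the column
# names and values below are made up.
def _demo_get_details():
    demo_df = pd.DataFrame({
        "age": [25, 32, 47, 51, 62, 23, 40, 40],
        "income": [40000, None, 61000, 75000, None, 39000, 58000, 58000],
        "city": ["NY", "LA", None, "SF", "LA", "SF", "NY", "NY"],
    })
    report = get_details(demo_df)
    if report is not None:
        print(list(report.keys()))
        print(report["overview"]["data_statistics"])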
# ############### Returns the imbalance ratio of the dataset ##################
def imbalance_ratio(dataset, target):
    """Return the class ratio of `target` scaled to a base of 10, e.g. "6/a : 3/b : 1/c"."""
    val = ""
    if dataset[target].nunique() <= 10:
        dt = dataset[target].value_counts()
        ln = len(dt)
        for i in range(0, ln):
            # positional access: the value_counts index holds the class labels,
            # so label-based dt[i] would break for non-integer categories
            ir_cal = round(dt.iloc[i] / len(dataset) * 10, 1)
            if ir_cal.is_integer():
                ir_cal = int(ir_cal)
            val = val + str(ir_cal) + "/" + str(dt.index[i])
            if i != (ln - 1):
                val = val + " : "
    return val
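# Hypothetical example (illustrative only): with ten rows split 6/3/1 across three
# classes, imbalance_ratio scales each class share to a base of 10 and joins the
# parts with " : ", so the call below prints "6/a : 3/b : 1/c".
def _demo_imbalance_ratio():
    demo_df = pd.DataFrame({"label": ["a"] * 6 + ["b"] * 3 + ["c"]})
    print(imbalance_ratio(demo_df, "label"))  # 6/a : 3/b : 1/c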
###################################################################
# ########### Returns a Plotly figure that visualizes a text column as a word cloud ############
def word_cloud(dataset, column):
    if column == "Select":
        return None
    else:
        comment_words = ' '
        wc = WordCloud(stopwords=set(STOPWORDS),
                       max_words=200,
                       max_font_size=100)
        for val in dataset[column]:
            # cast each value to string, split it, and append the lowercased tokens
            tokens = str(val).split()
            for word in tokens:
                comment_words = comment_words + word.lower() + ' '
        wc.generate(comment_words)
word_list = []
freq_list = []
fontsize_list = []
position_list = []
orientation_list = []
color_list = []
for (word, freq), fontsize, position, orientation, color in wc.layout_:
word_list.append(word)
freq_list.append(freq)
fontsize_list.append(fontsize)
position_list.append(position)
orientation_list.append(orientation)
color_list.append(color)
        # collect the word placement coordinates
        x = []
        y = []
        for i in position_list:
            x.append(i[0])
            y.append(i[1])
        # scale the relative occurrence frequencies for use as font sizes
        new_freq_list = []
        for i in freq_list:
            new_freq_list.append(i * 100)
trace = go.Scatter(x=x,
y=y,
textfont=dict(size=new_freq_list,
color=color_list),
hoverinfo='text',
hovertext=['{0} {1:.2f} %'.format(w, f) for w, f in zip(word_list, new_freq_list)],
mode='text',
text=word_list
)
layout = go.Layout({'xaxis': {'showgrid': False, 'showticklabels': False, 'zeroline': False},
'yaxis': {'showgrid': False, 'showticklabels': False, 'zeroline': False}})
fig = go.Figure(data=[trace], layout=layout)
return fig
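# Hypothetical usage sketch (illustrative only): render the Plotly word cloud for a
# small, made-up review column; word sizes are driven by relative frequency.
def _demo_word_cloud():
    demo_df = pd.DataFrame({"review": [
        "great product great value",
        "poor quality poor support",
        "great support fast delivery",
    ]})
    fig = word_cloud(demo_df, "review")
    if fig is not None:
        fig.show()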
###############################################################################
########### Returns a histogram of the target feature for NLP text classification ############
def plotly_target(dataset, column):
if column == "Select":
return None
else:
fig = px.histogram(dataset, y=column)
fig.update_layout(bargap=0.2)
return fig
############################################################################################################
############ Plotting n-gram for text feature in NLP Text Classification ###########################
def plot_ngram(dataset, input_col):
    if input_col == 'Select':
        return None
    else:
        # work on a copy so the caller's DataFrame is not mutated, and cast to str
        # so missing values cannot break the lowercasing step
        train = dataset.copy()
        train[input_col] = train[input_col].astype(str).str.lower()
        npt = nlplot.NLPlot(train, target_col=input_col)
        stopwords = npt.get_stopword(top_n=30, min_freq=0)
fig = npt.bar_ngram(
title='bi-gram',
xaxis_label='word_count',
yaxis_label='word',
ngram=2,
top_n=50,
width=700,
height=1100,
stopwords=stopwords,
)
return fig
#################################################################################################
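# ############################ Hypothetical demo entry point ############################
# Illustrative only: runs the sketches above on synthetic data when the module is
# executed directly. plot_ngram is not exercised here because nlplot's bi-gram plot
# needs a larger corpus to be meaningful.
if __name__ == "__main__":
    _demo_get_details()
    _demo_imbalance_ratio()
    _demo_word_cloud()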