Tedvalson committed on
Commit
d071810
•
1 Parent(s): 3dd0be5

Add example app

Files changed (2)
  1. app.py +249 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,249 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.metrics import mean_squared_error, r2_score
+ import altair as alt
+ import time
+ import zipfile
+
+ # Page title
+ st.set_page_config(page_title='ML Model Building', page_icon='🤖')
+ st.title('🤖 ML Model Building')
+
+ with st.expander('About this app'):
+     st.markdown('**What can this app do?**')
+     st.info('This app allows users to build a machine learning (ML) model in an end-to-end workflow. In particular, the workflow encompasses data upload, data pre-processing, ML model building and post-model analysis.')
+
+     st.markdown('**How to use the app?**')
+     st.warning('To use the app, go to the sidebar and (1) select a data set and (2) adjust the model parameters using the slider widgets. This initiates the ML model building process, displays the model results and allows users to download the generated data splits and accompanying files.')
+
+     st.markdown('**Under the hood**')
+     st.markdown('Data sets:')
+     st.code('''- Drug solubility data set
+ ''', language='markdown')
+
+     st.markdown('Libraries used:')
+     st.code('''- Pandas for data wrangling
+ - Scikit-learn for building a machine learning model
+ - Altair for chart creation
+ - Streamlit for user interface
+ ''', language='markdown')
+
+
+ # Sidebar for accepting input parameters
+ with st.sidebar:
+     # Load data
+     st.header('1. Input data')
+
+     st.markdown('**1.1. Use custom data**')
+     uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
+     if uploaded_file is not None:
+         df = pd.read_csv(uploaded_file, index_col=False)
+
+     # Download example data
+     @st.cache_data
+     def convert_df(input_df):
+         return input_df.to_csv(index=False).encode('utf-8')
+     example_csv = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
+     csv = convert_df(example_csv)
+     st.download_button(
+         label="Download example CSV",
+         data=csv,
+         file_name='delaney_solubility_with_descriptors.csv',
+         mime='text/csv',
+     )
+
+     # Select example data
+     st.markdown('**1.2. Use example data**')
+     example_data = st.toggle('Load example data')
+     if example_data:
+         df = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
+
+     st.header('2. Set Parameters')
+     parameter_split_size = st.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)
+
+     st.subheader('2.1. Learning Parameters')
+     with st.expander('See parameters'):
+         parameter_n_estimators = st.slider('Number of estimators (n_estimators)', 100, 1000, 100, 100)
+         parameter_max_features = st.select_slider('Max features (max_features)', options=['all', 'sqrt', 'log2'])
+         parameter_min_samples_split = st.slider('Minimum number of samples required to split an internal node (min_samples_split)', 2, 10, 2, 1)
+         parameter_min_samples_leaf = st.slider('Minimum number of samples required to be at a leaf node (min_samples_leaf)', 1, 10, 2, 1)
+
+     st.subheader('2.2. General Parameters')
+     with st.expander('See parameters', expanded=False):
+         parameter_random_state = st.slider('Seed number (random_state)', 0, 1000, 42, 1)
+         parameter_criterion = st.select_slider('Performance measure (criterion)', options=['squared_error', 'absolute_error', 'friedman_mse'])
+         parameter_bootstrap = st.select_slider('Bootstrap samples when building trees (bootstrap)', options=[True, False])
+         parameter_oob_score = st.select_slider('Whether to use out-of-bag samples to estimate the R^2 on unseen data (oob_score)', options=[False, True])
+
+     sleep_time = st.slider('Sleep time', 0, 3, 0)
+
+ # Initiate the model building process
+ if uploaded_file or example_data:
+     with st.status("Running ...", expanded=True) as status:
+
+         st.write("Loading data ...")
+         time.sleep(sleep_time)
+
+         st.write("Preparing data ...")
+         time.sleep(sleep_time)
+         X = df.iloc[:,:-1]
+         y = df.iloc[:,-1]
+
+         st.write("Splitting data ...")
+         time.sleep(sleep_time)
+         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(100-parameter_split_size)/100, random_state=parameter_random_state)
+
+         st.write("Model training ...")
+         time.sleep(sleep_time)
+
+         # 'all' maps to max_features=None, i.e. consider every feature
+         if parameter_max_features == 'all':
+             parameter_max_features = None
+             parameter_max_features_metric = X.shape[1]
+         else:
+             parameter_max_features_metric = parameter_max_features
+
+         # Note: oob_score=True requires bootstrap=True in scikit-learn
+         rf = RandomForestRegressor(
+             n_estimators=parameter_n_estimators,
+             max_features=parameter_max_features,
+             min_samples_split=parameter_min_samples_split,
+             min_samples_leaf=parameter_min_samples_leaf,
+             random_state=parameter_random_state,
+             criterion=parameter_criterion,
+             bootstrap=parameter_bootstrap,
+             oob_score=parameter_oob_score)
+         rf.fit(X_train, y_train)
+
+         st.write("Applying model to make predictions ...")
+         time.sleep(sleep_time)
+         y_train_pred = rf.predict(X_train)
+         y_test_pred = rf.predict(X_test)
+
+         st.write("Evaluating performance metrics ...")
+         time.sleep(sleep_time)
+         train_mse = mean_squared_error(y_train, y_train_pred)
+         train_r2 = r2_score(y_train, y_train_pred)
+         test_mse = mean_squared_error(y_test, y_test_pred)
+         test_r2 = r2_score(y_test, y_test_pred)
+
+         st.write("Displaying performance metrics ...")
+         time.sleep(sleep_time)
+         parameter_criterion_string = ' '.join([x.capitalize() for x in parameter_criterion.split('_')])
+         #if 'Mse' in parameter_criterion_string:
+         #    parameter_criterion_string = parameter_criterion_string.replace('Mse', 'MSE')
+         rf_results = pd.DataFrame(['Random forest', train_mse, train_r2, test_mse, test_r2]).transpose()
+         rf_results.columns = ['Method', f'Training {parameter_criterion_string}', 'Training R2', f'Test {parameter_criterion_string}', 'Test R2']
+         # Convert objects to numerics
+         for col in rf_results.columns:
+             rf_results[col] = pd.to_numeric(rf_results[col], errors='ignore')
+         # Round to 3 digits
+         rf_results = rf_results.round(3)
+
+     status.update(label="Status", state="complete", expanded=False)
+
+     # Display data info
+     st.header('Input data', divider='rainbow')
+     col = st.columns(4)
+     col[0].metric(label="No. of samples", value=X.shape[0], delta="")
+     col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
+     col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
+     col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")
+
+     with st.expander('Initial dataset', expanded=True):
+         st.dataframe(df, height=210, use_container_width=True)
+     with st.expander('Train split', expanded=False):
+         train_col = st.columns((3,1))
+         with train_col[0]:
+             st.markdown('**X**')
+             st.dataframe(X_train, height=210, hide_index=True, use_container_width=True)
+         with train_col[1]:
+             st.markdown('**y**')
+             st.dataframe(y_train, height=210, hide_index=True, use_container_width=True)
+     with st.expander('Test split', expanded=False):
+         test_col = st.columns((3,1))
+         with test_col[0]:
+             st.markdown('**X**')
+             st.dataframe(X_test, height=210, hide_index=True, use_container_width=True)
+         with test_col[1]:
+             st.markdown('**y**')
+             st.dataframe(y_test, height=210, hide_index=True, use_container_width=True)
+
+     # Zip dataset files
+     df.to_csv('dataset.csv', index=False)
+     X_train.to_csv('X_train.csv', index=False)
+     y_train.to_csv('y_train.csv', index=False)
+     X_test.to_csv('X_test.csv', index=False)
+     y_test.to_csv('y_test.csv', index=False)
+
+     list_files = ['dataset.csv', 'X_train.csv', 'y_train.csv', 'X_test.csv', 'y_test.csv']
+     with zipfile.ZipFile('dataset.zip', 'w') as zipF:
+         for file in list_files:
+             zipF.write(file, compress_type=zipfile.ZIP_DEFLATED)
+
+     with open('dataset.zip', 'rb') as datazip:
+         btn = st.download_button(
+             label='Download ZIP',
+             data=datazip,
+             file_name="dataset.zip",
+             mime="application/octet-stream"
+         )
+
+     # Display model parameters
+     st.header('Model parameters', divider='rainbow')
+     parameters_col = st.columns(3)
+     parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
+     parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
+     parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")
+
+     # Display feature importance plot
+     importances = rf.feature_importances_
+     feature_names = list(X.columns)
+     forest_importances = pd.Series(importances, index=feature_names)
+     df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})
+
+     bars = alt.Chart(df_importance).mark_bar(size=40).encode(
+         x='value:Q',
+         y=alt.Y('feature:N', sort='-x')
+     ).properties(height=250)
+
+     performance_col = st.columns((2, 0.2, 3))
+     with performance_col[0]:
+         st.header('Model performance', divider='rainbow')
+         st.dataframe(rf_results.T.reset_index().rename(columns={'index': 'Parameter', 0: 'Value'}))
+     with performance_col[2]:
+         st.header('Feature importance', divider='rainbow')
+         st.altair_chart(bars, theme='streamlit', use_container_width=True)
+
+     # Prediction results
+     st.header('Prediction results', divider='rainbow')
+     s_y_train = pd.Series(y_train, name='actual').reset_index(drop=True)
+     s_y_train_pred = pd.Series(y_train_pred, name='predicted').reset_index(drop=True)
+     df_train = pd.DataFrame(data=[s_y_train, s_y_train_pred], index=None).T
+     df_train['class'] = 'train'
+
+     s_y_test = pd.Series(y_test, name='actual').reset_index(drop=True)
+     s_y_test_pred = pd.Series(y_test_pred, name='predicted').reset_index(drop=True)
+     df_test = pd.DataFrame(data=[s_y_test, s_y_test_pred], index=None).T
+     df_test['class'] = 'test'
+
+     df_prediction = pd.concat([df_train, df_test], axis=0)
+
+     prediction_col = st.columns((2, 0.2, 3))
+
+     # Display dataframe
+     with prediction_col[0]:
+         st.dataframe(df_prediction, height=320, use_container_width=True)
+
+     # Display scatter plot of actual vs predicted values
+     with prediction_col[2]:
+         scatter = alt.Chart(df_prediction).mark_circle(size=60).encode(
+             x='actual',
+             y='predicted',
+             color='class'
+         )
+         st.altair_chart(scatter, theme='streamlit', use_container_width=True)
+
+
+ # Ask for CSV upload if none is detected
+ else:
+     st.warning('👈 Upload a CSV file or click *"Load example data"* to get started!')
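For a quick sanity check of the modeling steps outside the Streamlit UI, the same pipeline can be exercised headlessly. The sketch below is illustrative and not part of the commit; it assumes network access to the example CSV and mirrors the app's default parameters (80/20 split, 100 trees, seed 42):

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Same Delaney solubility example data the app loads
df = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
X, y = df.iloc[:, :-1], df.iloc[:, -1]  # last column (logS) is the target

# Mirror the app's defaults: 80% training set, 100 estimators, seed 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

print('Test MSE:', mean_squared_error(y_test, rf.predict(X_test)))
print('Test R2:', r2_score(y_test, rf.predict(X_test)))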
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit==1.29.0
+ pandas>=1.3.0
+ scikit-learn
+ altair>=4.0
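A note on the pins: st.toggle and st.status, both used in app.py, were introduced around Streamlit 1.26, so the streamlit==1.29.0 pin is recent enough; numpy is not listed because it is pulled in transitively by pandas and scikit-learn. Assuming a standard Python environment, the app can then be started with pip install -r requirements.txt followed by streamlit run app.py.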