"""Gradio demo for TabPFN: classify the blank label cells of an editable table.

The user edits (or uploads) a table, leaves the label cell blank in the rows
that should be predicted, and the app fits a TabPFNClassifier on the remaining
rows.  A decision-boundary plot over the first two feature columns is shown
next to the predictions.
"""
import sys
tabpfn_path = 'TabPFN'
sys.path.insert(0, tabpfn_path)  # our submodule of the TabPFN repo (at 045c8400203ebd062346970b4f2c0ccda5a40618)
from TabPFN.scripts.transformer_prediction_interface import TabPFNClassifier

import numpy as np
from pathlib import Path
import pandas as pd
import torch
import gradio as gr
import openml
import os
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.figure import Figure
# DecisionBoundaryDisplay was referenced below without ever being imported,
# which made every successful prediction end in a NameError.
from sklearn.inspection import DecisionBoundaryDisplay

# Cell contents that mark "predict this label".  upload_file() blanks cells
# with '', while older tables used the literal '(predict)'; accept both.
PREDICT_MARKERS = ('', '(predict)')

# Column names of the currently loaded dataset; rewritten by upload_file().
headers = []


def _non_empty_row_mask(cells):
    """Return a boolean mask selecting the rows that contain any text at all."""
    cells = np.asarray(cells, dtype=object)
    # otypes is given explicitly so that np.vectorize also works on 0 rows.
    cell_lengths = np.vectorize(lambda s: len(str(s)), otypes=[int])(cells)
    return cell_lengths.sum(1) != 0


def _plot_decision_boundary(x_train, y_train, first_class, feature_names):
    """Plot a one-vs-rest decision boundary over the first two features.

    A fresh classifier is fitted on the first two feature columns only, with
    the target binarised to "is `first_class`" vs. everything else.
    """
    # A standalone Figure (instead of plt.figure) is not registered in
    # pyplot's global state, so repeated requests do not accumulate figures.
    fig = Figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    cm_bright = ListedColormap(["#FF0000", "#0000FF"])

    plot_x = x_train[:, 0:2].numpy()
    is_first_class = y_train == first_class

    # Plot the training points
    ax.scatter(plot_x[:, 0], plot_x[:, 1], c=is_first_class, cmap=cm_bright)

    boundary_classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')
    boundary_classifier.fit(plot_x, is_first_class)

    DecisionBoundaryDisplay.from_estimator(
        boundary_classifier, plot_x, alpha=0.6, ax=ax, eps=2.0,
        grid_resolution=100, response_method="predict_proba"
    )
    ax.set_xlabel(feature_names[0])
    ax.set_ylabel(feature_names[1])
    return fig


def compute(table: np.ndarray):
    """Predict the blank label cells of `table` with TabPFN.

    Rows without any content are ignored.  All cells marked for prediction
    (see PREDICT_MARKERS) must lie in a single column, which is treated as the
    label column; every other column must be numeric.

    Returns:
        A tuple ``(message, predictions, figure)``.  On invalid input,
        ``message`` is a Markdown error string and the other two entries are
        None.  On success ``message`` is None, ``predictions`` is a DataFrame
        holding the predicted rows (label cell formatted as
        ``"<label> (p=<probability>)"``) and ``figure`` is the decision
        boundary plot, or None when fewer than two feature columns exist.
    """
    no_blank_message = "⚠️ **ERROR: Please leave at least one field blank for prediction.**"

    table = np.asarray(table, dtype=object)
    if table.ndim != 2 or table.size == 0:
        return no_blank_message, None, None
    table = table[_non_empty_row_mask(table)]

    predict_mask = np.zeros(table.shape, dtype=bool)
    for marker in PREDICT_MARKERS:
        predict_mask |= (table == marker)
    eval_lines, eval_columns = np.where(predict_mask)

    if not len(eval_lines):
        return no_blank_message, None, None
    y_column = eval_columns[0]
    if not np.all(eval_columns == y_column):
        return "⚠️ **Please only leave fields of one column blank for prediction.**", None, None

    train_table = np.delete(table, eval_lines, axis=0)
    eval_table = table[eval_lines]
    if not len(train_table):
        return "⚠️ **Please provide at least one labelled row for training.**", None, None

    try:
        x_train = torch.tensor(np.delete(train_table, y_column, axis=1).astype(np.float32))
        x_eval = torch.tensor(np.delete(eval_table, y_column, axis=1).astype(np.float32))
    except (ValueError, TypeError):
        # TypeError covers cells such as None, which float() rejects too.
        return "⚠️ **Please only add numbers (to the inputs) or leave fields empty.**", None, None
    # Labels are handled as strings so that mixed-type (object) label columns
    # are still accepted as classification targets.
    y_train = train_table[:, y_column].astype(str)

    classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')
    classifier.fit(x_train, y_train)
    y_eval, p_eval = classifier.predict(x_eval, return_winning_probability=True)

    # Fall back to positional names when the table no longer matches the
    # headers recorded by the last upload.
    n_columns = table.shape[1]
    column_names = headers if len(headers) == n_columns else np.arange(n_columns)

    out_table = pd.DataFrame(table.copy().astype(str))
    out_table.iloc[eval_lines, y_column] = [f"{y_e} (p={p_e:.2f})" for y_e, p_e in zip(y_eval, p_eval)]
    out_table = out_table.iloc[eval_lines, :]
    out_table.columns = column_names

    # PLOTTING
    if x_train.shape[1] < 2:
        # The plot needs two feature axes; still return the predictions.
        return None, out_table, None
    # Axis labels must name the plotted *feature* columns, i.e. the columns
    # that remain once the label column has been removed.
    feature_names = np.delete(np.asarray(column_names), y_column)
    fig = _plot_decision_boundary(x_train, y_train, classifier.classes_[0], feature_names)

    return None, out_table, fig


def upload_file(file, remove_entries=10):
    """Load a dataset file into a DataFrame and blank the first label cells.

    Args:
        file: Object whose ``name`` attribute is the path of a ``.arff``,
            ``.csv`` or ``.data`` file.
        remove_entries: Number of leading rows whose last-column value is
            replaced by '' so that they become prediction targets.

    Returns:
        The loaded DataFrame with integer column names.

    Raises:
        ValueError: If the file extension is not supported.

    Side effect: rewrites the module-level ``headers``.
    """
    global headers
    path = file.name
    if path.endswith('.arff'):
        dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=path)
        X_, _, categorical_indicator_, attribute_names_ = dataset.get_data(
            dataset_format="array"
        )
        df = pd.DataFrame(X_, columns=attribute_names_)
    elif path.endswith(('.csv', '.data')):
        # NOTE(review): header='infer' consumes the first line as a header,
        # although the upload hint asks for header-less files - confirm
        # whether header=None is intended.
        df = pd.read_csv(path, header='infer')
    else:
        raise ValueError(f"Unsupported file type: {path!r} (expected .arff, .csv or .data)")

    headers = np.arange(len(df.columns))
    df.columns = headers

    # Cast the label column to object first so that writing '' into a numeric
    # column is an explicit conversion rather than an implicit upcast.
    label_column = df.columns[-1]
    df[label_column] = df[label_column].astype(object)
    df.iloc[0:remove_entries, -1] = ''
    return df


def update_table(table):
    """Drop empty rows and re-blank the label column of the edited table.

    The column of the first '' cell is treated as the label column; every row
    containing a '' cell gets '' written into that column.  When no cell is
    blank the filtered table is returned without renaming its columns.
    """
    frame = pd.DataFrame(table)
    frame = frame[_non_empty_row_mask(frame)].copy()

    blank_rows, blank_columns = np.where(frame == '')
    if not len(blank_rows):
        return frame

    y_column = blank_columns[0]
    frame.iloc[blank_rows, y_column] = ''
    frame.columns = headers
    return frame


with gr.Blocks() as demo:
    # The description has to be created inside the Blocks context; created
    # outside of it the component is never attached to the page.
    gr.Markdown("""This demo allows you to play with the **TabPFN**. The TabPFN will classify the values for all empty cells in the label column. Please, provide everything but the label column as numeric values. You can also upload datasets to fill the table automatically.
""")
    with gr.Row():
        with gr.Column():
            inp_table = gr.DataFrame(type='numpy', value=upload_file(Path('iris.csv'), remove_entries=10),
                                     headers=[''] * 3)
            inp_file = gr.File(
                label='Drop either a .csv (without header, only numeric values for all but the labels) or a .arff file.')
            examples = gr.Examples(examples=['iris.csv', 'balance-scale.arff'],
                                   inputs=[inp_file],
                                   outputs=[inp_table],
                                   fn=upload_file,
                                   cache_examples=True)
            # inp_table.change(fn=update_table, inputs=inp_table, outputs=inp_table)
        with gr.Column():
            btn = gr.Button("Calculate Predictions")
            out_text = gr.Markdown()
            out_plot = gr.Plot()
            out_table = gr.DataFrame()

    btn.click(fn=compute, inputs=inp_table, outputs=[out_text, out_table, out_plot])
    inp_file.change(fn=upload_file, inputs=inp_file, outputs=inp_table)

demo.launch(share=True)