import random

import pandas as pd
import solara
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the pretrained Spanish GPT-2 model
tokenizer = AutoTokenizer.from_pretrained("datificate/gpt2-small-spanish")
model = AutoModelForCausalLM.from_pretrained("datificate/gpt2-small-spanish")
# tokenizer = AutoTokenizer.from_pretrained('gpt2')
# model = AutoModelForCausalLM.from_pretrained('gpt2')

# Reactive state holding the current prompt shown in the input box
text1 = solara.reactive("Hola, ¿cómo estás")


@solara.component
def Page():
    with solara.Column(margin=10):
        solara.Markdown("# Next-token prediction")
        solara.Markdown("We built this tool to illustrate how autoregressive language models work...")

        def on_action_cell(column, row_index):
            # Append the selected candidate token to the prompt
            text1.value += tokenizer.decode(top_10.indices[0][row_index])

        cell_actions = [solara.CellAction(icon="mdi-thumb-up", name="Select", on_click=on_action_cell)]
        solara.InputText("Enter text:", value=text1, continuous_update=True)

        if text1.value != "":
            tokens = tokenizer.encode(text1.value, return_tensors="pt")

            # Show the token IDs and the decoded tokens, each pair highlighted with the same
            # per-position color (the inline span markup is an assumption; only the color
            # generation survived in the source).
            spans1 = ""
            spans2 = ""
            for i, token in enumerate(tokens[0]):
                random.seed(i)
                random_color = "".join(random.choice("0123456789ABCDEF") for _ in range(6))
                spans1 += f' <span style="background-color: #{random_color}">{token}</span>'
                spans2 += f' <span style="background-color: #{random_color}">{tokenizer.decode([token])}</span>'
            solara.Markdown(spans2)
            solara.Markdown(spans1)

            # Generate exactly one new token, keeping the scores so we can inspect the
            # probability distribution over candidate next tokens.
            outputs = model.generate(tokens, max_new_tokens=1, output_scores=True,
                                     return_dict_in_generate=True, pad_token_id=tokenizer.eos_token_id)
            scores = F.softmax(outputs.scores[0], dim=-1)
            top_10 = torch.topk(scores, 10)

            # Table with the ten most likely next tokens and their probabilities
            df = pd.DataFrame()
            df["probs"] = top_10.values[0]
            df["probs"] = [f"{value:.2%}" for value in df["probs"].values]
            df["next token ID"] = [top_10.indices[0][i].numpy() for i in range(10)]
            df["predicted next token"] = [tokenizer.decode(top_10.indices[0][i]) for i in range(10)]

            solara.Markdown("### Prediction")
            solara.DataFrame(df, items_per_page=10, cell_actions=cell_actions)


Page()
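
# Usage note (a sketch, assuming this script is saved under the hypothetical name app.py):
# Solara apps are served from the command line with `solara run app.py`, which renders the
# `Page` component in the browser; the trailing `Page()` call above is what displays the
# component when the script is executed in a notebook instead.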