# tokenizer-demo / app.py
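"""
Streamlit demo for exploring how Hugging Face tokenizers split text into tokens,
and for converting token IDs back into text.

Run locally with:
    streamlit run app.py
"""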
import pandas as pd
import streamlit as st
import numpy as np
import torch
import io
import time
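
# Cache the loaded tokenizer across Streamlit reruns so each model is only downloaded once.
# (Assumes the legacy st.cache API; newer Streamlit releases deprecate it in favor of st.cache_resource.)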
@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(tokenizer_name):
    from transformers import AutoTokenizer
    # Map the display names used in the UI to Hugging Face model identifiers.
    model_name_dict = {
        "BERT": "bert-base-uncased",
        "RoBERTa": "roberta-base",
        "ALBERT": "albert-base-v2",
        "GPT2": "gpt2",
        #"Llama": "meta-llama/Llama-2-7b-chat-hf",
        #"Gemma": "google/gemma-7b",
    }
    tokenizer = AutoTokenizer.from_pretrained(model_name_dict[tokenizer_name])
    return tokenizer

def generate_markdown(text, color='black', font='Arial', size=20):
    # Wrap text in a centered <p> tag so st.markdown can render it with custom color, font, and size.
    return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{text}</p>"

def TokenizeText(sentence, tokenizer_name):
    # Tokenize the sentence, display the token IDs and the per-token strings, and return the token count.
    # Note: tokenizer_name is currently unused; the module-level tokenizer selected in the sidebar is used.
    if len(sentence) > 0:
        # input_ids include any special tokens the tokenizer adds (e.g. [CLS]/[SEP] for BERT).
        input_sent = tokenizer(sentence)['input_ids']
        encoded_sent = [str(token) for token in input_sent]
        decoded_sent = [tokenizer.decode([token]) for token in input_sent]
        num_tokens = len(decoded_sent)
        st.markdown(generate_markdown(' '.join(encoded_sent), size=16), unsafe_allow_html=True)
        st.markdown(generate_markdown(' '.join(decoded_sent), size=16), unsafe_allow_html=True)
        st.markdown(generate_markdown(f'{num_tokens} tokens'), unsafe_allow_html=True)
        return num_tokens

def DeTokenizeText(input_str):
    # Convert a whitespace-separated string of token IDs back into text and return the token count.
    if len(input_str) > 0:
        # split() (rather than split(' ')) tolerates stray extra spaces between the IDs.
        input_sent = [int(element) for element in input_str.strip().split()]
        decoded_sent = tokenizer.decode(input_sent)
        num_tokens = len(input_sent)
        st.markdown(generate_markdown(decoded_sent), unsafe_allow_html=True)
        return num_tokens

if __name__ == '__main__':
    # Page config: widen the main content area and trim the default padding.
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    # Hide the index column of tables rendered with st.table.
    hide_table_row_index = """
    <style>
        tbody th {display:none}
        .blank {display:none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    # Title
    st.markdown(generate_markdown('WordPiece Explorer', size=32), unsafe_allow_html=True)
    st.markdown(generate_markdown('- quick and easy way to explore how tokenizers work -', size=24), unsafe_allow_html=True)

    # Select and load the tokenizer
    st.sidebar.write('1. Choose the tokenizer from below')
    tokenizer_name = st.sidebar.selectbox('', ("BERT", "RoBERTa", "ALBERT", "GPT2"))
    tokenizer = load_model(tokenizer_name)

    st.sidebar.write('2. Optional settings')
    comparison_mode = st.sidebar.checkbox('Compare two texts')
    detokenize = st.sidebar.checkbox('de-tokenize')
    st.sidebar.write('"Compare two texts" compares the number of tokens for two pieces of text, '
                     'and "de-tokenize" converts a list of token IDs back into strings.')
    st.sidebar.write('For "de-tokenize", make sure to type in integers separated by single spaces.')

    if comparison_mode:
        # Comparison mode: two side-by-side inputs; report whether their token counts match.
        sent_cols = st.columns(2)
        num_tokens = {}
        sents = {}
        for sent_id, sent_col in enumerate(sent_cols):
            with sent_col:
                if detokenize:
                    sentence = st.text_input(f'Tokenized IDs {sent_id+1}')
                    num_tokens[f'sent_{sent_id+1}'] = DeTokenizeText(sentence)
                else:
                    sentence = st.text_input(f'Text {sent_id+1}')
                    num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence, tokenizer_name)
                sents[f'sent_{sent_id+1}'] = sentence
        if len(sents['sent_1']) > 0 and len(sents['sent_2']) > 0:
            st.markdown(generate_markdown('# Tokens&colon; ', size=16), unsafe_allow_html=True)
            if num_tokens['sent_1'] == num_tokens['sent_2']:
                st.markdown(generate_markdown('Matched! ', color='MediumAquamarine'), unsafe_allow_html=True)
            else:
                st.markdown(generate_markdown('Not Matched... ', color='Salmon'), unsafe_allow_html=True)
    else:
        # Single-input mode: pre-fill the box with an example sentence (or its token IDs).
        if detokenize:
            default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
            sentence = st.text_input('Tokenized IDs', value=' '.join([str(token) for token in default_tokens]))
            num_tokens = DeTokenizeText(sentence)
        else:
            sentence = st.text_input('Text', value='Tokenizers decompose bigger words into smaller tokens')
            num_tokens = TokenizeText(sentence, tokenizer_name)