taka-yamakoshi committed on
Commit
21c2f11
1 Parent(s): a999c8e

include detokenize

Browse files
Files changed (1) hide show
  1. app.py +33 -7
app.py CHANGED
@@ -24,11 +24,14 @@ def load_model(model_name):
24
  def generate_markdown(text,color='black',font='Arial',size=20):
25
  return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{text}</p>"
26
 
27
- def TokenizeText(sentence):
28
  if len(sentence)>0:
29
- input_sent = tokenizer(sentence)['input_ids']
30
- encoded_sent = [str(token) for token in input_sent[1:-1]]
31
- decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
 
 
 
32
  num_tokens = len(decoded_sent)
33
 
34
  #char_nums = [len(word)+2 for word in decoded_sent]
@@ -44,6 +47,22 @@ def TokenizeText(sentence):
44
 
45
  return num_tokens
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  if __name__=='__main__':
49
 
@@ -76,7 +95,7 @@ if __name__=='__main__':
76
 
77
  # Title
78
  st.markdown(generate_markdown('Tokenizer Demo:',size=32), unsafe_allow_html=True)
79
- st.markdown(generate_markdown('Quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
80
 
81
  # Select and load the tokenizer
82
  tokenizer_name = st.sidebar.selectbox('Choose the tokenizer from below',
@@ -87,6 +106,7 @@ if __name__=='__main__':
87
  tokenizer = load_model(tokenizer_name)
88
 
89
  comparison_mode = st.sidebar.checkbox('Compare two texts')
 
90
  if comparison_mode:
91
  sent_cols = st.columns(2)
92
  num_tokens = {}
@@ -95,7 +115,10 @@ if __name__=='__main__':
95
  with sent_col:
96
  sentence = st.text_input(f'Text {sent_id+1}')
97
  sents[f'sent_{sent_id+1}'] = sentence
98
- num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence)
 
 
 
99
 
100
  if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
101
  st.markdown(generate_markdown('Result&colon; ',size=16), unsafe_allow_html=True)
@@ -106,4 +129,7 @@ if __name__=='__main__':
106
 
107
  else:
108
  sentence = st.text_input(f'Text')
109
- num_tokens = TokenizeText(sentence)
 
 
 
 
24
  def generate_markdown(text,color='black',font='Arial',size=20):
25
  return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{text}</p>"
26
 
27
+ def TokenizeText(sentence,tokenizer_name):
28
  if len(sentence)>0:
29
+ if tokenizer_name.startswith('gpt2'):
30
+ input_sent = tokenizer(sentence)['input_ids']
31
+ else:
32
+ input_sent = tokenizer(sentence)['input_ids'][1:-1]
33
+ encoded_sent = [str(token) for token in input_sent]
34
+ decoded_sent = [tokenizer.decode([token]) for token in input_sent]
35
  num_tokens = len(decoded_sent)
36
 
37
  #char_nums = [len(word)+2 for word in decoded_sent]
 
47
 
48
  return num_tokens
49
 
50
+ def DeTokenizeText(input_str):
51
+ if len(input_str)>0:
52
+ input_sent = [int(element) for element in input_str.strip().split(' ')]
53
+ encoded_sent = [str(token) for token in input_sent]
54
+ decoded_sent = [tokenizer.decode([token]) for token in input_sent]
55
+ num_tokens = len(decoded_sent)
56
+
57
+ #char_nums = [len(word)+2 for word in decoded_sent]
58
+ #word_cols = st.columns(char_nums)
59
+ #for word_col,word in zip(word_cols,decoded_sent):
60
+ #with word_col:
61
+ #st.write(word)
62
+ #st.write(' '.join(encoded_sent))
63
+ #st.write(' '.join(decoded_sent))
64
+ st.markdown(generate_markdown(' '.join(decoded_sent)), unsafe_allow_html=True)
65
+ return num_tokens
66
 
67
  if __name__=='__main__':
68
 
 
95
 
96
  # Title
97
  st.markdown(generate_markdown('Tokenizer Demo:',size=32), unsafe_allow_html=True)
98
+ st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
99
 
100
  # Select and load the tokenizer
101
  tokenizer_name = st.sidebar.selectbox('Choose the tokenizer from below',
 
106
  tokenizer = load_model(tokenizer_name)
107
 
108
  comparison_mode = st.sidebar.checkbox('Compare two texts')
109
+ detokenize = st.sidebar.checkbox('de-tokenize')
110
  if comparison_mode:
111
  sent_cols = st.columns(2)
112
  num_tokens = {}
 
115
  with sent_col:
116
  sentence = st.text_input(f'Text {sent_id+1}')
117
  sents[f'sent_{sent_id+1}'] = sentence
118
+ if detokenize:
119
+ num_tokens[f'sent_{sent_id+1}'] = DeTokenizeText(sentence)
120
+ else:
121
+ num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence,tokenizer_name)
122
 
123
  if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
124
  st.markdown(generate_markdown('Result&colon; ',size=16), unsafe_allow_html=True)
 
129
 
130
  else:
131
  sentence = st.text_input(f'Text')
132
+ if detokenize:
133
+ num_tokens = DeTokenizeText(sentence)
134
+ else:
135
+ num_tokens = TokenizeText(sentence,tokenizer_name)