taka-yamakoshi committed on
Commit
21c2f11
1 Parent(s): a999c8e

include detokenize

Browse files
Files changed (1) hide show
  1. app.py +33 -7
app.py CHANGED
@@ -24,11 +24,14 @@ def load_model(model_name):
24
  def generate_markdown(text,color='black',font='Arial',size=20):
25
  return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{text}</p>"
26
 
27
- def TokenizeText(sentence):
28
  if len(sentence)>0:
29
- input_sent = tokenizer(sentence)['input_ids']
30
- encoded_sent = [str(token) for token in input_sent[1:-1]]
31
- decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
 
 
 
32
  num_tokens = len(decoded_sent)
33
 
34
  #char_nums = [len(word)+2 for word in decoded_sent]
@@ -44,6 +47,22 @@ def TokenizeText(sentence):
44
 
45
  return num_tokens
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  if __name__=='__main__':
49
 
@@ -76,7 +95,7 @@ if __name__=='__main__':
76
 
77
  # Title
78
  st.markdown(generate_markdown('Tokenizer Demo:',size=32), unsafe_allow_html=True)
79
- st.markdown(generate_markdown('Quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
80
 
81
  # Select and load the tokenizer
82
  tokenizer_name = st.sidebar.selectbox('Choose the tokenizer from below',
@@ -87,6 +106,7 @@ if __name__=='__main__':
87
  tokenizer = load_model(tokenizer_name)
88
 
89
  comparison_mode = st.sidebar.checkbox('Compare two texts')
 
90
  if comparison_mode:
91
  sent_cols = st.columns(2)
92
  num_tokens = {}
@@ -95,7 +115,10 @@ if __name__=='__main__':
95
  with sent_col:
96
  sentence = st.text_input(f'Text {sent_id+1}')
97
  sents[f'sent_{sent_id+1}'] = sentence
98
- num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence)
 
 
 
99
 
100
  if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
101
  st.markdown(generate_markdown('Result&colon; ',size=16), unsafe_allow_html=True)
@@ -106,4 +129,7 @@ if __name__=='__main__':
106
 
107
  else:
108
  sentence = st.text_input(f'Text')
109
- num_tokens = TokenizeText(sentence)
 
 
 
 
24
  def generate_markdown(text,color='black',font='Arial',size=20):
25
  return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{text}</p>"
26
 
27
+ def TokenizeText(sentence,tokenizer_name):
28
  if len(sentence)>0:
29
+ if tokenizer_name.startswith('gpt2'):
30
+ input_sent = tokenizer(sentence)['input_ids']
31
+ else:
32
+ input_sent = tokenizer(sentence)['input_ids'][1:-1]
33
+ encoded_sent = [str(token) for token in input_sent]
34
+ decoded_sent = [tokenizer.decode([token]) for token in input_sent]
35
  num_tokens = len(decoded_sent)
36
 
37
  #char_nums = [len(word)+2 for word in decoded_sent]
 
47
 
48
  return num_tokens
49
 
50
+ def DeTokenizeText(input_str):
51
+ if len(input_str)>0:
52
+ input_sent = [int(element) for element in input_str.strip().split(' ')]
53
+ encoded_sent = [str(token) for token in input_sent]
54
+ decoded_sent = [tokenizer.decode([token]) for token in input_sent]
55
+ num_tokens = len(decoded_sent)
56
+
57
+ #char_nums = [len(word)+2 for word in decoded_sent]
58
+ #word_cols = st.columns(char_nums)
59
+ #for word_col,word in zip(word_cols,decoded_sent):
60
+ #with word_col:
61
+ #st.write(word)
62
+ #st.write(' '.join(encoded_sent))
63
+ #st.write(' '.join(decoded_sent))
64
+ st.markdown(generate_markdown(' '.join(decoded_sent)), unsafe_allow_html=True)
65
+ return num_tokens
66
 
67
  if __name__=='__main__':
68
 
 
95
 
96
  # Title
97
  st.markdown(generate_markdown('Tokenizer Demo:',size=32), unsafe_allow_html=True)
98
+ st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
99
 
100
  # Select and load the tokenizer
101
  tokenizer_name = st.sidebar.selectbox('Choose the tokenizer from below',
 
106
  tokenizer = load_model(tokenizer_name)
107
 
108
  comparison_mode = st.sidebar.checkbox('Compare two texts')
109
+ detokenize = st.sidebar.checkbox('de-tokenize')
110
  if comparison_mode:
111
  sent_cols = st.columns(2)
112
  num_tokens = {}
 
115
  with sent_col:
116
  sentence = st.text_input(f'Text {sent_id+1}')
117
  sents[f'sent_{sent_id+1}'] = sentence
118
+ if detokenize:
119
+ num_tokens[f'sent_{sent_id+1}'] = DeTokenizeText(sentence)
120
+ else:
121
+ num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence,tokenizer_name)
122
 
123
  if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
124
  st.markdown(generate_markdown('Result&colon; ',size=16), unsafe_allow_html=True)
 
129
 
130
  else:
131
  sentence = st.text_input(f'Text')
132
+ if detokenize:
133
+ num_tokens = DeTokenizeText(sentence)
134
+ else:
135
+ num_tokens = TokenizeText(sentence,tokenizer_name)