import csv

import pandas as pd
import streamlit as st
from transformers import AutoTokenizer

st.title("jp vocab viewer")

# Display name -> Hugging Face repo id of the tokenizer to inspect
vocab_list = {
    "llama": "tokyotech-llm/Swallow-70b-NVE-instruct-hf",
    "llama_swallow": "tokyotech-llm/Swallow-70b-instruct-hf",
    "mistral": "mistralai/Mistral-7B-Instruct-v0.2",
    "mistral-shisa": "augmxnt/shisa-7b-v1",
    "elyza/ELYZA-japanese-Llama-2-7b-fast-instruct": "elyza/ELYZA-japanese-Llama-2-7b-fast-instruct",
    "stablelm-3b-4e1t": "stabilityai/stablelm-3b-4e1t",
    "stablelm-3b-4e1t-ja": "stabilityai/japanese-stablelm-3b-4e1t-instruct",
    "cyberagent-calm2": "cyberagent/calm2-7b",
    "cyberagent-open-calm": "cyberagent/open-calm-7b",
    "llm-jp": "llm-jp/llm-jp-13b-v1.0",
    "stockmark/stockmark-13b": "stockmark/stockmark-13b",
    "stockmark/gpt-neox-japanese-1.4b": "stockmark/gpt-neox-japanese-1.4b",
    "novelai-nerdstash-v1": "NovelAI/nerdstash-tokenizer-v1",
    "novelai-nerdstash-v2": "NovelAI/nerdstash-tokenizer-v2",
    "line-gpt2": "line-corporation/japanese-large-lm-1.7b",
    "line-gptneox": "line-corporation/japanese-large-lm-3.6b-instruction-sft",
    "qwen-rinna-nekomata": "rinna/nekomata-7b",
    "llama-rinna-youri": "rinna/youri-7b",
    "rinna-gptneox": "rinna/japanese-gpt-neox-3.6b",
    "rinna-bilingual-gpt-neox": "rinna/bilingual-gpt-neox-4b-instruction-sft",
    "matsuo-lab-weblab": "matsuo-lab/weblab-10b",
    "plamo": "pfnet/plamo-13b-instruct",
    "sudy-super-Sentinel": "sudy-super/Sentinel",
    "sudy-super-baku-10b-chat": "sudy-super/baku-10b-chat",
    "aixsatoshi-calm2-7b-chat-7b-moe": "aixsatoshi/calm2-7b-chat-7b-moe",
    "llama-karakuri-lm": "karakuri-ai/karakuri-lm-70b-chat-v0.1",
    "watashiha-gpt-6b": "watashiha/watashiha-gpt-6b",
    "watashiha-llama-ogiri": "watashiha/Watashiha-Llama-2-13B-Ogiri-sft-neuron",
    "llama-superswallow": "nitky/Superswallow-70b-v0.1",
}

# Sidebar: pick which tokenizer vocabulary to display (radio buttons)
st.sidebar.title("Select Vocab")
selected_vocab = st.sidebar.radio("", list(vocab_list.keys()))

# Load the tokenizer for the selected repo
repo_id = vocab_list[selected_vocab]
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)


def escape_byte(b):
    """Return a display-safe replacement for a single byte.

    C0 control bytes (0x00-0x08, 0x0A-0x1F) and DEL (0x7F) are blanked so they do
    not break the Streamlit table or the CSV dump; the plain space (0x20) is kept
    visible. Any other byte also returns "" so the caller falls back to the raw byte.
    """
    if b == 0x20:
        return " "
    if 0x00 <= b <= 0x08 or 0x0A <= b <= 0x1F or b == 0x7F:
        return ""
    return ""


def escape_decode(token):
    """Escape a decoded token so control characters and invisible Unicode marks
    do not corrupt the displayed table or the exported CSV."""
    ret = token
    tok_enc = token.encode()
    if len(tok_enc) == 1:
        # Single-byte token: blank control bytes and DEL, keep the space
        b = tok_enc[0]
        if 0x00 <= b <= 0x08 or 0x0A <= b <= 0x20 or b == 0x7F:
            ret = escape_byte(b)
    elif tok_enc == b"\xef\xbf\xbd":
        # U+FFFD replacement character
        ret = ""
    elif tok_enc in (b"\xe2\x80\xa8", b"\xe2\x80\xab", b"\xe2\x80\xac", b"\xe2\x80\xad", b"\xe2\x80\xaf"):
        # U+2028 line separator, bidi controls U+202B/U+202C/U+202D, U+202F narrow no-break space
        ret = ""
    elif tok_enc in (b"\xe2\x80\x8e", b"\xe2\x80\x8f", b"\x2c\xe2\x80\x8e"):
        # U+200E/U+200F directional marks, and "," followed by U+200E
        ret = ""
    else:
        # Multi-byte token: escape each byte individually, keeping bytes that
        # have no replacement
        escape_tok_enc = bytes()
        for b in tok_enc:
            es = escape_byte(b)
            if len(es) > 0:
                escape_tok_enc += es.encode()
            else:
                # keep the raw byte
                escape_tok_enc += b.to_bytes(1, "big")
        ret = escape_tok_enc.decode()
    return ret


# Build the vocabulary table, sorted by token id
sorted_token_ids = sorted(tokenizer.get_vocab().values())
df = pd.DataFrame()
df["token_id"] = sorted_token_ids
df["token"] = [escape_decode(tokenizer.decode([i])) for i in sorted_token_ids]
# each token's UTF-8 bytes as a hex string, e.g. "E3 81 82"
df["bytes"] = [" ".join(f"{b:02X}" for b in tokenizer.decode([i]).encode()) for i in sorted_token_ids]
# a single hex pair (length 2) means the token decodes to exactly one byte
df["toktype"] = ["BYTE" if len(b) == 2 else "" for b in df["bytes"]]
df["token_id"] = df["token_id"].astype(int)
df["token"] = df["token"].astype(str)

st.subheader(selected_vocab)
st.write("vocab_size", tokenizer.vocab_size)
st.write("class_name", tokenizer.__class__.__name__)

csv_str = df.to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8")
st.download_button(
    label="CSV Download",
    data=csv_str,
    file_name=f'vocabviewer_dump_{repo_id.replace("/", "--")}.csv',
    mime="text/csv",
)
st.dataframe(df, width=1000, height=1000)
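
# Usage note (a sketch; the file name vocab_viewer.py is an assumption, not part of
# the original source): the viewer is launched from a shell with
#
#   streamlit run vocab_viewer.py
#
# AutoTokenizer.from_pretrained fetches only the tokenizer files of the selected
# repo, not the model weights, so even the 70B entries in vocab_list load quickly.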