import csv

import pandas as pd
import streamlit as st
from transformers import AutoTokenizer

st.title("jp vocab viewer")

# Display name -> Hugging Face repo id of the tokenizer to inspect
vocab_list = {
    "llama": "tokyotech-llm/Swallow-70b-NVE-instruct-hf",
    "llama_swallow": "tokyotech-llm/Swallow-70b-instruct-hf",
    "mistral": "mistralai/Mistral-7B-Instruct-v0.2",
    "mistral-shisa": "augmxnt/shisa-7b-v1",
    "elyza/ELYZA-japanese-Llama-2-7b-fast-instruct": "elyza/ELYZA-japanese-Llama-2-7b-fast-instruct",
    "stablelm-3b-4e1t": "stabilityai/stablelm-3b-4e1t",
    "stablelm-3b-4e1t-ja": "stabilityai/japanese-stablelm-3b-4e1t-instruct",
    "cyberagent-calm2": "cyberagent/calm2-7b",
    "cyberagent-open-calm": "cyberagent/open-calm-7b",
    "llm-jp": "llm-jp/llm-jp-13b-v1.0",
    "stockmark/stockmark-13b": "stockmark/stockmark-13b",
    "stockmark/gpt-neox-japanese-1.4b": "stockmark/gpt-neox-japanese-1.4b",
    "novelai-nerdstash-v1": "NovelAI/nerdstash-tokenizer-v1",
    "novelai-nerdstash-v2": "NovelAI/nerdstash-tokenizer-v2",
    "line-gpt2": "line-corporation/japanese-large-lm-1.7b",
    "line-gptneox": "line-corporation/japanese-large-lm-3.6b-instruction-sft",
    "qwen-rinna-nekomata": "rinna/nekomata-7b",
    "llama-rinna-youri": "rinna/youri-7b",
    "rinna-gptneox": "rinna/japanese-gpt-neox-3.6b",
    "rinna-bilingual-gpt-neox": "rinna/bilingual-gpt-neox-4b-instruction-sft",
    "matsuo-lab-weblab": "matsuo-lab/weblab-10b",
    "plamo": "pfnet/plamo-13b-instruct",
    "sudy-super-Sentinel": "sudy-super/Sentinel",
    "sudy-super-baku-10b-chat": "sudy-super/baku-10b-chat",
    "aixsatoshi-calm2-7b-chat-7b-moe": "aixsatoshi/calm2-7b-chat-7b-moe",
    "llama-karakuri-lm": "karakuri-ai/karakuri-lm-70b-chat-v0.1",
    "watashiha-gpt-6b": "watashiha/watashiha-gpt-6b",
    "watashiha-llama-ogiri": "watashiha/Watashiha-Llama-2-13B-Ogiri-sft-neuron",
    "llama-superswallow": "nitky/Superswallow-70b-v0.1",
}

# Sidebar: pick which tokenizer vocabulary to display (radio buttons)
st.sidebar.title("Select Vocab")
selected_vocab = st.sidebar.radio("", list(vocab_list.keys()))

# Load the tokenizer for the selected repo
repo_id = vocab_list[selected_vocab]
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)


def escape_byte(b):
    """Return a display-safe replacement for a single byte.

    C0 control bytes (0x00-0x08, 0x0A-0x1F) and DEL (0x7F) are blanked so they do
    not break the Streamlit table or the CSV dump; the plain space (0x20) is kept
    visible. Any other byte also returns "" so the caller falls back to the raw byte.
    """
    if b == 0x20:
        return " "
    if 0x00 <= b <= 0x08 or 0x0A <= b <= 0x1F or b == 0x7F:
        return ""
    return ""


def escape_decode(token):
    """Escape a decoded token so control characters and invisible Unicode marks
    do not corrupt the displayed table or the exported CSV."""
    ret = token
    tok_enc = token.encode()
    if len(tok_enc) == 1:
        # Single-byte token: blank control bytes and DEL, keep the space
        b = tok_enc[0]
        if 0x00 <= b <= 0x08 or 0x0A <= b <= 0x20 or b == 0x7F:
            ret = escape_byte(b)
    elif tok_enc == b"\xef\xbf\xbd":
        # U+FFFD replacement character
        ret = ""
    elif tok_enc in (b"\xe2\x80\xa8", b"\xe2\x80\xab", b"\xe2\x80\xac", b"\xe2\x80\xad", b"\xe2\x80\xaf"):
        # U+2028 line separator, bidi controls U+202B/U+202C/U+202D, U+202F narrow no-break space
        ret = ""
    elif tok_enc in (b"\xe2\x80\x8e", b"\xe2\x80\x8f", b"\x2c\xe2\x80\x8e"):
        # U+200E/U+200F directional marks, and "," followed by U+200E
        ret = ""
    else:
        # Multi-byte token: escape each byte individually, keeping bytes that
        # have no replacement
        escape_tok_enc = bytes()
        for b in tok_enc:
            es = escape_byte(b)
            if len(es) > 0:
                escape_tok_enc += es.encode()
            else:
                # keep the raw byte
                escape_tok_enc += b.to_bytes(1, "big")
        ret = escape_tok_enc.decode()
    return ret


# Build the vocabulary table, sorted by token id
sorted_token_ids = sorted(tokenizer.get_vocab().values())
df = pd.DataFrame()
df["token_id"] = sorted_token_ids
df["token"] = [escape_decode(tokenizer.decode([i])) for i in sorted_token_ids]
# each token's UTF-8 bytes as a hex string, e.g. "E3 81 82"
df["bytes"] = [" ".join(f"{b:02X}" for b in tokenizer.decode([i]).encode()) for i in sorted_token_ids]
# a single hex pair (length 2) means the token decodes to exactly one byte
df["toktype"] = ["BYTE" if len(b) == 2 else "" for b in df["bytes"]]
df["token_id"] = df["token_id"].astype(int)
df["token"] = df["token"].astype(str)

st.subheader(selected_vocab)
st.write("vocab_size", tokenizer.vocab_size)
st.write("class_name", tokenizer.__class__.__name__)

csv_str = df.to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8")
st.download_button(
    label="CSV Download",
    data=csv_str,
    file_name=f'vocabviewer_dump_{repo_id.replace("/", "--")}.csv',
    mime="text/csv",
)
st.dataframe(df, width=1000, height=1000)
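
# Usage note (a sketch; the file name vocab_viewer.py is an assumption, not part of
# the original source): the viewer is launched from a shell with
#
#   streamlit run vocab_viewer.py
#
# AutoTokenizer.from_pretrained fetches only the tokenizer files of the selected
# repo, not the model weights, so even the 70B entries in vocab_list load quickly.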