import time
import gradio as gr
from os import getenv
from openai import OpenAI

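# OpenRouter exposes an OpenAI-compatible API, so the stock OpenAI client works
# against it with a custom base_url and an OPENROUTER_API_KEY.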
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),
)

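# The .thought block holding the model's reasoning is hidden by default and only
# revealed when <body> carries the 'show-thoughts' class (toggled by the
# "Show Thoughts" checkbox defined further down).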
css = """
body.show-thoughts .thought {
    display: block !important;
}

.thought {
    opacity: 0.8; 
    font-family: "Courier New", monospace;
    border: 1px gray solid;
    padding: 10px;
    border-radius: 5px;
    display: none;
}

.thought-prompt {
    opacity: 0.8;
    font-family: "Courier New", monospace;
}
"""

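# Load the long system prompt. It is assumed to instruct the model to wrap its
# reasoning in <inner_thoughts>...</inner_thoughts> and its reply in
# <final_answer>...</final_answer>, which is what streaming() parses below.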
with open("contemplator.txt", "r") as f:
    system_msg = f.read()

def make_thinking_prompt(elapsed):
    """Return an animated "Thinking" indicator whose dot position oscillates with elapsed seconds."""
    i = int(elapsed * 4) % 40
    if i > 20:
        i = 40 - i
    return "🤔 [" + "." * i + "Thinking" + "." * (20 - i) + "]"


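# Chat handler: rebuilds the OpenAI-style message list from the Gradio history,
# streams the completion, and progressively separates the <inner_thoughts> block
# from the final answer so the UI can render them differently.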
def streaming(message, history, system_msg, model):
    messages = [
        {
            "role": "system",
            "content": system_msg
        }
    ]
    for user, assistant in history:
        messages.append({
            "role": "user",
            "content": user
        })
        messages.append({
            "role": "assistant",
            "content": assistant
        })

    messages.append({
        "role": "user",
        "content": message
    })
    
    thinking_prompt = "<p class='thought-prompt'>" + "🤨 Understanding..." + "</p>"
    yield thinking_prompt

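    # Stream the completion so thoughts can be shown as they arrive; temperature 0
    # keeps runs deterministic and the token cap leaves room for a long chain of thought.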
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        max_completion_tokens=8000,
        temperature=0.0,
        stream=True,
    )
    
    reply = ""
    thought_text = ""
    answer = ""

    start_time = time.time()
    try:
        for chunk in completion:
            # Some stream chunks carry no content (e.g. the final empty delta); skip them safely.
            if not chunk.choices or chunk.choices[0].delta.content is None:
                continue
            reply += chunk.choices[0].delta.content
            answer = ""
            if "</inner_thoughts>" not in reply:
                # Still thinking: render everything streamed so far as the (hidden by default) thought block.
                thought_text = f'<div class="thought">{reply.replace("<inner_thoughts>", "").strip()}</div>'
                thinking_prompt = "<p class='thought-prompt'>" + make_thinking_prompt(time.time() - start_time) + "</p>"
            else:
                # Thinking finished: split the reply into the thought block and the final answer.
                thought_text = f'<div class="thought">{reply.replace("<inner_thoughts>", "").split("</inner_thoughts>")[0].strip()}</div>'
                answer = reply.split("</inner_thoughts>")[1].replace("<final_answer>", "").replace("</final_answer>", "").strip()
                thinking_prompt = f"<p class='thought-prompt'>⌛ Thought for {time.time() - start_time:.2f} seconds</p>"
            yield thinking_prompt + thought_text + "<br>" + answer
        yield thinking_prompt + thought_text + "<br>" + answer
    except Exception as e:
        print(e)
        yield f"An error occurred. {e}"
        
markdown = """
## 🫐 Overthink 1(o1)

Inspired by how o1 works, this LLM is instructed to generate a very long and detailed chain of thought. It will think extra hard before providing an answer.

This actually does help with reasoning, compared to ordinary step-by-step prompting. I wrote a blog post about it [here](https://huggingface.co/blog/wenbopan/recreating-o1).

Sometimes the model overthinks even super simple questions, but it's fun to watch. Hope you enjoy it!

### System Message

This is done by instructing the model with a large system message, which you can inspect in the System Message tab.
"""

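# Layout: a settings / system-message column on the left, the chat interface on the right.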
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css, fill_height=True) as demo:
    with gr.Row(equal_height=True):
        with gr.Column(scale=1, min_width=300):
            with gr.Tab("Settings"):
                gr.Markdown(markdown)
                model = gr.Dropdown(
                    [
                        "nousresearch/hermes-3-llama-3.1-405b:free",
                        "nousresearch/hermes-3-llama-3.1-70b",
                        "meta-llama/llama-3.1-405b-instruct",
                        "google/gemini-pro-1.5-exp",
                        "meta-llama/llama-3.1-8b-instruct:free",
                    ],
                    value="nousresearch/hermes-3-llama-3.1-405b:free",
                    label="Model",
                )
                show_thoughts = gr.Checkbox(False, label="Show Thoughts", interactive=True, elem_id="show_thoughts")
                
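                # Pure client-side callback: toggle the 'show-thoughts' class on <body>
                # so the CSS defined above reveals or hides the thought blocks.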
                show_thoughts.change(None, js="""function run(){ checked = document.querySelector('#show_thoughts input[type="checkbox"]').checked; document.querySelector('body').classList.toggle('show-thoughts', checked); } """)
            with gr.Tab("System Message"):
                system_msg = gr.TextArea(system_msg, label="System Message")
        with gr.Column(scale=3, min_width=300):
            gr.ChatInterface(
                streaming, 
                additional_inputs=[
                    system_msg,
                    model
                ],
                examples=[
                    ["How do you do?    ", None, None, None],
                    ["How many R's in strawberry?", None, None, None],
                    ["Solve the puzzle of 24 points: 1 2 3 4", None, None, None],
                    ["Find x such that ⌈x⌉ + x = 23/7. Express x as a common fraction.", None, None, None],
                ],
                cache_examples=False
            )

if __name__ == "__main__":
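    # Typical launch (assuming this file is saved as app.py with contemplator.txt alongside it):
    #   OPENROUTER_API_KEY=sk-or-... python app.py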
    demo.launch()