KingNish committed on
Commit 1ec2047
1 Parent(s): dbccc7b

Update app.py

Files changed (1): app.py (+138, -58)
app.py CHANGED
@@ -1,88 +1,168 @@
-import os
-import gradio as gr
-import copy
 from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
 
 
-llm = Llama(
-    model_path=hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "microsoft/Phi-3-mini-4k-instruct-gguf"),
-        filename=os.environ.get("MODEL_FILE", "Phi-3-mini-4k-instruct-q4.gguf"),
-    ),
-    n_ctx=2048,
-    n_gpu_layers=50,  # change n_gpu_layers if you have more or less VRAM
-)
 
 
-def generate_text(
     message,
     history: list[tuple[str, str]],
     system_message,
     max_tokens,
     temperature,
     top_p,
 ):
-    temp = ""
-    input_prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
-    for interaction in history:
-        input_prompt = input_prompt + str(interaction[0]) + " [/INST] " + str(interaction[1]) + " </s><s> [INST] "
 
-    input_prompt = input_prompt + str(message) + " [/INST] "
 
-    output = llm(
-        input_prompt,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=40,
-        repeat_penalty=1.1,
-        max_tokens=max_tokens,
-        stop=[
-            "<|prompter|>",
-            "<|endoftext|>",
-            "<|endoftext|> \n",
-            "ASSISTANT:",
-            "USER:",
-            "SYSTEM:",
-        ],
-        stream=True,
     )
-    for out in output:
-        stream = copy.deepcopy(out)
-        temp += stream["choices"][0]["text"]
-        yield temp
 
 
 demo = gr.ChatInterface(
-    generate_text,
-    title="llama-cpp-python on GPU",
-    description="Running LLM with https://github.com/abetlen/llama-cpp-python",
-    examples=[
-        ['How to setup a human base on Mars? Give short answer.'],
-        ['Explain theory of relativity to me like I’m 8 years old.'],
-        ['What is 9,000 * 9,000?'],
-        ['Write a pun-filled happy birthday message to my friend Alex.'],
-        ['Justify why a penguin might make a good king of the jungle.']
-    ],
-    cache_examples=False,
-    retry_btn=None,
-    undo_btn="Delete Previous",
-    clear_btn="Clear",
     additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
             value=0.95,
             step=0.05,
-            label="Top-p (nucleus sampling)",
         ),
     ],
 )
 
-
 if __name__ == "__main__":
-    demo.launch()
-
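
Note: the removed generate_text built a Llama-2-style prompt string by hand. As a reference for what that code produced: with the app's default system message ("You are a friendly Chatbot."), one prior exchange, and one of the bundled example questions as the new message (the assistant reply below is illustrative, not from the commit), input_prompt would end up as:

[INST] <<SYS>>
You are a friendly Chatbot.
<</SYS>>

 Hi! [/INST] Hello, how can I help? </s><s> [INST] What is 9,000 * 9,000? [/INST] 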
 
+import spaces
+import json
+import subprocess
 from llama_cpp import Llama
+from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+from llama_cpp_agent.providers import LlamaCppPythonProvider
+from llama_cpp_agent.chat_history import BasicChatHistory
+from llama_cpp_agent.chat_history.messages import Roles
+import gradio as gr
+from huggingface_hub import hf_hub_download
 
+llm = None
+llm_model = None
 
+# Download the new model
+hf_hub_download(
+    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
+    filename="llama-3.2-1b-instruct-q4_k_m.gguf",
+    local_dir="./models"
+)
 
+def get_messages_formatter_type(model_name):
+    if "Llama" in model_name:
+        return MessagesFormatterType.LLAMA_3
+    else:
+        raise ValueError(f"Unsupported model: {model_name}")
 
+@spaces.GPU
+def respond(
     message,
     history: list[tuple[str, str]],
+    model,
     system_message,
     max_tokens,
     temperature,
     top_p,
+    top_k,
+    repeat_penalty,
 ):
+    global llm
+    global llm_model
+
+    chat_template = get_messages_formatter_type(model)
+
+    if llm is None or llm_model != model:
+        llm = Llama(
+            model_path=f"models/{model}",
+            flash_attn=True,
+            n_gpu_layers=81,
+            n_batch=1024,
+            n_ctx=8192,
+        )
+        llm_model = model
+
+    provider = LlamaCppPythonProvider(llm)
+
+    agent = LlamaCppAgent(
+        provider,
+        system_prompt=f"{system_message}",
+        predefined_messages_formatter_type=chat_template,
+        debug_output=True
+    )
+
+    settings = provider.get_provider_default_settings()
+    settings.temperature = temperature
+    settings.top_k = top_k
+    settings.top_p = top_p
+    settings.max_tokens = max_tokens
+    settings.repeat_penalty = repeat_penalty
+    settings.stream = True
 
+    messages = BasicChatHistory()
 
+    for msn in history:
+        user = {
+            'role': Roles.user,
+            'content': msn[0]
+        }
+        assistant = {
+            'role': Roles.assistant,
+            'content': msn[1]
+        }
+        messages.add_message(user)
+        messages.add_message(assistant)
+
+    stream = agent.get_chat_response(
+        message,
+        llm_sampling_settings=settings,
+        chat_history=messages,
+        returns_streaming_generator=True,
+        print_output=False
     )
+
+    outputs = ""
+    for output in stream:
+        outputs += output
+        yield outputs
+
+description = """<p><center>
+<a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
+
+Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.
 
+</center></p>
+"""
 
 demo = gr.ChatInterface(
+    respond,
     additional_inputs=[
+        gr.Dropdown([
+                "llama-3.2-1b-instruct-q4_k_m.gguf"
+            ],
+            value="llama-3.2-1b-instruct-q4_k_m.gguf",
+            label="Model"
+        ),
+        gr.Textbox(value="You are a world-class AI system, capable of complex reasoning and reflection. Reason through the query inside <thinking> tags, and then provide your final response inside <output> tags. If you detect that you made a mistake in your reasoning at any point, correct yourself inside <reflection> tags.", label="System message"),
+        gr.Slider(minimum=1, maximum=8192, value=2048, step=1, label="Max tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
             value=0.95,
             step=0.05,
+            label="Top-p",
+        ),
+        gr.Slider(
+            minimum=0,
+            maximum=100,
+            value=40,
+            step=1,
+            label="Top-k",
+        ),
+        gr.Slider(
+            minimum=0.0,
+            maximum=2.0,
+            value=1.1,
+            step=0.1,
+            label="Repetition penalty",
         ),
     ],
+    theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet", neutral_hue="gray", font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]).set(
+        body_background_fill_dark="#16141c",
+        block_background_fill_dark="#16141c",
+        block_border_width="1px",
+        block_title_background_fill_dark="#1e1c26",
+        input_background_fill_dark="#292733",
+        button_secondary_background_fill_dark="#24212b",
+        border_color_accent_dark="#343140",
+        border_color_primary_dark="#343140",
+        background_fill_secondary_dark="#16141c",
+        color_accent_soft_dark="transparent",
+        code_background_fill_dark="#292733",
+    ),
+    retry_btn="Retry",
+    undo_btn="Undo",
+    clear_btn="Clear",
+    submit_btn="Send",
+    title="Meta Llama 3.2 (1B)",
+    description=description,
+    chatbot=gr.Chatbot(
+        scale=1,
+        likeable=False,
+        show_copy_button=True
+    )
 )
 
 if __name__ == "__main__":
+    demo.launch()
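
Note: the rewritten app delegates prompt formatting and history management to llama-cpp-agent instead of concatenating strings. A minimal sketch of the same flow outside Gradio, assuming the GGUF file has already been downloaded to ./models as in the commit and that a CPU-only run is acceptable; the prompt and sampling values are illustrative:

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider

# Load the quantized model; n_gpu_layers=0 keeps inference on the CPU.
llm = Llama(
    model_path="models/llama-3.2-1b-instruct-q4_k_m.gguf",
    n_ctx=8192,
    n_gpu_layers=0,
)
provider = LlamaCppPythonProvider(llm)
agent = LlamaCppAgent(
    provider,
    system_prompt="You are a friendly Chatbot.",
    predefined_messages_formatter_type=MessagesFormatterType.LLAMA_3,
)

# The same settings object respond() configures, with fixed values.
settings = provider.get_provider_default_settings()
settings.temperature = 0.7
settings.max_tokens = 512
settings.stream = True

# Stream the reply chunk by chunk, as the Gradio handler does.
for chunk in agent.get_chat_response(
    "What is 9,000 * 9,000?",
    llm_sampling_settings=settings,
    returns_streaming_generator=True,
    print_output=False,
):
    print(chunk, end="", flush=True)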