Commit ac39384 by dfurman (1 parent: 0982fc8)

Upload basic_inference_llama_2_70b_dolphin.ipynb

assets/basic_inference_llama_2_70b_dolphin.ipynb ADDED
@@ -0,0 +1,360 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "6f46e840-8a7f-4be2-a082-49b9ebf5a8c5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install -q -U huggingface_hub peft transformers torch accelerate\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "2d2918a1-d701-4a66-946c-6f668cb4ac1e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Mon Jul 24 21:41:13 2023 \n",
+ "+-----------------------------------------------------------------------------+\n",
+ "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n",
+ "|-------------------------------+----------------------+----------------------+\n",
+ "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
+ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
+ "| | | MIG M. |\n",
+ "|===============================+======================+======================|\n",
+ "| 0 NVIDIA H100 PCIe On | 00000000:06:00.0 Off | 0 |\n",
+ "| N/A 39C P0 52W / 350W | 0MiB / 81559MiB | 0% Default |\n",
+ "| | | Disabled |\n",
+ "+-------------------------------+----------------------+----------------------+\n",
+ " \n",
+ "+-----------------------------------------------------------------------------+\n",
+ "| Processes: |\n",
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
+ "| ID ID Usage |\n",
+ "|=============================================================================|\n",
+ "| No running processes found |\n",
+ "+-----------------------------------------------------------------------------+\n"
+ ]
+ }
+ ],
+ "source": [
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "0afdf8a6-ea7d-44ab-a1f9-a19e550e9dbd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/ubuntu/.local/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).\n",
+ " from pandas.core.computation.check import NUMEXPR_INSTALLED\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "from peft import PeftModel, PeftConfig\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "adfcd11e-8d98-4cf3-abf4-e9fa933eb0d6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7dc80313fdcd41a5a7ee168956df3dd9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from huggingface_hub import notebook_login\n",
+ "\n",
+ "notebook_login()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "82cfa4fb-af16-4927-82c4-1fbf0fa84bfa",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/ubuntu/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:2193: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d0f18088e32f4d4b857d2de5430528d4",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/15 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# peft_model_id = \"results/checkpoint-12500\"\n",
+ "peft_model_id = \"dfurman/llama-2-70b-dolphin-peft\"\n",
+ "config = PeftConfig.from_pretrained(peft_model_id)\n",
+ "\n",
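+ "# quantize the base model to 4-bit NF4, computing matmuls in bfloat16\n",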
+ "bnb_config = BitsAndBytesConfig(\n",
+ " load_in_4bit=True,\n",
+ " bnb_4bit_quant_type=\"nf4\",\n",
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
+ ")\n",
+ "\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " config.base_model_name_or_path,\n",
+ " quantization_config=bnb_config,\n",
+ " use_auth_token=True,\n",
+ " torch_dtype=torch.bfloat16,\n",
+ " device_map=\"auto\",\n",
+ ")\n",
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
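+ "# Llama 2 has no pad token by default; reuse the EOS token for padding\n",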
+ "tokenizer.pad_token = tokenizer.eos_token\n",
+ "\n",
+ "# Load the LoRA adapter on top of the quantized base model\n",
+ "model = PeftModel.from_pretrained(model, peft_model_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "d86f6a79-95f2-4e05-9bc7-3cbcbbbc9552",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# text generation function\n",
+ "\n",
+ "\n",
+ "def llama_generate(\n",
+ " model: AutoModelForCausalLM,\n",
+ " tokenizer: AutoTokenizer,\n",
+ " prompt: str,\n",
+ " max_new_tokens: int = 128,\n",
+ " temperature: float = 1.0,\n",
+ ") -> str:\n",
+ " \"\"\"\n",
+ " Generate a response to a prompt with the loaded model and tokenizer.\n",
+ " Uses Hugging Face GenerationConfig defaults:\n",
+ " https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig\n",
+ " Args:\n",
+ " model (transformers.AutoModelForCausalLM): Llama 2 model for text generation\n",
+ " tokenizer (transformers.AutoTokenizer): Tokenizer for model\n",
+ " prompt (str): Prompt for text generation\n",
+ " max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.\n",
+ " temperature (float, optional): The value used to modulate the next token probabilities.\n",
+ " Defaults to 1.0.\n",
+ " \"\"\"\n",
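+ " # run on the GPU when one is available\n",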
+ " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "\n",
+ " inputs = tokenizer(\n",
+ " [prompt],\n",
+ " return_tensors=\"pt\",\n",
+ " return_token_type_ids=False,\n",
+ " ).to(\n",
+ " device\n",
+ " ) # tokenize inputs, load on device\n",
+ "\n",
+ " # when running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.\n",
+ " with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n",
+ " response = model.generate(\n",
+ " **inputs,\n",
+ " max_new_tokens=max_new_tokens,\n",
+ " do_sample=True, # enable sampling so that temperature takes effect\n",
+ " temperature=temperature,\n",
+ " return_dict_in_generate=True,\n",
+ " eos_token_id=tokenizer.eos_token_id,\n",
+ " pad_token_id=tokenizer.pad_token_id,\n",
+ " )\n",
+ "\n",
+ " decoded_output = tokenizer.decode(\n",
+ " response[\"sequences\"][0],\n",
+ " skip_special_tokens=True,\n",
+ " ) # grab output in natural language\n",
+ "\n",
+ " return decoded_output[len(prompt) :] # remove prompt from output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "28be263a-dd15-419f-a67e-7ca05b27435f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sure! Here's a delicious and easy vegan banana bread recipe:\n",
+ "\n",
+ "Ingredients:\n",
+ "- 2 cups all-purpose flour\n",
+ "- 1/2 cup sugar\n",
+ "- 1/2 cup vegan butter (such as Earth Balance)\n",
+ "- 1/2 cup vegan milk (such as almond milk)\n",
+ "- 1/2 cup unsweetened applesauce\n",
+ "- 1/2 cup mashed ripe bananas (about 2 medium bananas)\n",
+ "- 1 teaspoon baking soda\n",
+ "- 1/2 teaspoon salt\n",
+ "- 1/2 teaspoon ground cinnamon\n",
+ "- 1/2 teaspoon ground nutmeg\n",
+ "- 1/2 teaspoon ground cloves\n",
+ "- 1/2 cup chopped walnuts (optional)\n",
+ "\n",
+ "Instructions:\n",
+ "1. Preheat the oven to 350°F (175°C). Grease a 9x5-inch loaf pan with vegan butter or cooking spray.\n",
+ "2. In a large bowl, mix together the flour, sugar, vegan butter, vegan milk, applesauce, bananas, baking soda, salt, cinnamon, nutmeg, and cloves. Stir until well combined.\n",
+ "3. Fold in the chopped walnuts, if using.\n",
+ "4. Pour the batter into the prepared loaf pan.\n",
+ "5. Bake for 50-60 minutes, or until a toothpick inserted into the center of the bread comes out clean.\n",
+ "6. Let the bread cool in the pan for 10 minutes before transferring it to a wire rack to cool completely.\n",
+ "7. Slice and enjoy!\n",
+ "\n",
+ "Note: You can also add chocolate chips, dried fruit, or other mix-ins to the batter for extra flavor and texture. Enjoy your vegan banana bread!\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "prompt = \"You are a helpful assistant. Tell me a recipe for vegan banana bread.\\n\"\n",
+ "\n",
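+ "# temperature below 1.0 slightly sharpens the sampled next-token distribution\n",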
+ "response = llama_generate(\n",
+ " model,\n",
+ " tokenizer,\n",
+ " prompt,\n",
+ " max_new_tokens=500,\n",
+ " temperature=0.92,\n",
+ ")\n",
+ "\n",
+ "print(response)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3625b3ff-6467-43ea-8557-9541934539ec",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }