Commit ac39384 by dfurman (1 parent: 0982fc8)

Upload basic_inference_llama_2_70b_dolphin.ipynb

assets/basic_inference_llama_2_70b_dolphin.ipynb ADDED
@@ -0,0 +1,360 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "6f46e840-8a7f-4be2-a082-49b9ebf5a8c5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install -q -U huggingface_hub peft transformers torch accelerate\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "2d2918a1-d701-4a66-946c-6f668cb4ac1e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Mon Jul 24 21:41:13 2023 \n",
+ "+-----------------------------------------------------------------------------+\n",
+ "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n",
+ "|-------------------------------+----------------------+----------------------+\n",
+ "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
+ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
+ "| | | MIG M. |\n",
+ "|===============================+======================+======================|\n",
+ "| 0 NVIDIA H100 PCIe On | 00000000:06:00.0 Off | 0 |\n",
+ "| N/A 39C P0 52W / 350W | 0MiB / 81559MiB | 0% Default |\n",
+ "| | | Disabled |\n",
+ "+-------------------------------+----------------------+----------------------+\n",
+ " \n",
+ "+-----------------------------------------------------------------------------+\n",
+ "| Processes: |\n",
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
+ "| ID ID Usage |\n",
+ "|=============================================================================|\n",
+ "| No running processes found |\n",
+ "+-----------------------------------------------------------------------------+\n"
+ ]
+ }
+ ],
+ "source": [
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "0afdf8a6-ea7d-44ab-a1f9-a19e550e9dbd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/ubuntu/.local/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).\n",
+ " from pandas.core.computation.check import NUMEXPR_INSTALLED\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "from peft import PeftModel, PeftConfig\n",
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "adfcd11e-8d98-4cf3-abf4-e9fa933eb0d6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7dc80313fdcd41a5a7ee168956df3dd9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from huggingface_hub import notebook_login\n",
+ "\n",
+ "notebook_login()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "82cfa4fb-af16-4927-82c4-1fbf0fa84bfa",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/ubuntu/.local/lib/python3.8/site-packages/transformers/modeling_utils.py:2193: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d0f18088e32f4d4b857d2de5430528d4",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/15 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# peft_model_id = \"results/checkpoint-12500\"\n",
+ "peft_model_id = \"dfurman/llama-2-70b-dolphin-peft\"\n",
+ "config = PeftConfig.from_pretrained(peft_model_id)\n",
+ "\n",
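+ "# quantize the base model to 4-bit NF4, computing matmuls in bfloat16\n",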
+ "bnb_config = BitsAndBytesConfig(\n",
+ " load_in_4bit=True,\n",
+ " bnb_4bit_quant_type=\"nf4\",\n",
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
+ ")\n",
+ "\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " config.base_model_name_or_path,\n",
+ " quantization_config=bnb_config,\n",
+ " use_auth_token=True,\n",
+ " torch_dtype=torch.bfloat16,\n",
+ " device_map=\"auto\",\n",
+ ")\n",
+ "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
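+ "# Llama 2 has no pad token by default; reuse the EOS token for padding\n",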
+ "tokenizer.pad_token = tokenizer.eos_token\n",
+ "\n",
+ "# Load the LoRA adapter on top of the quantized base model\n",
+ "model = PeftModel.from_pretrained(model, peft_model_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "d86f6a79-95f2-4e05-9bc7-3cbcbbbc9552",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# text generation function\n",
+ "\n",
+ "\n",
+ "def llama_generate(\n",
+ " model: AutoModelForCausalLM,\n",
+ " tokenizer: AutoTokenizer,\n",
+ " prompt: str,\n",
+ " max_new_tokens: int = 128,\n",
+ " temperature: float = 1.0,\n",
+ ") -> str:\n",
+ " \"\"\"\n",
+ " Generate a response to a prompt with the loaded model and tokenizer.\n",
+ " Uses Hugging Face GenerationConfig defaults:\n",
+ " https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig\n",
+ " Args:\n",
+ " model (transformers.AutoModelForCausalLM): Llama 2 model for text generation\n",
+ " tokenizer (transformers.AutoTokenizer): Tokenizer for model\n",
+ " prompt (str): Prompt for text generation\n",
+ " max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.\n",
+ " temperature (float, optional): The value used to modulate the next token probabilities.\n",
+ " Defaults to 1.0.\n",
+ " \"\"\"\n",
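+ " # run on the GPU when one is available\n",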
+ " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "\n",
+ " inputs = tokenizer(\n",
+ " [prompt],\n",
+ " return_tensors=\"pt\",\n",
+ " return_token_type_ids=False,\n",
+ " ).to(\n",
+ " device\n",
+ " ) # tokenize inputs, load on device\n",
+ "\n",
+ " # when running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.\n",
+ " with torch.autocast(\"cuda\", dtype=torch.bfloat16):\n",
+ " response = model.generate(\n",
+ " **inputs,\n",
+ " max_new_tokens=max_new_tokens,\n",
+ " do_sample=True, # enable sampling so that temperature takes effect\n",
+ " temperature=temperature,\n",
+ " return_dict_in_generate=True,\n",
+ " eos_token_id=tokenizer.eos_token_id,\n",
+ " pad_token_id=tokenizer.pad_token_id,\n",
+ " )\n",
+ "\n",
+ " decoded_output = tokenizer.decode(\n",
+ " response[\"sequences\"][0],\n",
+ " skip_special_tokens=True,\n",
+ " ) # grab output in natural language\n",
+ "\n",
+ " return decoded_output[len(prompt) :] # remove prompt from output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "28be263a-dd15-419f-a67e-7ca05b27435f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sure! Here's a delicious and easy vegan banana bread recipe:\n",
+ "\n",
+ "Ingredients:\n",
+ "- 2 cups all-purpose flour\n",
+ "- 1/2 cup sugar\n",
+ "- 1/2 cup vegan butter (such as Earth Balance)\n",
+ "- 1/2 cup vegan milk (such as almond milk)\n",
+ "- 1/2 cup unsweetened applesauce\n",
+ "- 1/2 cup mashed ripe bananas (about 2 medium bananas)\n",
+ "- 1 teaspoon baking soda\n",
+ "- 1/2 teaspoon salt\n",
+ "- 1/2 teaspoon ground cinnamon\n",
+ "- 1/2 teaspoon ground nutmeg\n",
+ "- 1/2 teaspoon ground cloves\n",
+ "- 1/2 cup chopped walnuts (optional)\n",
+ "\n",
+ "Instructions:\n",
+ "1. Preheat the oven to 350°F (175°C). Grease a 9x5-inch loaf pan with vegan butter or cooking spray.\n",
+ "2. In a large bowl, mix together the flour, sugar, vegan butter, vegan milk, applesauce, bananas, baking soda, salt, cinnamon, nutmeg, and cloves. Stir until well combined.\n",
+ "3. Fold in the chopped walnuts, if using.\n",
+ "4. Pour the batter into the prepared loaf pan.\n",
+ "5. Bake for 50-60 minutes, or until a toothpick inserted into the center of the bread comes out clean.\n",
+ "6. Let the bread cool in the pan for 10 minutes before transferring it to a wire rack to cool completely.\n",
+ "7. Slice and enjoy!\n",
+ "\n",
+ "Note: You can also add chocolate chips, dried fruit, or other mix-ins to the batter for extra flavor and texture. Enjoy your vegan banana bread!\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "prompt = \"You are a helpful assistant. Tell me a recipe for vegan banana bread.\\n\"\n",
+ "\n",
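+ "# temperature below 1.0 slightly sharpens the sampled next-token distribution\n",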
+ "response = llama_generate(\n",
+ " model,\n",
+ " tokenizer,\n",
+ " prompt,\n",
+ " max_new_tokens=500,\n",
+ " temperature=0.92,\n",
+ ")\n",
+ "\n",
+ "print(response)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3625b3ff-6467-43ea-8557-9541934539ec",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }