Text Generation
PEFT
Safetensors
llama-2
Eval Results
dfurman committed
Commit e88b0d5
1 Parent(s): ec06663

Update README.md

Files changed (1)
  1. README.md +33 -77
README.md CHANGED
@@ -154,29 +154,26 @@ While great efforts have been taken to clean the pretraining data, it is possibl
 
 Basic usage: [notebook](assets/basic_inference_llama_2_dolphin.ipynb)
 
-Install and import the package dependencies:
-
 ```python
 !pip install -q -U huggingface_hub peft transformers torch accelerate
 ```
 
 ```python
+from huggingface_hub import notebook_login
 import torch
 from peft import PeftModel, PeftConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-```
-
-Sign into a HF account with access to Llama-2:
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    pipeline,
+)
 
-```python
-from huggingface_hub import notebook_login
 notebook_login()
 ```
 
-Basic model loading:
-
 ```python
-peft_model_id = "dfurman/llama-2-70b-dolphin-peft"
+peft_model_id = "dfurman/llama-2-13b-dolphin-peft"
 config = PeftConfig.from_pretrained(peft_model_id)
 
 bnb_config = BitsAndBytesConfig(
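Note: the first hunk ends inside the `BitsAndBytesConfig(...)` call, so the quantization settings themselves are not visible here. Purely for orientation, a typical 4-bit NF4 setup for a model of this size might look like the sketch below; the field values are assumptions, not taken from this commit.

```python
import torch
from transformers import BitsAndBytesConfig

# Illustrative sketch only: the actual values in this README are not shown in the hunk above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # quantize the base weights to 4-bit
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # run compute in bf16
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
)
```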
@@ -189,83 +186,42 @@ model = AutoModelForCausalLM.from_pretrained(
     config.base_model_name_or_path,
     quantization_config=bnb_config,
     use_auth_token=True,
-    torch_dtype=torch.bfloat16,
     device_map="auto",
 )
-tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
+
+tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, use_fast=True)
 tokenizer.pad_token = tokenizer.eos_token
 
-# Load the Lora model
 model = PeftModel.from_pretrained(model, peft_model_id)
-```
 
-Once loaded, the model and tokenizer can be used with the following code:
-
-```python
-def llama_generate(
-    model: AutoModelForCausalLM,
-    tokenizer: AutoTokenizer,
-    prompt: str,
-    max_new_tokens: int = 128,
-    temperature: float = 0.92,
-) -> str:
-    """
-    Initialize the pipeline
-    Uses Hugging Face GenerationConfig defaults
-    https://huggingface.co/docs/transformers/v4.29.1/en/main_classes/text_generation#transformers.GenerationConfig
-    Args:
-        model (transformers.AutoModelForCausalLM): Falcon model for text generation
-        tokenizer (transformers.AutoTokenizer): Tokenizer for model
-        prompt (str): Prompt for text generation
-        max_new_tokens (int, optional): Max new tokens after the prompt to generate. Defaults to 128.
-        temperature (float, optional): The value used to modulate the next token probabilities.
-            Defaults to 1.0
-    """
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    inputs = tokenizer(
-        [prompt],
-        return_tensors="pt",
-        return_token_type_ids=False,
-    ).to(
-        device
-    )  # tokenize inputs, load on device
-
-    # when running Torch modules in lower precision, it is best practice to use the torch.autocast context manager.
-    with torch.autocast("cuda", dtype=torch.bfloat16):
-        response = model.generate(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature,
-            return_dict_in_generate=True,
-            eos_token_id=tokenizer.eos_token_id,
-            pad_token_id=tokenizer.pad_token_id,
-        )
-
-    decoded_output = tokenizer.decode(
-        response["sequences"][0],
-        skip_special_tokens=True,
-    )  # grab output in natural language
-
-    return decoded_output[len(prompt) :]  # remove prompt from output
+
+format_template = "You are a helpful assistant. {query}\n"
 ```
 
-We can now generate text! For example:
-
 ```python
-prompt = "You are a helpful assistant. Tell me a recipe for vegan banana bread.\n"
-
-response = llama_generate(
-    model,
-    tokenizer,
-    prompt,
-    max_new_tokens=500,
-    temperature=0.92,
-)
-
-print(response)
+# First, format the prompt
+query = "Tell me a recipe for vegan banana bread."
+prompt = format_template.format(query=query)
+
+# Inference can be done using model.generate
+print("\n\n*** Generate:")
+
+input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
+with torch.autocast("cuda", dtype=torch.bfloat16):
+    output = model.generate(
+        input_ids=input_ids,
+        max_new_tokens=512,
+        do_sample=True,
+        temperature=0.7,
+        return_dict_in_generate=True,
+        eos_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.pad_token_id,
+        repetition_penalty=1.2,
+    )
+
+print(tokenizer.decode(output["sequences"][0], skip_special_tokens=True))
 ```
 
+
 ### Runtime tests
 
 | runtime / 50 tokens (sec) | GPU | attn | torch dtype | VRAM (GB) |
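Note: the removed `llama_generate` helper stripped the prompt from the decoded text (`decoded_output[len(prompt) :]`), whereas the new inline snippet prints the full sequence, prompt included. If only the completion is wanted, a small post-processing step along these lines restores the old behavior; the variable names (`output`, `input_ids`, `tokenizer`) are assumed from the new snippet, not part of this commit.

```python
# Sketch: decode only the newly generated tokens, assuming `output`, `input_ids`,
# and `tokenizer` are defined as in the updated README snippet above.
generated_ids = output["sequences"][0][input_ids.shape[-1]:]  # drop the prompt tokens
completion = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(completion)
```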
 
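Note: the updated import block adds `pipeline`, but none of the hunks shown here use it; presumably a later section of the README or the linked notebook does. A minimal sketch of wrapping the loaded model in a text-generation pipeline follows; the generation settings are assumptions, and whether a `PeftModel` can be passed to `pipeline` directly depends on the installed `transformers`/`peft` versions.

```python
from transformers import pipeline

# Sketch only: reuse the model and tokenizer loaded above in a text-generation pipeline.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe(
    prompt,
    max_new_tokens=256,      # assumed value
    do_sample=True,
    temperature=0.7,         # assumed value
    repetition_penalty=1.2,  # assumed value
)
print(result[0]["generated_text"])
```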