ka1kuk committed
Commit 66a1c16 • 1 Parent(s): 5484efc

Update networks/message_streamer.py

Files changed (1)
  1. networks/message_streamer.py +18 -3
networks/message_streamer.py CHANGED
@@ -11,6 +11,8 @@ class MessageStreamer:
     MODEL_MAP = {
         "mixtral-8x7b": "mistralai/Mixtral-8x7B-Instruct-v0.1", # 72.62, fast [Recommended]
         "mistral-7b": "mistralai/Mistral-7B-Instruct-v0.2", # 65.71, fast
+        "nous-mixtral-8x7b": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+        "gemma-7b": "google/gemma-7b-it",
         # "openchat-3.5": "openchat/openchat-3.5-1210", # 68.89, fast
         # "zephyr-7b-beta": "HuggingFaceH4/zephyr-7b-beta", # ❌ Too Slow
         # "llama-70b": "meta-llama/Llama-2-70b-chat-hf", # ❌ Require Pro User
@@ -21,12 +23,16 @@ class MessageStreamer:
     STOP_SEQUENCES_MAP = {
         "mixtral-8x7b": "</s>",
         "mistral-7b": "</s>",
+        "nous-mixtral-8x7b": "<|im_end|>",
         "openchat-3.5": "<|end_of_turn|>",
+        "gemma-7b": "<eos>",
     }
     TOKEN_LIMIT_MAP = {
         "mixtral-8x7b": 32768,
         "mistral-7b": 32768,
+        "nous-mixtral-8x7b": 32768,
         "openchat-3.5": 8192,
+        "gemma-7b": 8192,
     }
     TOKEN_RESERVED = 100
 
@@ -58,9 +64,11 @@ class MessageStreamer:
     def chat_response(
         self,
         prompt: str = None,
-        temperature: float = 0,
+        temperature: float = 0.5,
+        top_p: float = 0.95,
         max_new_tokens: int = None,
         api_key: str = None,
+        use_cache: bool = False,
     ):
         # https://huggingface.co/docs/api-inference/detailed_parameters?code=curl
         # curl --proxy http://<server>:<port> https://api-inference.huggingface.co/models/<org>/<model_name> -X POST -d '{"inputs":"who are you?","parameters":{"max_new_token":64}}' -H 'Content-Type: application/json' -H 'Authorization: Bearer <HF_TOKEN>'
@@ -79,9 +87,11 @@ class MessageStreamer:
 
         if temperature is None or temperature < 0:
             temperature = 0.0
-        # temperature must be positive and <= 1 for HF LLM models
+        # temperature must be in (0, 1) for HF LLM models
         temperature = max(temperature, 0.01)
-        temperature = min(temperature, 1)
+        temperature = min(temperature, 0.99)
+        top_p = max(top_p, 0.01)
+        top_p = min(top_p, 0.99)
 
         token_limit = int(
             self.TOKEN_LIMIT_MAP[self.model]
@@ -102,13 +112,18 @@ class MessageStreamer:
         # huggingface_hub/inference/_text_generation.py:
         # class TextGenerationRequest > param `stream`
         # https://huggingface.co/docs/text-generation-inference/conceptual/streaming#streaming-with-curl
+        # https://huggingface.co/docs/api-inference/detailed_parameters#text-generation-task
         self.request_body = {
             "inputs": prompt,
             "parameters": {
                 "temperature": temperature,
+                "top_p": top_p,
                 "max_new_tokens": max_new_tokens,
                 "return_full_text": False,
            },
+            "options": {
+                "use_cache": use_cache,
+            },
             "stream": True,
         }
 
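For reference, below is a minimal sketch of how a request body shaped like the one this commit builds (temperature, top_p, max_new_tokens, return_full_text, an options.use_cache block, and stream=True) could be posted to the Inference API endpoint named in the curl comment and consumed as a stream. It assumes the requests library and the "data: {...}" server-sent-event format described in the TGI streaming docs linked in the diff; the model id, the hf_xxx token placeholder, and the stream_chat helper are illustrative and not part of this repository.

import json
import requests

# Endpoint pattern taken from the curl comment in chat_response (assumption: no proxy).
API_BASE = "https://api-inference.huggingface.co/models"
MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # value of MODEL_MAP["mixtral-8x7b"]


def stream_chat(prompt, api_key, temperature=0.5, top_p=0.95,
                max_new_tokens=256, use_cache=False):
    # Mirror the clamping added in this commit: keep temperature and top_p in (0, 1).
    temperature = min(max(temperature, 0.01), 0.99)
    top_p = min(max(top_p, 0.01), 0.99)
    request_body = {
        "inputs": prompt,
        "parameters": {
            "temperature": temperature,
            "top_p": top_p,
            "max_new_tokens": max_new_tokens,
            "return_full_text": False,
        },
        "options": {
            "use_cache": use_cache,
        },
        "stream": True,
    }
    headers = {"Authorization": f"Bearer {api_key}"}
    with requests.post(
        f"{API_BASE}/{MODEL_ID}",
        json=request_body,
        headers=headers,
        stream=True,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            # Streamed responses arrive as server-sent events: b'data: {...}' lines,
            # per the TGI streaming docs linked in the diff.
            if not line or not line.startswith(b"data:"):
                continue
            payload = json.loads(line[len(b"data:"):])
            yield payload.get("token", {}).get("text", "")


# Example usage (hf_xxx is a placeholder, not a real credential):
# for chunk in stream_chat("who are you?", api_key="hf_xxx"):
#     print(chunk, end="", flush=True)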