ka1kuk committed
Commit 66a1c16 • 1 Parent(s): 5484efc

Update networks/message_streamer.py

Files changed (1)
  1. networks/message_streamer.py +18 -3
networks/message_streamer.py CHANGED
@@ -11,6 +11,8 @@ class MessageStreamer:
     MODEL_MAP = {
         "mixtral-8x7b": "mistralai/Mixtral-8x7B-Instruct-v0.1", # 72.62, fast [Recommended]
         "mistral-7b": "mistralai/Mistral-7B-Instruct-v0.2", # 65.71, fast
+        "nous-mixtral-8x7b": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+        "gemma-7b": "google/gemma-7b-it",
         # "openchat-3.5": "openchat/openchat-3.5-1210", # 68.89, fast
         # "zephyr-7b-beta": "HuggingFaceH4/zephyr-7b-beta", # ❌ Too Slow
         # "llama-70b": "meta-llama/Llama-2-70b-chat-hf", # ❌ Require Pro User
@@ -21,12 +23,16 @@ class MessageStreamer:
     STOP_SEQUENCES_MAP = {
         "mixtral-8x7b": "</s>",
         "mistral-7b": "</s>",
+        "nous-mixtral-8x7b": "<|im_end|>",
         "openchat-3.5": "<|end_of_turn|>",
+        "gemma-7b": "<eos>",
     }
     TOKEN_LIMIT_MAP = {
         "mixtral-8x7b": 32768,
         "mistral-7b": 32768,
+        "nous-mixtral-8x7b": 32768,
         "openchat-3.5": 8192,
+        "gemma-7b": 8192,
     }
     TOKEN_RESERVED = 100
 
@@ -58,9 +64,11 @@ class MessageStreamer:
     def chat_response(
         self,
         prompt: str = None,
-        temperature: float = 0,
+        temperature: float = 0.5,
+        top_p: float = 0.95,
         max_new_tokens: int = None,
         api_key: str = None,
+        use_cache: bool = False,
     ):
         # https://huggingface.co/docs/api-inference/detailed_parameters?code=curl
         # curl --proxy http://<server>:<port> https://api-inference.huggingface.co/models/<org>/<model_name> -X POST -d '{"inputs":"who are you?","parameters":{"max_new_token":64}}' -H 'Content-Type: application/json' -H 'Authorization: Bearer <HF_TOKEN>'
@@ -79,9 +87,11 @@ class MessageStreamer:
 
         if temperature is None or temperature < 0:
             temperature = 0.0
-        # temperature must be positive and <= 1 for HF LLM models
+        # temperature must be in (0, 1) for HF LLM models
         temperature = max(temperature, 0.01)
-        temperature = min(temperature, 1)
+        temperature = min(temperature, 0.99)
+        top_p = max(top_p, 0.01)
+        top_p = min(top_p, 0.99)
 
         token_limit = int(
             self.TOKEN_LIMIT_MAP[self.model]
@@ -102,13 +112,18 @@ class MessageStreamer:
         # huggingface_hub/inference/_text_generation.py:
         # class TextGenerationRequest > param `stream`
         # https://huggingface.co/docs/text-generation-inference/conceptual/streaming#streaming-with-curl
+        # https://huggingface.co/docs/api-inference/detailed_parameters#text-generation-task
         self.request_body = {
             "inputs": prompt,
             "parameters": {
                 "temperature": temperature,
+                "top_p": top_p,
                 "max_new_tokens": max_new_tokens,
                 "return_full_text": False,
            },
+            "options": {
+                "use_cache": use_cache,
+            },
             "stream": True,
         }
 
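For reference, below is a minimal sketch of how a request body shaped like the one this commit builds (temperature, top_p, max_new_tokens, return_full_text, an options.use_cache block, and stream=True) could be posted to the Inference API endpoint named in the curl comment and consumed as a stream. It assumes the requests library and the "data: {...}" server-sent-event format described in the TGI streaming docs linked in the diff; the model id, the hf_xxx token placeholder, and the stream_chat helper are illustrative and not part of this repository.

import json
import requests

# Endpoint pattern taken from the curl comment in chat_response (assumption: no proxy).
API_BASE = "https://api-inference.huggingface.co/models"
MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # value of MODEL_MAP["mixtral-8x7b"]


def stream_chat(prompt, api_key, temperature=0.5, top_p=0.95,
                max_new_tokens=256, use_cache=False):
    # Mirror the clamping added in this commit: keep temperature and top_p in (0, 1).
    temperature = min(max(temperature, 0.01), 0.99)
    top_p = min(max(top_p, 0.01), 0.99)
    request_body = {
        "inputs": prompt,
        "parameters": {
            "temperature": temperature,
            "top_p": top_p,
            "max_new_tokens": max_new_tokens,
            "return_full_text": False,
        },
        "options": {
            "use_cache": use_cache,
        },
        "stream": True,
    }
    headers = {"Authorization": f"Bearer {api_key}"}
    with requests.post(
        f"{API_BASE}/{MODEL_ID}",
        json=request_body,
        headers=headers,
        stream=True,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            # Streamed responses arrive as server-sent events: b'data: {...}' lines,
            # per the TGI streaming docs linked in the diff.
            if not line or not line.startswith(b"data:"):
                continue
            payload = json.loads(line[len(b"data:"):])
            yield payload.get("token", {}).get("text", "")


# Example usage (hf_xxx is a placeholder, not a real credential):
# for chunk in stream_chat("who are you?", api_key="hf_xxx"):
#     print(chunk, end="", flush=True)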