Ritesh-hf committed
Commit 294a7fa
1 Parent(s): ab310ba

revert previous changes
Files changed (2)
  1. app.py +132 -36
  2. test.py +2 -2
app.py CHANGED
@@ -1,42 +1,138 @@
 import gradio as gr
 import spaces
-from transformers import AutoTokenizer, AutoModel
 import torch
 
-# Load the model and tokenizer
-model_name = "Alibaba-NLP/gte-large-en-v1.5"  # Adjust the model identifier if necessary
-tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
-
-# Move model to GPU if available
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-
-@spaces.GPU(duration=1)
-def generate_embeddings(text):
-    # Tokenize input text
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
-
-    # Move inputs to GPU if available
-    inputs = {key: value.to(device) for key, value in inputs.items()}
-
-    # Get model outputs
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    # Extract embeddings (using the mean of the last hidden state as a simple approach)
-    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().squeeze().tolist()
-
-    return embeddings
-
-# Define the Gradio interface
-interface = gr.Interface(
-    fn=generate_embeddings,
-    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
-    outputs=gr.JSON(label="Text Embeddings"),
-    title="Text Embeddings Generator",
-    description="Generate text embeddings using the Alibaba-NLP/gte-large-en-v1.5 model."
 )
 
-if __name__ == "__main__":
-    interface.launch()
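The removed `generate_embeddings` pools with an unweighted mean over `last_hidden_state`, which averages padding positions too once inputs are batched. A minimal sketch of mask-aware mean pooling for comparison; the helper below is illustrative, not part of the commit:

```python
import torch

def masked_mean_pool(last_hidden_state: torch.Tensor,
                     attention_mask: torch.Tensor) -> torch.Tensor:
    # Zero out padding positions before averaging so pad tokens
    # do not dilute the sentence embedding.
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts
```

For a single unpadded input the two pooling strategies coincide, so the simpler mean is harmless in the one-text-at-a-time Gradio flow above. The replacement below drops this embeddings demo in favor of a hybrid-search RAG chatbot.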
+import os
+from dotenv import load_dotenv
+load_dotenv(".env")
+
+os.environ['USER_AGENT'] = os.getenv("USER_AGENT")
+os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
+os.environ["TOKENIZERS_PARALLELISM"] = 'true'
+
+import nltk
+nltk.download('punkt_tab')
+
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_community.document_loaders import WebBaseLoader
+from langchain_core.chat_history import BaseChatMessageHistory
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.runnables.history import RunnableWithMessageHistory
+
+from pinecone import Pinecone
+from pinecone_text.sparse import BM25Encoder
+
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.retrievers import PineconeHybridSearchRetriever
+
+from langchain_groq import ChatGroq
+
 import gradio as gr
 import spaces
 import torch
 
+
+try:
+    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+    index_name = "traveler-demo-website-vectorstore"
+    # connect to index
+    pinecone_index = pc.Index(index_name)
+except:
+    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+    index_name = "traveler-demo-website-vectorstore"
+    # connect to index
+    pinecone_index = pc.Index(index_name)
+
+bm25 = BM25Encoder().load("./bm25_traveler_website.json")
+
+embed_model = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-large-en-v1.5", model_kwargs={"trust_remote_code": True, "device": "cuda"})
+
+retriever = PineconeHybridSearchRetriever(
+    embeddings=embed_model,
+    sparse_encoder=bm25,
+    index=pinecone_index,
+    top_k=20,
+    alpha=0.5,
+)
+
+llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.1, max_tokens=1024, max_retries=2)
+
+### Contextualize question ###
+contextualize_q_system_prompt = """Given a chat history and the latest user question \
+which might reference context in the chat history, formulate a standalone question \
+which can be understood without the chat history. Do NOT answer the question, \
+just reformulate it if needed and otherwise return it as is.
+"""
+contextualize_q_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", contextualize_q_system_prompt),
+        MessagesPlaceholder("chat_history"),
+        ("human", "{input}")
+    ]
+)
+
+history_aware_retriever = create_history_aware_retriever(
+    llm, retriever, contextualize_q_prompt
 )
 
+
+qa_system_prompt = """You are a highly skilled information retrieval assistant. Use the following pieces of retrieved context to answer the question. \
+Provide links to sources provided in the answer. \
+If you don't know the answer, just say that you don't know. \
+Do not give extra long answers. \
+When responding to queries, your responses should be comprehensive and well-organized. For each response: \
+1. Provide Clear Answers \
+2. Include Detailed References: \
+  - Include links to sources and any links or sites that are mentioned in the answer. \
+  - Links to Sources: Provide URLs to credible sources where users can verify the information or explore further. \
+  - Downloadable Materials: Include links to any relevant downloadable resources if applicable. \
+  - Reference Sites: Mention specific websites or platforms that offer additional information. \
+3. Formatting for Readability: \
+  - Bullet Points or Lists: Where applicable, use bullet points or numbered lists to present information clearly. \
+  - Emphasize Important Information: Use bold or italics to highlight key details. \
+4. Organize Content Logically \
+Do not include anything about context in the answer. \
+{context}
+"""
+qa_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", qa_system_prompt),
+        MessagesPlaceholder("chat_history"),
+        ("human", "{input}")
+    ]
+)
+question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
+
+rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
+
+### Statefully manage chat history ###
+store = {}
+
+def get_session_history(session_id: str) -> BaseChatMessageHistory:
+    if session_id not in store:
+        store[session_id] = ChatMessageHistory()
+    return store[session_id]
+
+
+conversational_rag_chain = RunnableWithMessageHistory(
+    rag_chain,
+    get_session_history,
+    input_messages_key="input",
+    history_messages_key="chat_history",
+    output_messages_key="answer",
+)
+
+@spaces.GPU(duration=8)
+def handle_message(question, history={}):
+    response = ''
+    chain = conversational_rag_chain.pick("answer")
+    for chunk in chain.stream(
+        {"input": question},
+        config={
+            "configurable": {"session_id": "abc123"}
+        },
+    ):
+        response += chunk
+        yield response
+
+if __name__ == '__main__':
+    demo = gr.ChatInterface(fn=handle_message)
+    demo.launch()
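The new `app.py` keeps one `ChatMessageHistory` per session in a module-level dict and narrows the streamed chain output to the `answer` key via `.pick("answer")`. A minimal sketch of the same wiring that runs without Pinecone or Groq credentials; `fake_rag` is a stand-in for the real retrieval chain and is purely illustrative:

```python
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables.history import RunnableWithMessageHistory

store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    # One in-memory history per session id, as in the committed code.
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

def fake_rag(inputs: dict) -> dict:
    # Stand-in for rag_chain: echoes the question plus how many prior
    # messages the history wrapper injected under "chat_history".
    n = len(inputs["chat_history"])
    return {"answer": f"[{n} prior messages] You asked: {inputs['input']}"}

chain = RunnableWithMessageHistory(
    RunnableLambda(fake_rag),
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

# .pick("answer") yields just the answer text, as in handle_message.
for chunk in chain.pick("answer").stream(
    {"input": "hello"},
    config={"configurable": {"session_id": "abc123"}},
):
    print(chunk)
```

With this stub, `stream()` should yield a single chunk; the real Groq-backed chain streams token by token. Note that `handle_message` hard-codes `session_id="abc123"`, so all ChatInterface visitors share one history.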
test.py CHANGED
@@ -8,8 +8,8 @@ while True:
     question = input("Question: ")
     start_time = timeit.default_timer()
     result = client.predict(
-        question=question,
-        api_name="/chat"
+        text=question,
+        api_name="/predict"
     )
     end_time = timeit.default_timer()
     print(result)
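`test.py` exercises the Space through `gradio_client`; the hunk shows only the loop body, so the `Client` construction is implied. A sketch of the full harness under that assumption; the Space identifier and the final timing print are placeholders, not from the commit:

```python
import timeit
from gradio_client import Client

client = Client("user/space-name")  # placeholder Space id

while True:
    question = input("Question: ")
    start_time = timeit.default_timer()
    result = client.predict(
        text=question,
        api_name="/predict"
    )
    end_time = timeit.default_timer()
    print(result)
    print(f"elapsed: {end_time - start_time:.2f}s")  # assumed use of end_time
```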