rakeshkumar1812 committed on
Commit
5103377
1 Parent(s): 9bfdccf

Upload three files for url RAG

Browse files
Files changed (3) hide show
  1. app.py +67 -0
  2. requirements.txt +20 -0
  3. utils.py +63 -0
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st

import utils

# Reference implementation this app was adapted from:
# https://github.com/serkanyasr/RAG-with-LangChain-URL-PDF/blob/main/utils.py

st.set_page_config(layout="wide")
st.markdown(
    "<h1 style='font-size:24px;'>RAG with LangChain & GenAI: Any url</h1>",
    unsafe_allow_html=True,
)

# URL text box for user input
url_input = st.text_input("Enter a URL to be queried:", "")

# Question text box for user input
user_input = st.text_input("Enter your Question below:", "")

# FIX: variable was misspelled "sumbit_btn" in the original.
submit_btn = st.button(label="Submit", key="url_btn")

if submit_btn:
    # Guard against empty inputs so we don't hand a blank URL to the loader.
    if not url_input.strip() or not user_input.strip():
        st.warning("Please provide both a URL and a question.")
    else:
        # FIX: compute the answer inside the spinner and only announce
        # success afterwards (the banner used to appear before any work).
        with st.spinner("Processing..."):
            response = utils.rag_with_url(url_input, user_input)
        st.success("Response: Answering with RAG...")
        st.markdown(response)
+
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ python-dotenv
3
+ langchain-openai
4
+ langchain-cohere
5
+ langchain-google-genai
6
+ openai
7
+ streamlit
8
+ # python-dotenv intentionally not repeated here (already listed above)
9
+ bs4
10
+ cohere
11
+ faiss-cpu
12
+ pypdf
13
+ huggingface_hub
14
+ langchain_community
15
+
16
+ unstructured
17
+ tiktoken
18
+ libmagic
19
+ python-magic
20
+ python-magic-bin
utils.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import OpenAIEmbeddings
from langchain_cohere import CohereEmbeddings
from langchain_openai import OpenAI
from langchain_community.document_loaders.web_base import WebBaseLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.vectorstores.faiss import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
import os

from dotenv import load_dotenv

# Pull API keys from a local .env file into the process environment.
load_dotenv()

GEMINI_API_KEY = os.getenv("GOOGLE_AI_API_KEY")
HF_API_KEY = os.getenv("HF_API_KEY")

# Gemini chat model used for answer generation.
llm_gemini = ChatGoogleGenerativeAI(google_api_key=GEMINI_API_KEY, model="gemini-pro")

# FIX: the embedding model id was misspelled ("all-MiniLM-16-v2"); the real
# sentence-transformers checkpoint is "all-MiniLM-L6-v2".  The wrapper's
# keyword is also `model_name`, not `model` — the original argument was
# silently ignored / rejected depending on the installed langchain version.
embeddings_hf = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_API_KEY,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# OPEN_AI_API_KEY = os.getenv("OPEN_AI_API_KEY")
# COHERE_API_KEY = os.getenv("COHERE_API_KEY")
# llm_openai = OpenAI(api_key=OPEN_AI_API_KEY, model="gpt-3.5-turbo")
# embeddings_open_ai = OpenAIEmbeddings(api_key=OPEN_AI_API_KEY) # OPEN_AI
# embeddings_cohere = CohereEmbeddings(api_key=COHERE_API_KEY,model="embed-multilingual-v3.0") # embed-english-v3.0
def ask_gemini(prompt):
    """Send *prompt* directly to the Gemini chat model and return its text reply."""
    reply = llm_gemini.invoke(prompt)
    return reply.content
33
+
34
+
35
+
def rag_with_url(target_url, prompt):
    """Answer *prompt* with retrieval-augmented generation over one web page.

    Loads the page at *target_url*, splits it into overlapping chunks,
    indexes them in an in-memory FAISS store, retrieves the chunks most
    relevant to *prompt*, and asks Gemini with the retrieved text appended
    to the question.  Returns the model's answer as a string.
    """
    # Fetch and parse the page into LangChain documents.
    documents = WebBaseLoader(target_url).load()

    # Chunk the page so each piece fits comfortably in an embedding call.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, length_function=len
    )
    chunks = splitter.split_documents(documents)

    # Embed the chunks and look up the ones closest to the question.
    store = FAISS.from_documents(chunks, embeddings_hf)
    matches = store.as_retriever().get_relevant_documents(prompt)

    # Simple prompt augmentation: question followed by the retrieved text.
    context = " ".join(doc.page_content for doc in matches)
    answer = llm_gemini.invoke(prompt + " " + context)
    return answer.content
49
+
50
+
51
+
52
+ # def rag_with_pdf(file_path, prompt):
53
+ # loader = PyPDFLoader(file_path)
54
+ # raw_document = loader.load()
55
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200, length_function = len)
56
+ # splited_document = text_splitter.split_documents(raw_document)
57
+ # vector_store = FAISS.from_documents(splited_document, embeddings_hf)
58
+ # retriever = vector_store.as_retriever()
59
+ # relevant_documents = retriever.get_relevant_documents(prompt)
60
+ # final_prompt = prompt + " " + " ".join([doc.page_content for doc in relevant_documents])
61
+ # AI_Respose = llm_gemini.invoke(final_prompt)
62
+ # return AI_Respose.content, relevant_documents
63
+