mitultiwari committed
Commit 5255e92
1 Parent(s): 09d2176
Files changed (7)
  1. Dockerfile +11 -0
  2. README.md +4 -7
  3. app.py +55 -0
  4. chainlit.md +5 -0
  5. data/test.txt +1 -0
  6. requirements.txt +108 -0
  7. src/retrieval_lib.py +105 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM python:3.9
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+ COPY --chown=user . $HOME/app
+ COPY ./requirements.txt $HOME/app/requirements.txt
+ RUN pip install -r requirements.txt
+ COPY . .
+ CMD ["chainlit", "run", "app.py", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,8 @@
  ---
- title: Legal Qna
- emoji: 👀
- colorFrom: red
- colorTo: blue
+ title: PDF RAG Demo
+ emoji: 📉
+ colorFrom: pink
+ colorTo: yellow
  sdk: docker
  pinned: false
- license: openrail
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,55 @@
+ # Chainlit Python streaming reference: https://docs.chainlit.io/concepts/streaming/python
+
+ # OpenAI chat completion
+ import os
+ from openai import AsyncOpenAI  # OpenAI async client for API usage
+ import chainlit as cl  # Chainlit powers the chat UI
+ from chainlit.prompt import Prompt, PromptMessage  # prompt tools
+ from chainlit.playground.providers import ChatOpenAI  # ChatOpenAI playground provider
+ from dotenv import load_dotenv
+ from src.retrieval_lib import initialize_index, load_pdf_to_text, split_text, load_text_to_index, query_index, create_answer_prompt, generate_answer
+
+ load_dotenv()
+
+ retriever = initialize_index()  # build the retriever once at startup; all sessions share it
+
+ @cl.on_chat_start  # executed at the start of each user session
+ async def start_chat():
+     settings = {
+         "model": "gpt-3.5-turbo",
+         "temperature": 0,
+         "max_tokens": 500,
+         "top_p": 1,
+         "frequency_penalty": 0,
+         "presence_penalty": 0,
+     }
+     cl.user_session.set("settings", settings)
+
+
+ @cl.on_message  # executed each time the chatbot receives a message from a user
+ async def main(message: cl.Message):
+     settings = cl.user_session.get("settings")
+
+     client = AsyncOpenAI()
+
+     print(message.content)
+
+
+     # print([m.to_openai() for m in prompt.messages])
+
+     query = message.content
+     # query = "what is the reason for the lawsuit"
+     retrieved_docs = query_index(retriever, query)
+     print("retrieved_docs: ", len(retrieved_docs))
+     answer_prompt = create_answer_prompt()
+     print("answer_prompt: ", answer_prompt)
+     result = generate_answer(retriever, answer_prompt, query)
+     print("result: ", result["response"].content)
+
+     msg = cl.Message(content="")
+
+
+     msg.content = result["response"].content
+
+     # send and close the message stream
+     await msg.send()
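Note: `main()` above retrieves, generates, and then sends the finished answer as one message; the streaming doc linked in the header comment is not actually exercised. If token streaming is wanted, a minimal sketch could look like the following (this is not part of the commit; `llm` and `context` are assumed names, with `llm` a `ChatOpenAI` instance from langchain_openai):

    # Hypothetical streaming variant -- a sketch, not the commit's code.
    # Assumes: llm = ChatOpenAI(model_name="gpt-3.5-turbo", streaming=True)
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)
    msg = cl.Message(content="")
    async for chunk in (answer_prompt | llm).astream({"context": context, "question": query}):
        await msg.stream_token(chunk.content)  # forward each token to the UI as it arrives
    await msg.send()  # close the message stream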
chainlit.md ADDED
@@ -0,0 +1,5 @@
+ # PDF RAG
+
+ RAG over a PDF document
+
+ Disclaimer: answers are generated by running the query over the PDF document with an LLM. LLMs can hallucinate and may produce wrong answers.
data/test.txt ADDED
@@ -0,0 +1 @@
+ i
requirements.txt ADDED
@@ -0,0 +1,108 @@
+ aiofiles==23.2.1
+ aiohttp==3.9.3
+ aiosignal==1.3.1
+ annotated-types==0.6.0
+ anyio==3.7.1
+ appdirs==1.4.4
+ async-timeout==4.0.3
+ asyncer==0.0.2
+ attrs==23.2.0
+ bidict==0.23.1
+ certifi==2024.2.2
+ chainlit==0.7.700
+ charset-normalizer==3.3.2
+ click==8.1.7
+ dataclasses-json==0.5.14
+ datasets==2.18.0
+ Deprecated==1.2.14
+ dill==0.3.8
+ distro==1.9.0
+ exceptiongroup==1.2.0
+ faiss-cpu==1.8.0
+ fastapi==0.100.1
+ fastapi-socketio==0.0.10
+ filelock==3.13.1
+ filetype==1.2.0
+ frozenlist==1.4.1
+ fsspec==2024.2.0
+ googleapis-common-protos==1.62.0
+ grpcio==1.62.1
+ h11==0.14.0
+ httpcore==0.17.3
+ httpx==0.24.1
+ huggingface-hub==0.21.4
+ idna==3.6
+ importlib-metadata==6.11.0
+ jsonpatch==1.33
+ jsonpointer==2.4
+ langchain==0.1.11
+ langchain-community==0.0.27
+ langchain-core==0.1.30
+ langchain-openai==0.0.8
+ langchain-text-splitters==0.0.1
+ langchainhub==0.1.15
+ langsmith==0.1.23
+ Lazify==0.4.0
+ marshmallow==3.21.1
+ multidict==6.0.5
+ multiprocess==0.70.16
+ mypy-extensions==1.0.0
+ nest-asyncio==1.6.0
+ numpy==1.26.4
+ openai==1.13.3
+ opentelemetry-api==1.23.0
+ opentelemetry-exporter-otlp==1.23.0
+ opentelemetry-exporter-otlp-proto-common==1.23.0
+ opentelemetry-exporter-otlp-proto-grpc==1.23.0
+ opentelemetry-exporter-otlp-proto-http==1.23.0
+ opentelemetry-instrumentation==0.44b0
+ opentelemetry-proto==1.23.0
+ opentelemetry-sdk==1.23.0
+ opentelemetry-semantic-conventions==0.44b0
+ orjson==3.9.15
+ packaging==23.2
+ pandas==2.2.1
+ protobuf==4.25.3
+ pyarrow==15.0.1
+ pyarrow-hotfix==0.6
+ pydantic==2.6.3
+ pydantic_core==2.16.3
+ PyJWT==2.8.0
+ PyMuPDF==1.23.26
+ PyMuPDFb==1.23.22
+ pysbd==0.3.4
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ python-engineio==4.9.0
+ python-graphql-client==0.4.3
+ python-multipart==0.0.6
+ python-socketio==5.11.1
+ pytz==2024.1
+ PyYAML==6.0.1
+ ragas==0.1.3
+ regex==2023.12.25
+ requests==2.31.0
+ simple-websocket==1.0.0
+ six==1.16.0
+ sniffio==1.3.1
+ SQLAlchemy==2.0.28
+ starlette==0.27.0
+ syncer==2.0.3
+ tenacity==8.2.3
+ tiktoken==0.6.0
+ tomli==2.0.1
+ tqdm==4.66.2
+ types-requests==2.31.0.20240311
+ typing-inspect==0.9.0
+ typing_extensions==4.10.0
+ tzdata==2024.1
+ uptrace==1.22.0
+ urllib3==2.2.1
+ uvicorn==0.23.2
+ watchfiles==0.20.0
+ websockets==12.0
+ wrapt==1.16.0
+ wsproto==1.2.0
+ xxhash==3.4.1
+ yarl==1.9.4
+ zipp==3.17.0
src/retrieval_lib.py ADDED
@@ -0,0 +1,105 @@
+
+ # import libraries
+ import os
+ import openai
+ from langchain_community.document_loaders import PyMuPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.prompts import ChatPromptTemplate
+ from operator import itemgetter
+ from langchain_openai import ChatOpenAI
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import RunnablePassthrough
+
+
+ LLM_MODEL_NAME = "gpt-3.5-turbo"
+
+
+ # load a PDF document and convert it to text
+ def load_pdf_to_text(pdf_path):
+     # create a document loader
+     loader = PyMuPDFLoader(pdf_path)
+     # load the document
+     doc = loader.load()
+     return doc
+
+ def split_text(text):
+     # create a text splitter
+     splitter = RecursiveCharacterTextSplitter(
+         chunk_size=700,
+         chunk_overlap=100,
+     )
+     # split the documents into chunks
+     chunks = splitter.split_documents(text)
+     return chunks
+
+ # load text chunks into a FAISS index and return a retriever
+ def load_text_to_index(doc_splits):
+     embeddings = OpenAIEmbeddings(
+         model="text-embedding-3-small"
+     )
+     vector_store = FAISS.from_documents(doc_splits, embeddings)
+     retriever = vector_store.as_retriever()
+     return retriever
+
+ # query the FAISS index
+ def query_index(retriever, query):
+     retrieved_docs = retriever.invoke(query)
+     return retrieved_docs
+
+ # create the answer prompt
+ def create_answer_prompt():
+     template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+ """
+     print("template length: ", len(template))
+     prompt = ChatPromptTemplate.from_template(template)
+     return prompt
+
+ # generate an answer with retrieval-augmented generation
+ def generate_answer(retriever, answer_prompt, query):
+     print("generate_answer()")
+     QnA_LLM = ChatOpenAI(model_name=LLM_MODEL_NAME, temperature=0.0)
+
+     retrieval_qna_chain = (
+         {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+         | RunnablePassthrough.assign(context=itemgetter("context"))
+         | {"response": answer_prompt | QnA_LLM, "context": itemgetter("context")}
+     )
+     result = retrieval_qna_chain.invoke({"question": query})
+     return result
+
+ def initialize_index():
+     # load the PDF
+     cwd = os.path.abspath(os.getcwd())
+     data_dir = "data"
+     pdf_file = "nvidia_earnings_report.pdf"
+     # pdf_file = "musk-v-altman-openai-complaint-sf.pdf"
+     pdf_path = os.path.join(cwd, data_dir, pdf_file)
+     print("path: ", pdf_path)
+     doc = load_pdf_to_text(pdf_path)
+     print("doc pages: ", len(doc))
+     doc_splits = split_text(doc)
+     print("doc_splits length: ", len(doc_splits))
+     retriever = load_text_to_index(doc_splits)
+     return retriever
+
+ def main():
+     retriever = initialize_index()
+     # query = "Who is the E-VP, Operations"
+     query = "what is the reason for the lawsuit"
+     retrieved_docs = query_index(retriever, query)
+     print("retrieved_docs: ", len(retrieved_docs))
+     answer_prompt = create_answer_prompt()
+     print("answer_prompt: ", answer_prompt)
+     result = generate_answer(retriever, answer_prompt, query)
+     print("result: ", result["response"].content)
+
+ if __name__ == "__main__":
+     main()
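Note: `initialize_index()` re-reads and re-embeds the PDF on every process start, paying an embeddings API call each time. LangChain's FAISS store can be persisted and reloaded; a minimal sketch, not part of this commit (the `faiss_index` directory name is an assumption):

    # Hypothetical persistence variant -- build once, reload on later startups.
    from langchain_community.vectorstores import FAISS
    from langchain_openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # First run: embed the chunks and save the index to disk.
    vector_store = FAISS.from_documents(doc_splits, embeddings)
    vector_store.save_local("faiss_index")

    # Later runs: reload instead of re-embedding (newer langchain-community
    # releases also require allow_dangerous_deserialization=True here).
    vector_store = FAISS.load_local("faiss_index", embeddings)
    retriever = vector_store.as_retriever()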