XThomasBU committed
Commit b83cc65
1 Parent(s): 8591fb3

hf sync commit

Dockerfile ADDED
@@ -0,0 +1,29 @@
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN pip install --no-cache-dir transformers==4.36.2 torch==2.1.2
+
+ RUN pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python==0.2.32
+
+ COPY . /code
+
+ RUN ls -R
+
+ # Change permissions to allow writing to the directory
+ RUN chmod -R 777 /code
+
+ # Create a logs directory and set permissions
+ RUN mkdir /code/logs && chmod 777 /code/logs
+
+ # Create a cache directory within the application's working directory
+ RUN mkdir /.cache && chmod -R 777 /.cache
+
+ RUN --mount=type=secret,id=HUGGINGFACEHUB_API_TOKEN,mode=0444,required=true
+ RUN --mount=type=secret,id=OPENAI_API_KEY,mode=0444,required=true
+
+ CMD ["chainlit", "run", "code/main.py", "--host", "0.0.0.0", "--port", "7860"]
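Note on the two bare `RUN --mount=type=secret` instructions: BuildKit expects a command after the mount flags, and the usual Spaces pattern reads the mounted file in the same step (e.g. `RUN --mount=type=secret,id=OPENAI_API_KEY,mode=0444,required=true cat /run/secrets/OPENAI_API_KEY`). At runtime, Space secrets are also exposed as environment variables. A minimal Python sketch of resolving a token from either location; the helper `read_secret` is illustrative and not part of this commit:

```python
import os
from pathlib import Path
from typing import Optional


def read_secret(name: str) -> Optional[str]:
    """Resolve a secret from the environment first (how Spaces exposes
    secrets at runtime), then fall back to the BuildKit mount location
    used by `RUN --mount=type=secret` during the image build."""
    value = os.getenv(name)
    if value is not None:
        return value
    secret_file = Path("/run/secrets") / name
    if secret_file.exists():
        return secret_file.read_text().strip()
    return None


openai_key = read_secret("OPENAI_API_KEY")
hf_token = read_secret("HUGGINGFACEHUB_API_TOKEN")
```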
README.md CHANGED
@@ -1,13 +1,13 @@
- # dl4ds_tutor
-
- ## Setup
- 1. conda create -n dl4ds_tutor python=3.9
- 2. conda activate dl4ds_tutor
- 3. pip install -r requirements.txt
- 4. Create a .env file and add your openai api key as 'OPENAI_API_KEY=XXX'
-
- ## Instructions
- 1. Add files to `data/`
- 2. cd code
- 3. chainlit run main.py
+ ---
+ title: Dl4ds Tutor
+ emoji: 🏃
+ colorFrom: green
+ colorTo: red
+ sdk: docker
+ pinned: false
+ ---
+
+ DL4DS Tutor
+ ===========
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
code/chainlit.md → chainlit.md RENAMED
@@ -3,6 +3,8 @@
  Hi there, this is an LLM chatbot designed to help answer questions on the course content, built using Langchain and Chainlit.
  This is still very much a Work in Progress.
 
+ ### --- Please wait while the Tutor loads... ---
+
  ## Useful Links 🔗
 
  - **Documentation:** [Chainlit Documentation](https://docs.chainlit.io) 📚
code/config.yml CHANGED
@@ -1,26 +1,29 @@
  embedding_options:
    embedd_files: True # bool
    persist_directory: null # str or None
-   data_path: '../data' # str
+   data_path: 'storage/data' # str
+   url_file_path: 'storage/data/urls.txt' # str
+   expand_urls: True # bool
    db_option : 'FAISS' # str
    db_path : 'vectorstores' # str
    model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
-   search_top_k : 5 # int
+   search_top_k : 3 # int
  llm_params:
-   use_history: True # bool
-   llm_loader: 'openai' # str [ctransformers, openai]
+   use_history: False # bool
+   llm_loader: 'local_llm' # str [local_llm, openai]
    openai_params:
      model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
-   ctransformers_params:
-     model: "TheBloke/Llama-2-7B-Chat-GGML"
+   local_llm_params:
+     model: "storage/models/llama-2-7b-chat.Q4_0.gguf"
      model_type: "llama"
+     temperature: 0.2
  splitter_options:
    use_splitter: True # bool
    split_by_token : True # bool
    remove_leftover_delimiters: True # bool
    remove_chunks: False # bool
-   chunk_size : 800 # int
-   chunk_overlap : 80 # int
+   chunk_size : 300 # int
+   chunk_overlap : 30 # int
    chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
    front_chunks_to_remove : null # int or None
    last_chunks_to_remove : null # int or None
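For reference, a sketch of how the renamed and added keys read back once loaded. Values mirror this commit's config.yml, and the path assumes the app runs from the repository root, as the Dockerfile's CMD does:

```python
import yaml

# Load the committed config and pull out the keys this commit adds or changes.
with open("code/config.yml", "r") as f:
    config = yaml.safe_load(f)

data_path = config["embedding_options"]["data_path"]      # 'storage/data' (was '../data')
url_file = config["embedding_options"]["url_file_path"]   # 'storage/data/urls.txt' (new)
expand_urls = config["embedding_options"]["expand_urls"]  # True (new)
top_k = config["embedding_options"]["search_top_k"]       # 3 (was 5)
loader = config["llm_params"]["llm_loader"]               # 'local_llm' (was 'openai')
local_model = config["llm_params"]["local_llm_params"]["model"]  # GGUF path
```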
code/main.py CHANGED
@@ -12,6 +12,8 @@ import logging
  from dotenv import load_dotenv
 
  from modules.llm_tutor import LLMTutor
+ from modules.constants import *
+ from modules.helpers import get_sources
 
 
  logger = logging.getLogger(__name__)
@@ -31,22 +33,70 @@ file_handler.setLevel(logging.INFO)
  file_handler.setFormatter(formatter)
  logger.addHandler(file_handler)
 
- with open("config.yml", "r") as f:
-     config = yaml.safe_load(f)
- print(config)
- logger.info("Config file loaded")
- logger.info(f"Config: {config}")
- logger.info("Creating llm_tutor instance")
- llm_tutor = LLMTutor(config, logger=logger)
+
+ # Adding option to select the chat profile
+ @cl.set_chat_profiles
+ async def chat_profile():
+     return [
+         cl.ChatProfile(
+             name="Llama",
+             markdown_description="Use the local LLM: **Tiny Llama**.",
+         ),
+         # cl.ChatProfile(
+         #     name="Mistral",
+         #     markdown_description="Use the local LLM: **Mistral**.",
+         # ),
+         cl.ChatProfile(
+             name="gpt-3.5-turbo-1106",
+             markdown_description="Use OpenAI API for **gpt-3.5-turbo-1106**.",
+         ),
+         cl.ChatProfile(
+             name="gpt-4",
+             markdown_description="Use OpenAI API for **gpt-4**.",
+         ),
+     ]
+
+
+ @cl.author_rename
+ def rename(orig_author: str):
+     rename_dict = {"Chatbot": "AI Tutor"}
+     return rename_dict.get(orig_author, orig_author)
 
 
  # chainlit code
  @cl.on_chat_start
  async def start():
+     with open("code/config.yml", "r") as f:
+         config = yaml.safe_load(f)
+     print(config)
+     logger.info("Config file loaded")
+     logger.info(f"Config: {config}")
+     logger.info("Creating llm_tutor instance")
+
+     chat_profile = cl.user_session.get("chat_profile")
+     if chat_profile is not None:
+         if chat_profile.lower() in ["gpt-3.5-turbo-1106", "gpt-4"]:
+             config["llm_params"]["llm_loader"] = "openai"
+             config["llm_params"]["openai_params"]["model"] = chat_profile.lower()
+         elif chat_profile.lower() == "llama":
+             config["llm_params"]["llm_loader"] = "local_llm"
+             config["llm_params"]["local_llm_params"]["model"] = LLAMA_PATH
+             config["llm_params"]["local_llm_params"]["model_type"] = "llama"
+         elif chat_profile.lower() == "mistral":
+             config["llm_params"]["llm_loader"] = "local_llm"
+             config["llm_params"]["local_llm_params"]["model"] = MISTRAL_PATH
+             config["llm_params"]["local_llm_params"]["model_type"] = "mistral"
+
+         else:
+             pass
+
+     llm_tutor = LLMTutor(config, logger=logger)
+
      chain = llm_tutor.qa_bot()
-     msg = cl.Message(content="Starting the bot...")
+     model = config["llm_params"]["local_llm_params"]["model"]
+     msg = cl.Message(content=f"Starting the bot {model}...")
      await msg.send()
-     msg.content = "Hey, What Can I Help You With?"
+     msg.content = f"Hey, What Can I Help You With?\n\nYou can ask me questions about the course logistics, course content, about the final project, or anything else! You can find me at {model}"
      await msg.update()
 
      cl.user_session.set("chain", chain)
@@ -54,56 +104,21 @@ async def start():
 
  @cl.on_message
  async def main(message):
+     user = cl.user_session.get("user")
      chain = cl.user_session.get("chain")
-     cb = cl.AsyncLangchainCallbackHandler(
-         stream_final_answer=True, answer_prefix_tokens=["FINAL", "ANSWER"]
-     )
-     cb.answer_reached = True
+     # cb = cl.AsyncLangchainCallbackHandler(
+     #     stream_final_answer=True, answer_prefix_tokens=["FINAL", "ANSWER"]
+     # )
+     # cb.answer_reached = True
      # res=await chain.acall(message, callbacks=[cb])
-     res = await chain.acall(message.content, callbacks=[cb])
-     # print(f"response: {res}")
+     res = await chain.acall(message.content)
+     print(f"response: {res}")
      try:
          answer = res["answer"]
      except:
          answer = res["result"]
      print(f"answer: {answer}")
-     source_elements_dict = {}
-     source_elements = []
-     found_sources = []
-
-     for idx, source in enumerate(res["source_documents"]):
-         title = source.metadata["source"]
-
-         if title not in source_elements_dict:
-             source_elements_dict[title] = {
-                 "page_number": [source.metadata["page"]],
-                 "url": source.metadata["source"],
-                 "content": source.page_content,
-             }
-
-         else:
-             source_elements_dict[title]["page_number"].append(source.metadata["page"])
-             source_elements_dict[title][
-                 "content_" + str(source.metadata["page"])
-             ] = source.page_content
-         # sort the page numbers
-         # source_elements_dict[title]["page_number"].sort()
-
-     for title, source in source_elements_dict.items():
-         # create a string for the page numbers
-         page_numbers = ", ".join([str(x) for x in source["page_number"]])
-         text_for_source = f"Page Number(s): {page_numbers}\nURL: {source['url']}"
-         source_elements.append(cl.Pdf(name="File", path=title))
-         found_sources.append("File")
-         # for pn in source["page_number"]:
-         #     source_elements.append(
-         #         cl.Text(name=str(pn), content=source["content_"+str(pn)])
-         #     )
-         #     found_sources.append(str(pn))
-
-     if found_sources:
-         answer += f"\nSource:{', '.join(found_sources)}"
-     else:
-         answer += f"\nNo source found."
-
-     await cl.Message(content=answer, elements=source_elements).send()
+
+     answer_with_sources, source_elements = get_sources(res, answer)
+
+     await cl.Message(content=answer_with_sources, elements=source_elements).send()
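One thing to watch in `start()`: the greeting reads `local_llm_params["model"]` unconditionally, so it reports the local GGUF path even when an OpenAI profile is selected. A hedged sketch of an equivalent, table-driven way to apply profiles (profile names as in this commit; `PROFILE_OVERRIDES` and `apply_profile` are illustrative, not part of the commit):

```python
from modules.constants import LLAMA_PATH, MISTRAL_PATH

# Hypothetical refactor of the if/elif ladder in start().
PROFILE_OVERRIDES = {
    "gpt-3.5-turbo-1106": {"llm_loader": "openai", "model": "gpt-3.5-turbo-1106"},
    "gpt-4": {"llm_loader": "openai", "model": "gpt-4"},
    "llama": {"llm_loader": "local_llm", "model": LLAMA_PATH, "model_type": "llama"},
    "mistral": {"llm_loader": "local_llm", "model": MISTRAL_PATH, "model_type": "mistral"},
}


def apply_profile(config: dict, profile: str) -> dict:
    override = PROFILE_OVERRIDES.get(profile.lower())
    if override is None:
        return config  # unknown profile: keep the config.yml defaults
    config["llm_params"]["llm_loader"] = override["llm_loader"]
    if override["llm_loader"] == "openai":
        config["llm_params"]["openai_params"]["model"] = override["model"]
    else:
        config["llm_params"]["local_llm_params"]["model"] = override["model"]
        config["llm_params"]["local_llm_params"]["model_type"] = override["model_type"]
    return config
```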
code/modules/chat_model_loader.py CHANGED
@@ -1,24 +1,38 @@
  from langchain_community.chat_models import ChatOpenAI
  from langchain.llms import CTransformers
+ from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+ from transformers import AutoTokenizer, TextStreamer
+ from langchain.llms import LlamaCpp
+ import torch
+ import transformers
+ import os
+ from langchain.callbacks.manager import CallbackManager
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 
 
  class ChatModelLoader:
      def __init__(self, config):
          self.config = config
+         self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
      def load_chat_model(self):
          if self.config["llm_params"]["llm_loader"] == "openai":
              llm = ChatOpenAI(
                  model_name=self.config["llm_params"]["openai_params"]["model"]
              )
-         elif self.config["llm_params"]["llm_loader"] == "Ctransformers":
-             llm = CTransformers(
-                 model=self.config["llm_params"]["ctransformers_params"]["model"],
-                 model_type=self.config["llm_params"]["ctransformers_params"][
-                     "model_type"
+         elif self.config["llm_params"]["llm_loader"] == "local_llm":
+             n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
+             model_path = self.config["llm_params"]["local_llm_params"]["model"]
+             llm = LlamaCpp(
+                 model_path=model_path,
+                 n_batch=n_batch,
+                 n_ctx=2048,
+                 f16_kv=True,
+                 verbose=True,
+                 n_threads=2,
+                 temperature=self.config["llm_params"]["local_llm_params"][
+                     "temperature"
                  ],
-                 max_new_tokens=512,
-                 temperature=0.5,
              )
          else:
              raise ValueError("Invalid LLM Loader")
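A quick smoke test for the new `local_llm` branch, assuming the GGUF file referenced by config.yml has already been downloaded into `storage/models/` and the script runs from the repository root:

```python
import yaml
from modules.chat_model_loader import ChatModelLoader

with open("code/config.yml", "r") as f:
    config = yaml.safe_load(f)

config["llm_params"]["llm_loader"] = "local_llm"  # force the LlamaCpp branch
llm = ChatModelLoader(config).load_chat_model()

# LlamaCpp is a plain completion LLM, so calling it with a string returns a string.
print(llm("Q: What is a vector store? A:"))
```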
code/modules/constants.py CHANGED
@@ -10,15 +10,15 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
  # Prompt Templates
 
- prompt_template = """Use the following pieces of information to answer the user's question.
- If you don't know the answer, just say that you don't know, don't try to make up an answer.
+ # prompt_template = """Use the following pieces of information to answer the user's question.
+ # If you don't know the answer, just say that you don't know.
 
- Context: {context}
- Question: {question}
+ # Context: {context}
+ # Question: {question}
 
- Only return the helpful answer below and nothing else.
- Helpful answer:
- """
+ # Only return the helpful answer below and nothing else.
+ # Helpful answer:
+ # """
 
  prompt_template_with_history = """Use the following pieces of information to answer the user's question.
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
@@ -31,3 +31,27 @@ Question: {question}
  Only return the helpful answer below and nothing else.
  Helpful answer:
  """
+
+ prompt_template = """
+ <|im_start|>system
+ Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question.
+
+ Context:
+ {context}
+ <|im_end|>
+ <|im_start|>user
+ Question: Who is the instructor for this course?
+ <|im_end|>
+ <|im_start|>assistant
+ The instructor for this course is Prof. Thomas Gardos.
+ <|im_end|>
+ <|im_start|>user
+ Question: {question}
+ <|im_end|>
+ <|im_start|>assistant
+ """
+
+ # Model Paths
+
+ LLAMA_PATH = "storage/models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
+ MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"
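The new `prompt_template` is a ChatML-style few-shot prompt that keeps the `{context}` and `{question}` slots, so it can back the "stuff" chains in `llm_tutor.py` via a LangChain `PromptTemplate`. A sketch; the context string here is made up for illustration:

```python
from langchain.prompts import PromptTemplate
from modules.constants import prompt_template

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)
print(prompt.format(
    context="Office hours are Tuesdays 3-5pm.",  # made-up example context
    question="When are office hours?",
))
```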
code/modules/data_loader.py CHANGED
@@ -225,21 +225,24 @@ class DataLoader:
 
      # Handle link by link
      for link_index, link in enumerate(weblinks):
-         logger.info(f"\tSplitting link {link_index+1} : {link}")
-         if "youtube" in link:
-             title, document_chunks = get_youtube_transcript(link)
-         else:
-             title, document_chunks = get_html(link)
-
-         # Additional wrangling - Remove leftover delimiters and any specified chunks
-         if self.remove_leftover_delimiters:
-             document_chunks = remove_delimiters(document_chunks)
-         if self.config["splitter_options"]["remove_chunks"]:
-             document_chunks = remove_chunks(document_chunks)
-
-         print(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
-         self.document_names.append(title)
-         self.document_chunks_full.extend(document_chunks)
+         try:
+             logger.info(f"\tSplitting link {link_index+1} : {link}")
+             if "youtube" in link:
+                 title, document_chunks = get_youtube_transcript(link)
+             else:
+                 title, document_chunks = get_html(link)
+
+             # Additional wrangling - Remove leftover delimiters and any specified chunks
+             if self.remove_leftover_delimiters:
+                 document_chunks = remove_delimiters(document_chunks)
+             if self.config["splitter_options"]["remove_chunks"]:
+                 document_chunks = remove_chunks(document_chunks)
+
+             print(f"\t\tExtracted no. of chunks: {len(document_chunks)}")
+             self.document_names.append(title)
+             self.document_chunks_full.extend(document_chunks)
+         except:
+             logger.info(f"\t\tError splitting link {link_index+1} : {link}")
 
      logger.info(
          f"\tNumber of document chunks extracted in total: {len(self.document_chunks_full)}\n\n"
code/modules/embedding_model_loader.py CHANGED
@@ -1,6 +1,8 @@
  from langchain_community.embeddings import OpenAIEmbeddings
  from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.embeddings import LlamaCppEmbeddings
  from modules.constants import *
+ import os
 
 
  class EmbeddingModelLoader:
@@ -20,4 +22,8 @@ class EmbeddingModelLoader:
                  model_name="sentence-transformers/all-MiniLM-L6-v2",
                  model_kwargs={"device": "cpu"},
              )
+         # embedding_model = LlamaCppEmbeddings(
+         #     model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
+         # )
+
          return embedding_model
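For reference, a sketch of what the default branch returns; `all-MiniLM-L6-v2` produces 384-dimensional vectors and is downloaded from the Hub on first use:

```python
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)
vector = embeddings.embed_query("When are office hours?")
print(len(vector))  # 384
```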
code/modules/helpers.py ADDED
@@ -0,0 +1,162 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from tqdm import tqdm
+ from urllib.parse import urlparse
+ import chainlit as cl
+
+ """
+ Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113
+ """
+
+
+ class WebpageCrawler:
+     def __init__(self):
+         pass
+
+     def getdata(self, url):
+         r = requests.get(url)
+         return r.text
+
+     def url_exists(self, url):
+         try:
+             response = requests.head(url)
+             return response.status_code == 200
+         except requests.ConnectionError:
+             return False
+
+     def get_links(self, website_link, base_url=None):
+         if base_url is None:
+             base_url = website_link
+         html_data = self.getdata(website_link)
+         soup = BeautifulSoup(html_data, "html.parser")
+         list_links = []
+         for link in soup.find_all("a", href=True):
+             # Append to list if new link contains original link
+             if str(link["href"]).startswith((str(website_link))):
+                 list_links.append(link["href"])
+
+             # Include all href that do not start with website link but with "/"
+             if str(link["href"]).startswith("/"):
+                 if link["href"] not in self.dict_href_links:
+                     print(link["href"])
+                     self.dict_href_links[link["href"]] = None
+                     link_with_www = base_url + link["href"][1:]
+                     if self.url_exists(link_with_www):
+                         print("adjusted link =", link_with_www)
+                         list_links.append(link_with_www)
+
+         # Convert list of links to dictionary and define keys as the links and the values as "Not-checked"
+         dict_links = dict.fromkeys(list_links, "Not-checked")
+         return dict_links
+
+     def get_subpage_links(self, l, base_url):
+         for link in tqdm(l):
+             # If not crawled through this page start crawling and get links
+             if l[link] == "Not-checked":
+                 dict_links_subpages = self.get_links(link, base_url)
+                 # Change the dictionary value of the link to "Checked"
+                 l[link] = "Checked"
+             else:
+                 # Create an empty dictionary in case every link is checked
+                 dict_links_subpages = {}
+             # Add new dictionary to old dictionary
+             l = {**dict_links_subpages, **l}
+         return l
+
+     def get_all_pages(self, url, base_url):
+         dict_links = {url: "Not-checked"}
+         self.dict_href_links = {}
+         counter, counter2 = None, 0
+         while counter != 0:
+             counter2 += 1
+             dict_links2 = self.get_subpage_links(dict_links, base_url)
+             # Count number of non-values and set counter to 0 if there are no values within the dictionary equal to the string "Not-checked"
+             # https://stackoverflow.com/questions/48371856/count-the-number-of-occurrences-of-a-certain-value-in-a-dictionary-in-python
+             counter = sum(value == "Not-checked" for value in dict_links2.values())
+             dict_links = dict_links2
+         checked_urls = [
+             url for url, status in dict_links.items() if status == "Checked"
+         ]
+         return checked_urls
+
+
+ def get_urls_from_file(file_path: str):
+     """
+     Function to get urls from a file
+     """
+     with open(file_path, "r") as f:
+         urls = f.readlines()
+     urls = [url.strip() for url in urls]
+     return urls
+
+
+ def get_base_url(url):
+     parsed_url = urlparse(url)
+     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
+     return base_url
+
+
+ def get_sources(res, answer):
+     source_elements_dict = {}
+     source_elements = []
+     found_sources = []
+
+     source_dict = {}  # Dictionary to store URL elements
+
+     for idx, source in enumerate(res["source_documents"]):
+         source_metadata = source.metadata
+         url = source_metadata["source"]
+
+         if url not in source_dict:
+             source_dict[url] = [source.page_content]
+         else:
+             source_dict[url].append(source.page_content)
+
+     for source_idx, (url, text_list) in enumerate(source_dict.items()):
+         full_text = ""
+         for url_idx, text in enumerate(text_list):
+             full_text += f"Source {url_idx+1}:\n {text}\n\n\n"
+         source_elements.append(cl.Text(name=url, content=full_text))
+         found_sources.append(url)
+
+     if found_sources:
+         answer += f"\n\nSources: {', '.join(found_sources)} "
+     else:
+         answer += f"\n\nNo source found."
+
+     # for idx, source in enumerate(res["source_documents"]):
+     #     title = source.metadata["source"]
+
+     #     if title not in source_elements_dict:
+     #         source_elements_dict[title] = {
+     #             "page_number": [source.metadata["page"]],
+     #             "url": source.metadata["source"],
+     #             "content": source.page_content,
+     #         }
+
+     #     else:
+     #         source_elements_dict[title]["page_number"].append(source.metadata["page"])
+     #         source_elements_dict[title][
+     #             "content_" + str(source.metadata["page"])
+     #         ] = source.page_content
+     #     # sort the page numbers
+     #     # source_elements_dict[title]["page_number"].sort()
+
+     # for title, source in source_elements_dict.items():
+     #     # create a string for the page numbers
+     #     page_numbers = ", ".join([str(x) for x in source["page_number"]])
+     #     text_for_source = f"Page Number(s): {page_numbers}\nURL: {source['url']}"
+     #     source_elements.append(cl.Pdf(name="File", path=title))
+     #     found_sources.append("File")
+     #     # for pn in source["page_number"]:
+     #     #     source_elements.append(
+     #     #         cl.Text(name=str(pn), content=source["content_"+str(pn)])
+     #     #     )
+     #     #     found_sources.append(str(pn))
+
+     # if found_sources:
+     #     answer += f"\nSource:{', '.join(found_sources)}"
+     # else:
+     #     answer += f"\nNo source found."
+
+     return answer, source_elements
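A usage sketch tying the new helpers together, using the one URL this commit adds to storage/data/urls.txt (network access required, and crawl time depends on the site):

```python
from modules.helpers import WebpageCrawler, get_base_url, get_urls_from_file

urls = get_urls_from_file("storage/data/urls.txt")  # ['https://dl4ds.github.io/sp2024/']
crawler = WebpageCrawler()
for url in urls:
    pages = crawler.get_all_pages(url, get_base_url(url))
    print(f"{url} -> {len(pages)} pages discovered")
```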
code/modules/llm_tutor.py CHANGED
@@ -18,7 +18,7 @@ class LLMTutor:
      def __init__(self, config, logger=None):
          self.config = config
          self.vector_db = VectorDB(config, logger=logger)
-         if self.config['embedding_options']['embedd_files']:
+         if self.config["embedding_options"]["embedd_files"]:
              self.vector_db.create_database()
              self.vector_db.save_database()
 
@@ -47,7 +47,11 @@
          qa_chain = ConversationalRetrievalChain.from_llm(
              llm=llm,
              chain_type="stuff",
-             retriever=db.as_retriever(search_kwargs={"k": self.config["embedding_options"]["search_top_k"]}),
+             retriever=db.as_retriever(
+                 search_kwargs={
+                     "k": self.config["embedding_options"]["search_top_k"]
+                 }
+             ),
              return_source_documents=True,
              memory=memory,
              combine_docs_chain_kwargs={"prompt": prompt},
@@ -56,7 +60,11 @@
          qa_chain = RetrievalQA.from_chain_type(
              llm=llm,
              chain_type="stuff",
-             retriever=db.as_retriever(search_kwargs={"k": self.config["embedding_options"]["search_top_k"]}),
+             retriever=db.as_retriever(
+                 search_kwargs={
+                     "k": self.config["embedding_options"]["search_top_k"]
+                 }
+             ),
              return_source_documents=True,
              chain_type_kwargs={"prompt": prompt},
          )
code/modules/vector_db.py CHANGED
@@ -6,6 +6,7 @@ from modules.embedding_model_loader import EmbeddingModelLoader
  from langchain.vectorstores import FAISS
  from modules.data_loader import DataLoader
  from modules.constants import *
+ from modules.helpers import *
 
 
  class VectorDB:
@@ -13,6 +14,7 @@ class VectorDB:
          self.config = config
          self.db_option = config["embedding_options"]["db_option"]
          self.document_names = None
+         self.webpage_crawler = WebpageCrawler()
 
          # Set up logging to both console and a file
          if logger is None:
@@ -43,7 +45,14 @@
              os.path.join(self.config["embedding_options"]["data_path"], file)
              for file in files
          ]
-         return files
+         urls = get_urls_from_file(self.config["embedding_options"]["url_file_path"])
+         if self.config["embedding_options"]["expand_urls"]:
+             all_urls = []
+             for url in urls:
+                 base_url = get_base_url(url)
+                 all_urls.extend(self.webpage_crawler.get_all_pages(url, base_url))
+             urls = all_urls
+         return files, urls
 
      def create_embedding_model(self):
          self.logger.info("Creating embedding function")
@@ -63,8 +72,8 @@
      def create_database(self):
          data_loader = DataLoader(self.config)
          self.logger.info("Loading data")
-         files = self.load_files()
-         document_chunks, document_names = data_loader.get_chunks(files, [""])
+         files, urls = self.load_files()
+         document_chunks, document_names = data_loader.get_chunks(files, urls)
          self.logger.info("Completed loading data")
 
          self.create_embedding_model()
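With these changes, `load_files()` returns `(files, urls)` and `create_database()` embeds both. A sketch of a full index build from the repository root, under the committed config (assumes `storage/data/` exists and the URL crawl succeeds):

```python
import logging
import yaml

from modules.vector_db import VectorDB

with open("code/config.yml", "r") as f:
    config = yaml.safe_load(f)

db = VectorDB(config, logger=logging.getLogger(__name__))
db.create_database()  # loads local files plus crawled URLs, embeds, builds FAISS
db.save_database()    # persists the index under embedding_options.db_path
```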
data/webpage.pdf DELETED
Binary file (51.3 kB)
 
requirements.txt CHANGED
@@ -11,4 +11,8 @@ ctransformers==0.2.27
  python-dotenv==1.0.0
  openai==1.6.1
  pymupdf==1.23.8
- chainlit==0.7.700
+ chainlit==0.7.700
+ beautifulsoup4==4.12.2
+ fake-useragent==1.4.0
+ git+https://github.com/huggingface/accelerate.git
+ llama-cpp-python
storage/data/urls.txt ADDED
@@ -0,0 +1 @@
+ https://dl4ds.github.io/sp2024/