XThomasBU committed
Commit 6d056d5
1 Parent(s): 4dc8546

updates, added metadata to prompt

Dockerfile CHANGED
@@ -1,14 +1,12 @@
- FROM python:3.9
+ FROM python:3.11

  WORKDIR /code

  COPY ./requirements.txt /code/requirements.txt

- RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+ RUN pip install --upgrade pip

- RUN pip install --no-cache-dir transformers==4.36.2 torch==2.1.2
-
- RUN pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python==0.2.32
+ RUN pip install --no-cache-dir -r /code/requirements.txt

  COPY . /code
Dockerfile.dev ADDED
@@ -0,0 +1,27 @@
+ FROM python:3.11
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --upgrade pip
+
+ RUN pip install --no-cache-dir -r /code/requirements.txt
+
+ COPY . /code
+
+ RUN ls -R
+
+ # Change permissions to allow writing to the directory
+ RUN chmod -R 777 /code
+
+ # Create a logs directory and set permissions
+ RUN mkdir /code/logs && chmod 777 /code/logs
+
+ # Create a cache directory within the application's working directory
+ RUN mkdir /.cache && chmod -R 777 /.cache
+
+ # Expose the port the app runs on
+ EXPOSE 8051
+
+ CMD python code/modules/vector_db.py && chainlit run code/main.py --port 8051
code/config.yml CHANGED
@@ -2,18 +2,18 @@ embedding_options:
    embedd_files: False # bool
    data_path: 'storage/data' # str
    url_file_path: 'storage/data/urls.txt' # str
-   expand_urls: False # bool
+   expand_urls: True # bool
    db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille]
    db_path : 'vectorstores' # str
    model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
    search_top_k : 3 # int
    score_threshold : 0.2 # float
  llm_params:
-   use_history: False # bool
+   use_history: True # bool
    memory_window: 3 # int
-   llm_loader: 'local_llm' # str [local_llm, openai]
+   llm_loader: 'openai' # str [local_llm, openai]
    openai_params:
-     model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
+     model: 'gpt-3.5-turbo-1106' # str [gpt-3.5-turbo-1106, gpt-4]
    local_llm_params:
      model: "storage/models/llama-2-7b-chat.Q4_0.gguf"
      model_type: "llama"
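For context, these keys are read elsewhere in the repo through nested dictionary lookups (for example, config["llm_params"]["local_llm_params"]["model"] in code/main.py). A minimal sketch of loading the file with PyYAML, assuming it is run from the repo root; only the key names come from the file above, the branching is illustrative:

import yaml

with open("code/config.yml") as f:
    config = yaml.safe_load(f)

# Keys flipped in this commit: crawl sub-pages, keep chat history, use the OpenAI backend.
expand_urls = config["embedding_options"]["expand_urls"]   # now True
use_history = config["llm_params"]["use_history"]          # now True

if config["llm_params"]["llm_loader"] == "openai":
    model = config["llm_params"]["openai_params"]["model"]       # 'gpt-3.5-turbo-1106'
else:
    model = config["llm_params"]["local_llm_params"]["model"]    # local GGUF path
print(expand_urls, use_history, model)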
code/main.py CHANGED
@@ -38,10 +38,6 @@ logger.addHandler(file_handler)
  @cl.set_chat_profiles
  async def chat_profile():
      return [
-         cl.ChatProfile(
-             name="Llama",
-             markdown_description="Use the local LLM: **Tiny Llama**.",
-         ),
          # cl.ChatProfile(
          #     name="Mistral",
          #     markdown_description="Use the local LLM: **Mistral**.",

@@ -54,6 +50,10 @@ async def chat_profile():
              name="gpt-4",
              markdown_description="Use OpenAI API for **gpt-4**.",
          ),
+         cl.ChatProfile(
+             name="Llama",
+             markdown_description="Use the local LLM: **Tiny Llama**.",
+         ),
      ]

@@ -96,7 +96,7 @@ async def start():
      model = config["llm_params"]["local_llm_params"]["model"]
      msg = cl.Message(content=f"Starting the bot {model}...")
      await msg.send()
-     msg.content = f"Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!"
+     msg.content = opening_message
      await msg.update()

      cl.user_session.set("chain", chain)

@@ -119,6 +119,10 @@ async def main(message):
      answer = res["result"]
      print(f"answer: {answer}")

+     logger.info(f"Question: {res['question']}")
+     logger.info(f"History: {res['chat_history']}")
+     logger.info(f"Answer: {answer}\n")
+
      answer_with_sources, source_elements = get_sources(res, answer)

      await cl.Message(content=answer_with_sources, elements=source_elements).send()
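The new logger.info calls rely on the module-level logger and the file_handler visible in the first hunk header above. A rough reconstruction of that kind of setup, for context only (the log path and format string are assumptions, not the repository's exact values):

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# File handler so each question/history/answer triple lands in a persistent log file.
file_handler = logging.FileHandler("logs/chat.log")  # path is an assumption; Dockerfile.dev creates /code/logs
file_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger.addHandler(file_handler)

logger.info("Question: What is the late policy?")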
code/modules/constants.py CHANGED
@@ -6,7 +6,9 @@ load_dotenv()
  # API Keys - Loaded from the .env file

  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

+ opening_message = f"Hey, What Can I Help You With?\n\nYou can ask me questions about the course logistics, course content, about the final project, or anything else!"

  # Prompt Templates

code/modules/data_loader.py CHANGED
@@ -14,17 +14,15 @@ from llama_parse import LlamaParse
  from langchain.schema import Document
  import logging
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_experimental.text_splitter import SemanticChunker
- from langchain_openai.embeddings import OpenAIEmbeddings
  from ragatouille import RAGPretrainedModel
  from langchain.chains import LLMChain
  from langchain.llms import OpenAI
  from langchain import PromptTemplate

  try:
-     from modules.helpers import get_lecture_metadata
+     from modules.helpers import get_metadata
  except:
-     from helpers import get_lecture_metadata
+     from helpers import get_metadata

  logger = logging.getLogger(__name__)

@@ -96,6 +94,14 @@ class FileReader:
          loader = WebBaseLoader(url)
          return loader.load()

+     def read_tex_from_url(self, tex_url):
+         response = requests.get(tex_url)
+         if response.status_code == 200:
+             return [Document(page_content=response.text)]
+         else:
+             print("Failed to fetch .tex file from URL:", tex_url)
+             return None
+

  class ChunkProcessor:
      def __init__(self, config):

@@ -120,17 +126,6 @@ class ChunkProcessor:
          self.splitter = None
          logger.info("ChunkProcessor instance created")

-     # def extract_metadata(self, document_content):
-
-     #     llm = OpenAI()
-     #     prompt_template = PromptTemplate(
-     #         input_variables=["document_content"],
-     #         template="Extract metadata for this document:\n\n{document_content}\n\nMetadata:",
-     #     )
-     #     chain = LLMChain(llm=llm, prompt=prompt_template)
-     #     metadata = chain.run(document_content=document_content)
-     #     return metadata
-
      def remove_delimiters(self, document_chunks: list):
          for chunk in document_chunks:
              for delimiter in self.config["splitter_options"]["delimiters_to_remove"]:

@@ -151,7 +146,12 @@ class ChunkProcessor:
          self, documents, file_type="txt", source="", page=0, metadata={}
      ):
          documents = [Document(page_content=documents, source=source, page=page)]
-         if file_type == "txt":
+         if (
+             file_type == "txt"
+             or file_type == "docx"
+             or file_type == "srt"
+             or file_type == "tex"
+         ):
              document_chunks = self.splitter.split_documents(documents)
          elif file_type == "pdf":
              document_chunks = documents  # Full page for now

@@ -179,58 +179,54 @@ class ChunkProcessor:
          self.documents = []
          self.document_metadata = []

-         lecture_metadata = get_lecture_metadata(
+         addl_metadata = get_metadata(
              "https://dl4ds.github.io/sp2024/lectures/",
              "https://dl4ds.github.io/sp2024/schedule/",
-         )  # TODO: Use more efficiently
+         )  # For any additional metadata

          for file_index, file_path in enumerate(uploaded_files):
              file_name = os.path.basename(file_path)
-             file_type = file_name.split(".")[-1].lower()
-
-             # try:
-             if file_type == "pdf":
-                 documents = file_reader.read_pdf(file_path)
-             elif file_type == "txt":
-                 documents = file_reader.read_txt(file_path)
-             elif file_type == "docx":
-                 documents = file_reader.read_docx(file_path)
-             elif file_type == "srt":
-                 documents = file_reader.read_srt(file_path)
-             else:
-                 logger.warning(f"Unsupported file type: {file_type}")
-                 continue
-
-             # full_text = ""
-             # for doc in documents:
-             #     full_text += doc.page_content
-             #     break  # getting only first page for now
-
-             # extracted_metadata = self.extract_metadata(full_text)
-
-             for doc in documents:
-                 page_num = doc.metadata.get("page", 0)
-                 self.documents.append(doc.page_content)
-                 self.document_metadata.append({"source": file_path, "page": page_num})
-                 if "lecture" in file_path.lower():
-                     metadata = lecture_metadata.get(file_path, {})
-                     metadata["source_type"] = "lecture"
-                     self.document_metadata[-1].update(metadata)
-                 else:
-                     metadata = {"source_type": "other"}
-
-                 self.child_document_names.append(f"{file_name}_{page_num}")
-
-                 self.parent_document_names.append(file_name)
-                 if self.config["embedding_options"]["db_option"] not in ["RAGatouille"]:
-                     document_chunks = self.process_chunks(
-                         self.documents[-1],
-                         file_type,
-                         source=file_path,
-                         page=page_num,
-                         metadata=metadata,
-                     )
-                     self.document_chunks_full.extend(document_chunks)
+             if file_name not in self.parent_document_names:
+                 file_type = file_name.split(".")[-1].lower()
+
+                 # try:
+                 if file_type == "pdf":
+                     documents = file_reader.read_pdf(file_path)
+                 elif file_type == "txt":
+                     documents = file_reader.read_txt(file_path)
+                 elif file_type == "docx":
+                     documents = file_reader.read_docx(file_path)
+                 elif file_type == "srt":
+                     documents = file_reader.read_srt(file_path)
+                 elif file_type == "tex":
+                     documents = file_reader.read_tex_from_url(file_path)
+                 else:
+                     logger.warning(f"Unsupported file type: {file_type}")
+                     continue
+
+                 for doc in documents:
+                     page_num = doc.metadata.get("page", 0)
+                     self.documents.append(doc.page_content)
+                     self.document_metadata.append(
+                         {"source": file_path, "page": page_num}
+                     )
+                     metadata = addl_metadata.get(file_path, {})
+                     self.document_metadata[-1].update(metadata)
+
+                     self.child_document_names.append(f"{file_name}_{page_num}")
+
+                     self.parent_document_names.append(file_name)
+                     if self.config["embedding_options"]["db_option"] not in [
+                         "RAGatouille"
+                     ]:
+                         document_chunks = self.process_chunks(
+                             self.documents[-1],
+                             file_type,
+                             source=file_path,
+                             page=page_num,
+                             metadata=metadata,
+                         )
+                         self.document_chunks_full.extend(document_chunks)

          # except Exception as e:
          #     logger.error(f"Error processing file {file_name}: {str(e)}")

@@ -252,37 +248,38 @@ class ChunkProcessor:
          logger.info(f"Splitting weblinks: total of {len(weblinks)}")

          for link_index, link in enumerate(weblinks):
-             try:
-                 logger.info(f"\tSplitting link {link_index+1} : {link}")
-                 if "youtube" in link:
-                     documents = file_reader.read_youtube_transcript(link)
-                 else:
-                     documents = file_reader.read_html(link)
-
-                 for doc in documents:
-                     page_num = doc.metadata.get("page", 0)
-                     self.documents.append(doc.page_content)
-                     self.document_metadata.append(
-                         {"source": link, "page": page_num}
-                     )
-                     self.child_document_names.append(f"{link}")
-
-                     self.parent_document_names.append(link)
-                     if self.config["embedding_options"]["db_option"] not in [
-                         "RAGatouille"
-                     ]:
-                         document_chunks = self.process_chunks(
-                             self.documents[-1],
-                             "txt",
-                             source=link,
-                             page=0,
-                             metadata={"source_type": "webpage"},
-                         )
-                         self.document_chunks_full.extend(document_chunks)
-             except Exception as e:
-                 logger.error(
-                     f"Error splitting link {link_index+1} : {link}: {str(e)}"
-                 )
+             if link not in self.parent_document_names:
+                 try:
+                     logger.info(f"\tSplitting link {link_index+1} : {link}")
+                     if "youtube" in link:
+                         documents = file_reader.read_youtube_transcript(link)
+                     else:
+                         documents = file_reader.read_html(link)
+
+                     for doc in documents:
+                         page_num = doc.metadata.get("page", 0)
+                         self.documents.append(doc.page_content)
+                         self.document_metadata.append(
+                             {"source": link, "page": page_num}
+                         )
+                         self.child_document_names.append(f"{link}")
+
+                         self.parent_document_names.append(link)
+                         if self.config["embedding_options"]["db_option"] not in [
+                             "RAGatouille"
+                         ]:
+                             document_chunks = self.process_chunks(
+                                 self.documents[-1],
+                                 "txt",
+                                 source=link,
+                                 page=0,
+                                 metadata={"source_type": "webpage"},
+                             )
+                             self.document_chunks_full.extend(document_chunks)
+                 except Exception as e:
+                     logger.error(
+                         f"Error splitting link {link_index+1} : {link}: {str(e)}"
+                     )


  class DataLoader:
code/modules/embedding_model_loader.py CHANGED
@@ -24,8 +24,12 @@ class EmbeddingModelLoader:
              )
          else:
              embedding_model = HuggingFaceEmbeddings(
-                 model_name="sentence-transformers/all-MiniLM-L6-v2",
-                 model_kwargs={"device": "cpu"},
+                 model_name=self.config["embedding_options"]["model"],
+                 model_kwargs={
+                     "device": "cpu",
+                     "token": f"{HUGGINGFACE_TOKEN}",
+                     "trust_remote_code": True,
+                 },
              )
          # embedding_model = LlamaCppEmbeddings(
          #     model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
code/modules/helpers.py CHANGED
@@ -1,11 +1,15 @@
  import requests
  from bs4 import BeautifulSoup
  from tqdm import tqdm
- from urllib.parse import urlparse
  import chainlit as cl
  from langchain import PromptTemplate
  import requests
  from bs4 import BeautifulSoup
+ from urllib.parse import urlparse, urljoin, urldefrag
+ import asyncio
+ import aiohttp
+ from aiohttp import ClientSession
+ from typing import Dict, Any, List

  try:
      from modules.constants import *

@@ -19,82 +23,112 @@ Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113

  class WebpageCrawler:
      def __init__(self):
-         pass
+         self.dict_href_links = {}

-     def getdata(self, url):
-         r = requests.get(url)
-         return r.text
+     async def fetch(self, session: ClientSession, url: str) -> str:
+         async with session.get(url) as response:
+             try:
+                 return await response.text()
+             except UnicodeDecodeError:
+                 return await response.text(encoding="latin1")

-     def url_exists(self, url):
+     def url_exists(self, url: str) -> bool:
          try:
              response = requests.head(url)
              return response.status_code == 200
          except requests.ConnectionError:
              return False

-     def get_links(self, website_link, base_url=None):
-         if base_url is None:
-             base_url = website_link
-         html_data = self.getdata(website_link)
+     async def get_links(self, session: ClientSession, website_link: str, base_url: str):
+         html_data = await self.fetch(session, website_link)
          soup = BeautifulSoup(html_data, "html.parser")
          list_links = []
          for link in soup.find_all("a", href=True):
-             # clean the link
-             # remove empty spaces
-             link["href"] = link["href"].strip()
-             # Append to list if new link contains original link
-             if str(link["href"]).startswith((str(website_link))):
-                 list_links.append(link["href"])
-
-             # Include all href that do not start with website link but with "/"
-             if str(link["href"]).startswith("/"):
-                 if link["href"] not in self.dict_href_links:
-                     print(link["href"])
-                     self.dict_href_links[link["href"]] = None
-                     link_with_www = base_url + link["href"][1:]
-                     if self.url_exists(link_with_www):
-                         print("adjusted link =", link_with_www)
-                         list_links.append(link_with_www)
-
-         # Convert list of links to dictionary and define keys as the links and the values as "Not-checked"
-         dict_links = dict.fromkeys(list_links, "Not-checked")
-         return dict_links
-
-     def get_subpage_links(self, l, base_url):
-         for link in tqdm(l):
-             print("checking link:", link)
-             if not link.endswith("/"):
-                 l[link] = "Checked"
-                 dict_links_subpages = {}
-             else:
-                 # If not crawled through this page start crawling and get links
-                 if l[link] == "Not-checked":
-                     dict_links_subpages = self.get_links(link, base_url)
-                     # Change the dictionary value of the link to "Checked"
-                     l[link] = "Checked"
-                 else:
-                     # Create an empty dictionary in case every link is checked
-                     dict_links_subpages = {}
-             # Add new dictionary to old dictionary
-             l = {**dict_links_subpages, **l}
-         return l
-
-     def get_all_pages(self, url, base_url):
-         dict_links = {url: "Not-checked"}
-         self.dict_href_links = {}
-         counter, counter2 = None, 0
-         while counter != 0:
-             counter2 += 1
-             dict_links2 = self.get_subpage_links(dict_links, base_url)
-             # Count number of non-values and set counter to 0 if there are no values within the dictionary equal to the string "Not-checked"
-             # https://stackoverflow.com/questions/48371856/count-the-number-of-occurrences-of-a-certain-value-in-a-dictionary-in-python
-             counter = sum(value == "Not-checked" for value in dict_links2.values())
-             dict_links = dict_links2
-         checked_urls = [
-             url for url, status in dict_links.items() if status == "Checked"
-         ]
-         return checked_urls
+             href = link["href"].strip()
+             full_url = urljoin(base_url, href)
+             normalized_url = self.normalize_url(full_url)  # sections removed
+             if (
+                 normalized_url not in self.dict_href_links
+                 and self.is_child_url(normalized_url, base_url)
+                 and self.url_exists(normalized_url)
+             ):
+                 self.dict_href_links[normalized_url] = None
+                 list_links.append(normalized_url)
+
+         return list_links
+
+     async def get_subpage_links(
+         self, session: ClientSession, urls: list, base_url: str
+     ):
+         tasks = [self.get_links(session, url, base_url) for url in urls]
+         results = await asyncio.gather(*tasks)
+         all_links = [link for sublist in results for link in sublist]
+         return all_links
+
+     async def get_all_pages(self, url: str, base_url: str):
+         async with aiohttp.ClientSession() as session:
+             dict_links = {url: "Not-checked"}
+             counter = None
+             while counter != 0:
+                 unchecked_links = [
+                     link
+                     for link, status in dict_links.items()
+                     if status == "Not-checked"
+                 ]
+                 if not unchecked_links:
+                     break
+                 new_links = await self.get_subpage_links(
+                     session, unchecked_links, base_url
+                 )
+                 for link in unchecked_links:
+                     dict_links[link] = "Checked"
+                     print(f"Checked: {link}")
+                 dict_links.update(
+                     {
+                         link: "Not-checked"
+                         for link in new_links
+                         if link not in dict_links
+                     }
+                 )
+                 counter = len(
+                     [
+                         status
+                         for status in dict_links.values()
+                         if status == "Not-checked"
+                     ]
+                 )
+
+             checked_urls = [
+                 url for url, status in dict_links.items() if status == "Checked"
+             ]
+             return checked_urls
+
+     def is_webpage(self, url: str) -> bool:
+         try:
+             response = requests.head(url, allow_redirects=True)
+             content_type = response.headers.get("Content-Type", "").lower()
+             return "text/html" in content_type
+         except requests.RequestException:
+             return False
+
+     def clean_url_list(self, urls):
+         files, webpages = [], []
+
+         for url in urls:
+             if self.is_webpage(url):
+                 webpages.append(url)
+             else:
+                 files.append(url)
+
+         return files, webpages
+
+     def is_child_url(self, url, base_url):
+         return url.startswith(base_url)
+
+     def normalize_url(self, url: str):
+         # Strip the fragment identifier
+         defragged_url, _ = urldefrag(url)
+         return defragged_url


  def get_urls_from_file(file_path: str):

@@ -183,40 +217,38 @@ def get_sources(res, answer):

          name = f"Source {idx + 1} Text\n"
          full_answer += name
-         source_elements.append(cl.Text(name=name, content=source_data["text"]))
+         source_elements.append(
+             cl.Text(name=name, content=source_data["text"], display="side")
+         )

          # Add a PDF element if the source is a PDF file
          if source_data["url"].lower().endswith(".pdf"):
              name = f"Source {idx + 1} PDF\n"
              full_answer += name
              pdf_url = f"{source_data['url']}#page={source_data['page']+1}"
-             source_elements.append(cl.Pdf(name=name, url=pdf_url))
-
-     # Finally, include lecture metadata for each unique source
-     # displayed_urls = set()
-     # full_answer += "\n**Metadata:**\n"
-     # for url_name, source_data in source_dict.items():
-     #     if source_data["url"] not in displayed_urls:
-     #         full_answer += f"\nSource: {source_data['url']}\n"
-     #         full_answer += f"Type: {source_data['source_type']}\n"
-     #         full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
-     #         full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
-     #         full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
-     #         displayed_urls.add(source_data["url"])
+             source_elements.append(cl.Pdf(name=name, url=pdf_url, display="side"))
+
      full_answer += "\n**Metadata:**\n"
-     for url_name, source_data in source_dict.items():
-         full_answer += f"\nSource: {source_data['url']}\n"
-         full_answer += f"Page: {source_data['page']}\n"
-         full_answer += f"Type: {source_data['source_type']}\n"
-         full_answer += f"Date: {source_data['date']}\n"
-         full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
-         full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
-         full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
+     for idx, (url_name, source_data) in enumerate(source_dict.items()):
+         full_answer += f"\nSource {idx + 1} Metadata:\n"
+         source_elements.append(
+             cl.Text(
+                 name=f"Source {idx + 1} Metadata",
+                 content=f"Source: {source_data['url']}\n"
+                 f"Page: {source_data['page']}\n"
+                 f"Type: {source_data['source_type']}\n"
+                 f"Date: {source_data['date']}\n"
+                 f"TL;DR: {source_data['lecture_tldr']}\n"
+                 f"Lecture Recording: {source_data['lecture_recording']}\n"
+                 f"Suggested Readings: {source_data['suggested_readings']}\n",
+                 display="side",
+             )
+         )

      return full_answer, source_elements


- def get_lecture_metadata(lectures_url, schedule_url):
+ def get_metadata(lectures_url, schedule_url):
      """
      Function to get the lecture metadata from the lectures and schedule URLs.
      """
code/modules/llm_tutor.py CHANGED
@@ -5,18 +5,99 @@ from langchain_community.embeddings import OpenAIEmbeddings
  from langchain.vectorstores import FAISS
  from langchain.chains import RetrievalQA, ConversationalRetrievalChain
  from langchain.llms import CTransformers
- from langchain.memory import ConversationBufferWindowMemory
+ from langchain.memory import ConversationBufferWindowMemory, ConversationSummaryBufferMemory
  from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
  import os
  from modules.constants import *
  from modules.helpers import get_prompt
  from modules.chat_model_loader import ChatModelLoader
  from modules.vector_db import VectorDB, VectorDBScore
+ from typing import Dict, Any, Optional
+ from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
+ import inspect
+ from langchain.chains.conversational_retrieval.base import _get_chat_history
+
+
+ class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
+     async def _acall(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
+     ) -> Dict[str, Any]:
+         _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
+         question = inputs["question"]
+         get_chat_history = self.get_chat_history or _get_chat_history
+         chat_history_str = get_chat_history(inputs["chat_history"])
+         print(f"chat_history_str: {chat_history_str}")
+         if chat_history_str:
+             callbacks = _run_manager.get_child()
+             new_question = await self.question_generator.arun(
+                 question=question, chat_history=chat_history_str, callbacks=callbacks
+             )
+         else:
+             new_question = question
+         accepts_run_manager = (
+             "run_manager" in inspect.signature(self._aget_docs).parameters
+         )
+         if accepts_run_manager:
+             docs = await self._aget_docs(new_question, inputs, run_manager=_run_manager)
+         else:
+             docs = await self._aget_docs(new_question, inputs)  # type: ignore[call-arg]
+
+         output: Dict[str, Any] = {}
+         if self.response_if_no_docs_found is not None and len(docs) == 0:
+             output[self.output_key] = self.response_if_no_docs_found
+         else:
+             new_inputs = inputs.copy()
+             if self.rephrase_question:
+                 new_inputs["question"] = new_question
+             new_inputs["chat_history"] = chat_history_str
+
+             # Prepare the final prompt with metadata
+             context = "\n\n".join(
+                 [
+                     f"Document content: {doc.page_content}\nMetadata: {doc.metadata}"
+                     for doc in docs
+                 ]
+             )
+             final_prompt = f"""
+             You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Use the following pieces of information to answer the user's question.
+             If you don't know the answer, just say that you don't know—don't try to make up an answer.
+             Use the chat history to answer the question only if it's relevant; otherwise, ignore it. The context for the answer will be under "Document context:".
+             Use the metadata from each document to guide the user to the correct sources.
+             The context is ordered by relevance to the question. Give more weight to the most relevant documents.
+             Talk in a friendly and personalized manner, similar to how you would speak to a friend who needs help. Make the conversation engaging and avoid sounding repetitive or robotic.
+
+             Chat History:
+             {chat_history_str}
+
+             Context:
+             {context}
+
+             Question: {new_question}
+             AI Tutor:
+             """
+
+             new_inputs["input"] = final_prompt
+             new_inputs["question"] = final_prompt
+             output["final_prompt"] = final_prompt
+
+             answer = await self.combine_docs_chain.arun(
+                 input_documents=docs, callbacks=_run_manager.get_child(), **new_inputs
+             )
+             output[self.output_key] = answer
+
+         if self.return_source_documents:
+             output["source_documents"] = docs
+         if self.return_generated_question:
+             output["generated_question"] = new_question
+         return output


  class LLMTutor:
      def __init__(self, config, logger=None):
          self.config = config
+         self.llm = self.load_llm()
          self.vector_db = VectorDB(config, logger=logger)
          if self.config["embedding_options"]["embedd_files"]:
              self.vector_db.create_database()

@@ -36,26 +117,28 @@ class LLMTutor:
          if self.config["embedding_options"]["db_option"] in ["FAISS", "Chroma"]:
              retriever = VectorDBScore(
                  vectorstore=db,
-                 search_type="similarity_score_threshold",
-                 search_kwargs={
-                     "score_threshold": self.config["embedding_options"][
-                         "score_threshold"
-                     ],
-                     "k": self.config["embedding_options"]["search_top_k"],
-                 },
+                 # search_type="similarity_score_threshold",
+                 # search_kwargs={
+                 #     "score_threshold": self.config["embedding_options"][
+                 #         "score_threshold"
+                 #     ],
+                 #     "k": self.config["embedding_options"]["search_top_k"],
+                 # },
              )
          elif self.config["embedding_options"]["db_option"] == "RAGatouille":
              retriever = db.as_langchain_retriever(
                  k=self.config["embedding_options"]["search_top_k"]
              )
          if self.config["llm_params"]["use_history"]:
-             memory = ConversationBufferWindowMemory(
+             memory = ConversationSummaryBufferMemory(
+                 llm=llm,
                  k=self.config["llm_params"]["memory_window"],
                  memory_key="chat_history",
                  return_messages=True,
                  output_key="answer",
+                 max_token_limit=128,
              )
-             qa_chain = ConversationalRetrievalChain.from_llm(
+             qa_chain = CustomConversationalRetrievalChain.from_llm(
                  llm=llm,
                  chain_type="stuff",
                  retriever=retriever,

@@ -82,7 +165,6 @@ class LLMTutor:
      # QA Model Function
      def qa_bot(self):
          db = self.vector_db.load_database()
-         self.llm = self.load_llm()
          qa_prompt = self.set_custom_prompt()
          qa = self.retrieval_qa_chain(self.llm, qa_prompt, db)
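The custom chain is the core of this commit: retrieved documents are injected into the final prompt together with their metadata. A minimal sketch of wiring it up outside the LLMTutor class (a plain FAISS retriever stands in for the repo's VectorDBScore, the import path and sample question are assumptions; only the chain, memory, and keyword arguments mirror the diff above):

import asyncio
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryBufferMemory
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from modules.llm_tutor import CustomConversationalRetrievalChain  # import path assumed

# Tiny stand-in vector store so the sketch is self-contained.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_texts(
    ["The final project report is due in the last week of class."], embeddings
)
retriever = db.as_retriever(search_kwargs={"k": 1})

llm = ChatOpenAI(model="gpt-3.5-turbo-1106")  # mirrors openai_params in config.yml
memory = ConversationSummaryBufferMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
    max_token_limit=128,
)
chain = CustomConversationalRetrievalChain.from_llm(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
)

# _acall() condenses the question against the history, retrieves documents, and builds
# a prompt that lists each document's content together with its metadata.
result = asyncio.run(chain.ainvoke({"question": "When is the final project due?"}))
print(result["answer"])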
 
code/modules/vector_db.py CHANGED
@@ -96,21 +96,17 @@ class VectorDB:
          if self.config["embedding_options"]["expand_urls"]:
              all_urls = []
              for url in urls:
-                 base_url = get_base_url(url)
-                 all_urls.extend(self.webpage_crawler.get_all_pages(url, base_url))
+                 loop = asyncio.get_event_loop()
+                 all_urls.extend(
+                     loop.run_until_complete(
+                         self.webpage_crawler.get_all_pages(
+                             url, url
+                         )  # only gets child URLs; to get all URLs, replace the second argument with the base URL
+                     )
+                 )
              urls = all_urls
          return files, urls

-     def clean_url_list(self, urls):
-         # get lecture pdf links
-         lecture_pdfs = [link for link in urls if link.endswith(".pdf")]
-         lecture_pdfs = [link for link in lecture_pdfs if "lecture" in link.lower()]
-         urls = [
-             link for link in urls if link.endswith("/")
-         ]  # only keep links that end with a '/'. Extract Files Seperately
-
-         return urls, lecture_pdfs
-
      def create_embedding_model(self):
          self.logger.info("Creating embedding function")
          self.embedding_model_loader = EmbeddingModelLoader(self.config)

@@ -158,12 +154,11 @@ class VectorDB:
          data_loader = DataLoader(self.config)
          self.logger.info("Loading data")
          files, urls = self.load_files()
-         urls, lecture_pdfs = self.clean_url_list(urls)
-         files += lecture_pdfs
+         files, webpages = self.webpage_crawler.clean_url_list(urls)
          if "storage/data/urls.txt" in files:
              files.remove("storage/data/urls.txt")
          document_chunks, document_names, documents, document_metadata = (
-             data_loader.get_chunks(files, urls)
+             data_loader.get_chunks(files, webpages)
          )
          self.logger.info("Completed loading data")
          self.initialize_database(
public/test.css CHANGED
@@ -1,3 +1,16 @@
  a[href*='https://github.com/Chainlit/chainlit'] {
      visibility: hidden;
+ }
+
+ .message-avatar .MuiAvatar-root {
+     background-color: transparent; /* Remove the background color */
+     color: #FFFFFF; /* Change this to your desired text color */
+     border: 0.25px solid #FFFFFF; /* Add a white border for the circle */
+     border-radius: 50%; /* Ensure the avatar remains circular */
+     background-image: url('http://localhost:8051/logo?theme=dark'); /* Path to your logo */
+     background-size: cover; /* Ensure the logo covers the entire avatar */
+     background-position: center; /* Center the logo */
+     background-repeat: no-repeat; /* Prevent the logo from repeating */
+     width: 38px; /* Adjust the width as needed */
+     height: 38px; /* Adjust the height as needed */
  }
requirements.txt CHANGED
@@ -1,20 +1,20 @@
- streamlit==1.29.0
- PyYAML==6.0.1
+ # Automatically generated by https://github.com/damnever/pigar.
+
+ beautifulsoup4==4.12.3
+ chainlit==1.1.202
+ langchain==0.1.20
+ langchain-community==0.0.38
+ langchain-core==0.1.52
+ llama-parse==0.4.4
  pysrt==1.1.2
- langchain==0.0.353
- tiktoken==0.5.2
- streamlit-chat==0.1.1
- pypdf==3.17.4
- sentence-transformers==2.2.2
- faiss-cpu==1.7.4
- ctransformers==0.2.27
- python-dotenv==1.0.0
- openai==1.6.1
- pymupdf==1.23.8
- chainlit==1.0.200
- beautifulsoup4==4.12.2
- fake-useragent==1.4.0
- git+https://github.com/huggingface/accelerate.git
- llama-cpp-python
- PyPDF2==3.0.1
- ragatouille==0.0.8.post2
+ python-dotenv==1.0.1
+ PyYAML==6.0.1
+ RAGatouille==0.0.8.post2
+ requests==2.32.3
+ torch==2.3.1
+ tqdm==4.66.4
+ transformers==4.41.2
+ llama-cpp-python==0.2.77
+ fake_useragent==1.5.1
+ chromadb==0.5.0
+ pymupdf==1.24.5