Thomas (Tom) Gardos committed
Commit 59567ad
2 parents: e17a5d0 ca828e7

Merge pull request #82 from DL4DS/remove_hardcoded

README.md CHANGED
@@ -37,7 +37,7 @@ Please visit [setup](https://dl4ds.github.io/dl4ds_tutor/guide/setup/) for more
 3. **To test Data Loading (Optional)**
 ```bash
 cd code
-python -m modules.dataloader.data_loader
+python -m modules.dataloader.data_loader --links "your_pdf_link"
 ```
 
 4. **Create the Vector Database**
@@ -47,9 +47,10 @@ Please visit [setup](https://dl4ds.github.io/dl4ds_tutor/guide/setup/) for more
 ```
 - Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
 
-5. **Run the Chainlit App**
+6. **Run the FastAPI App**
 ```bash
-chainlit run main.py
+cd code
+uvicorn app:app --port 7860
 ```
 
 ## Documentation
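For context, `uvicorn app:app --port 7860` expects a module `app.py` inside `code/` that exposes a FastAPI instance named `app`. A minimal sketch of such an entry point, assuming nothing about the real app beyond the `module:attribute` target and the port (the health route is purely illustrative):

```python
# Minimal sketch of what `uvicorn app:app --port 7860` expects: a module
# app.py exposing a FastAPI instance named `app`. Only the module:attribute
# target and the port come from the README; the route is illustrative.
from fastapi import FastAPI

app = FastAPI()


@app.get("/health")
def health():
    return {"status": "ok"}


if __name__ == "__main__":
    # Programmatic equivalent of: uvicorn app:app --port 7860
    import uvicorn

    uvicorn.run("app:app", port=7860)
```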
code/main.py CHANGED
@@ -505,7 +505,6 @@ class Chatbot:
         token_count += token_count_cb.total_tokens
 
         for question in list_of_questions:
-
             actions.append(
                 cl.Action(
                     name="follow up question",
@@ -549,7 +548,6 @@ class Chatbot:
 
 @cl.header_auth_callback
 def header_auth_callback(headers: dict) -> Optional[cl.User]:
-
     print("\n\n\nI am here\n\n\n")
     # try: # TODO: Add try-except block after testing
     # TODO: Implement to get the user information from the headers (not the cookie)
code/modules/chat/helpers.py CHANGED
@@ -42,7 +42,6 @@ def get_sources(res, answer, stream=True, view_sources=False):
     full_answer += answer
 
     if view_sources:
-
         # Then, display the sources
         # check if the answer has sources
        if len(source_dict) == 0:
@@ -51,7 +50,6 @@ def get_sources(res, answer, stream=True, view_sources=False):
        else:
            full_answer += "\n\n**Sources:**\n"
            for idx, (url_name, source_data) in enumerate(source_dict.items()):
-
                full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n"
 
                name = f"Source {idx + 1} Text\n"
code/modules/chat/langchain/langchain_rag.py CHANGED
@@ -19,7 +19,6 @@ from .utils import (
 
 
 class Langchain_RAG_V1(BaseRAG):
-
     def __init__(
         self,
         llm,
code/modules/chat/langchain/utils.py CHANGED
@@ -26,7 +26,6 @@ CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
 
 
 class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
-
     def _get_chat_history(self, chat_history: List[CHAT_TURN_TYPE]) -> str:
         _ROLE_MAP = {"human": "Student: ", "ai": "AI Tutor: "}
         buffer = ""
@@ -139,7 +138,6 @@ class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
 
 
 class CustomRunnableWithHistory(RunnableWithMessageHistory):
-
     def _get_chat_history(self, chat_history: List[CHAT_TURN_TYPE]) -> str:
         _ROLE_MAP = {"human": "Student: ", "ai": "AI Tutor: "}
         buffer = ""
@@ -282,7 +280,6 @@ def create_retrieval_chain(
 
 # TODO: Remove Hard-coded values
 async def return_questions(query, response, chat_history_str, context, config):
-
     system = (
         "You are someone that suggests a question based on the student's input and chat history. "
        "Generate a question that is relevant to the student's input and chat history. "
code/modules/chat_processor/helpers.py CHANGED
@@ -156,7 +156,6 @@ async def update_user_info(user_info):
 
 
 async def check_user_cooldown(user_info, current_time):
-
     # # Check if no tokens left
     tokens_left = user_info.metadata.get("tokens_left", 0)
     if tokens_left > 0 and not user_info.metadata.get("in_cooldown", False):
@@ -214,7 +213,6 @@ async def reset_tokens_for_user(user_info):
 
     # Calculate how many tokens should have been regenerated proportionally
     if current_tokens < max_tokens:
-
         # Calculate the regeneration rate per second based on REGEN_TIME for full regeneration
         regeneration_rate_per_second = max_tokens / REGEN_TIME
 
code/modules/config/{user_config.yml → project_config.yml} RENAMED
@@ -1,3 +1,7 @@
 retriever:
   retriever_hf_paths:
     RAGatouille: "XThomasBU/Colbert_Index"
+
+metadata:
+  metadata_links: ["https://dl4ds.github.io/sp2024/lectures/", "https://dl4ds.github.io/sp2024/schedule/"]
+  slide_base_link: "https://dl4ds.github.io"
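Both entry points touched by this commit consume the renamed file the same way: load it alongside `config.yml`, then merge. A minimal sketch of that pattern, using only the paths and keys visible in this diff (note `dict.update` is shallow, so `project_config.yml` would win any top-level key collision):

```python
# Sketch of the config-merge pattern this commit introduces; file paths and
# keys come from the diff, everything else is illustrative.
import yaml

with open("modules/config/config.yml", "r") as f:
    config = yaml.safe_load(f)

with open("modules/config/project_config.yml", "r") as f:
    project_config = yaml.safe_load(f)

# Shallow merge: project_config wins on any top-level key collision.
config.update(project_config)

print(config["retriever"]["retriever_hf_paths"]["RAGatouille"])  # XThomasBU/Colbert_Index
print(config["metadata"]["slide_base_link"])                     # https://dl4ds.github.io
```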
code/modules/dataloader/data_loader.py CHANGED
@@ -222,8 +222,7 @@ class ChunkProcessor:
 
     def chunk_docs(self, file_reader, uploaded_files, weblinks):
         addl_metadata = get_metadata(
-            "https://dl4ds.github.io/sp2024/lectures/",
-            "https://dl4ds.github.io/sp2024/schedule/",
+            *self.config["metadata"]["metadata_links"], self.config
         )  # For any additional metadata
 
         # remove already processed files if reparse_files is False
@@ -325,7 +324,6 @@ class ChunkProcessor:
             return
 
         try:
-
             if file_path in self.document_data:
                 self.logger.warning(f"File {file_name} already processed")
                 documents = [
@@ -419,6 +417,15 @@ class DataLoader:
 
 if __name__ == "__main__":
     import yaml
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Process some links.")
+    parser.add_argument(
+        "--links", nargs="+", required=True, help="List of links to process."
+    )
+
+    args = parser.parse_args()
+    links_to_process = args.links
 
     logger = logging.getLogger(__name__)
     logger.setLevel(logging.INFO)
@@ -426,6 +433,12 @@ if __name__ == "__main__":
     with open("../code/modules/config/config.yml", "r") as f:
         config = yaml.safe_load(f)
 
+    with open("../code/modules/config/project_config.yml", "r") as f:
+        project_config = yaml.safe_load(f)
+
+    # Combine project config with the main config
+    config.update(project_config)
+
     STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
     uploaded_files = [
         os.path.join(STORAGE_DIR, file)
@@ -434,13 +447,15 @@ if __name__ == "__main__":
     ]
 
     data_loader = DataLoader(config, logger=logger)
-    document_chunks, document_names, documents, document_metadata = (
-        data_loader.get_chunks(
-            [
-                "https://dl4ds.github.io/fa2024/static_files/discussion_slides/00_discussion.pdf"
-            ],
-            [],
-        )
+    # Just for testing
+    (
+        document_chunks,
+        document_names,
+        documents,
+        document_metadata,
+    ) = data_loader.get_chunks(
+        links_to_process,
+        [],
     )
 
     print(document_names[:5])
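The `*self.config["metadata"]["metadata_links"]` call in `chunk_docs` only works because `metadata_links` holds exactly the two positional URLs `get_metadata` takes before `config`. A self-contained sketch of that call shape (the function body is stubbed; only the signature and config keys come from this diff):

```python
# Sketch: how star-unpacking metadata_links feeds get_metadata's positional
# parameters. Signature and config keys come from this diff; the body is a stub.
def get_metadata(lectures_url, schedule_url, config):
    # The real function scrapes the course pages; stubbed here.
    return {"lectures_url": lectures_url, "schedule_url": schedule_url}


config = {
    "metadata": {
        "metadata_links": [
            "https://dl4ds.github.io/sp2024/lectures/",
            "https://dl4ds.github.io/sp2024/schedule/",
        ],
        "slide_base_link": "https://dl4ds.github.io",
    }
}

# The list must stay length 2 and in (lectures, schedule) order,
# or the unpacking will misbind the parameters.
addl_metadata = get_metadata(*config["metadata"]["metadata_links"], config)
print(addl_metadata)
```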
code/modules/dataloader/helpers.py CHANGED
@@ -21,7 +21,8 @@ def get_base_url(url):
     return base_url
 
 
-def get_metadata(lectures_url, schedule_url):
+### THIS FUNCTION IS NOT GENERALIZABLE.. IT IS SPECIFIC TO THE COURSE WEBSITE ###
+def get_metadata(lectures_url, schedule_url, config):
     """
     Function to get the lecture metadata from the lectures and schedule URLs.
     """
@@ -50,7 +51,9 @@ def get_metadata(lectures_url, schedule_url):
         slides_link_tag = description_div.find("a", title="Download slides")
         slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
         slides_link = (
-            f"https://dl4ds.github.io{slides_link}" if slides_link else None
+            f"{config['metadata']['slide_base_link']}{slides_link}"
+            if slides_link
+            else None
         )
         if slides_link:
             date_mapping[slides_link] = date
@@ -70,7 +73,9 @@ def get_metadata(lectures_url, schedule_url):
         slides_link_tag = block.find("a", title="Download slides")
         slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
         slides_link = (
-            f"https://dl4ds.github.io{slides_link}" if slides_link else None
+            f"{config['metadata']['slide_base_link']}{slides_link}"
+            if slides_link
+            else None
         )
 
         # Extract the link to the lecture recording
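The net effect is that the slide-link prefix is read from config instead of being baked in. A tiny sketch of the resulting URL construction (the relative href is a made-up example; only the config key comes from this diff):

```python
# Sketch of the new slide-link construction; the relative href below is
# hypothetical, only config['metadata']['slide_base_link'] comes from the diff.
config = {"metadata": {"slide_base_link": "https://dl4ds.github.io"}}

slides_link = "/sp2024/lectures/lec01.pdf"  # hypothetical href scraped from the page
slides_link = (
    f"{config['metadata']['slide_base_link']}{slides_link}"
    if slides_link
    else None
)
print(slides_link)  # -> https://dl4ds.github.io/sp2024/lectures/lec01.pdf
```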
code/modules/retriever/helpers.py CHANGED
@@ -6,7 +6,6 @@ from typing import List
 
 
 class VectorStoreRetrieverScore(VectorStoreRetriever):
-
     # See https://github.com/langchain-ai/langchain/blob/61dd92f8215daef3d9cf1734b0d1f8c70c1571c3/libs/langchain/langchain/vectorstores/base.py#L500
     def _get_relevant_documents(
         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
code/modules/vectorstore/store_manager.py CHANGED
@@ -47,7 +47,6 @@ class VectorStoreManager:
         return logger
 
     def load_files(self):
-
         files = os.listdir(self.config["vectorstore"]["data_path"])
         files = [
             os.path.join(self.config["vectorstore"]["data_path"], file)
@@ -69,7 +68,6 @@ class VectorStoreManager:
         return files, urls
 
     def create_embedding_model(self):
-
         self.logger.info("Creating embedding function")
         embedding_model_loader = EmbeddingModelLoader(self.config)
         embedding_model = embedding_model_loader.load_embedding_model()
@@ -100,7 +98,6 @@ class VectorStoreManager:
         )
 
     def create_database(self):
-
         start_time = time.time()  # Start time for creating database
         data_loader = DataLoader(self.config, self.logger)
         self.logger.info("Loading data")
@@ -110,9 +107,12 @@ class VectorStoreManager:
         self.logger.info(f"Number of webpages: {len(webpages)}")
         if f"{self.config['vectorstore']['url_file_path']}" in files:
             files.remove(f"{self.config['vectorstores']['url_file_path']}")  # cleanup
-        document_chunks, document_names, documents, document_metadata = (
-            data_loader.get_chunks(files, webpages)
-        )
+        (
+            document_chunks,
+            document_names,
+            documents,
+            document_metadata,
+        ) = data_loader.get_chunks(files, webpages)
         num_documents = len(document_chunks)
         self.logger.info(f"Number of documents in the DB: {num_documents}")
         metadata_keys = list(document_metadata[0].keys()) if document_metadata else []
@@ -128,7 +128,6 @@ class VectorStoreManager:
         )
 
     def load_database(self):
-
         start_time = time.time()  # Start time for loading database
         if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma", "RAPTOR"]:
             self.embedding_model = self.create_embedding_model()
@@ -168,19 +167,21 @@ if __name__ == "__main__":
 
     with open("modules/config/config.yml", "r") as f:
         config = yaml.safe_load(f)
-    with open("modules/config/user_config.yml", "r") as f:
-        user_config = yaml.safe_load(f)
+    with open("modules/config/project_config.yml", "r") as f:
+        project_config = yaml.safe_load(f)
+
+    # combine the two configs
+    config.update(project_config)
     print(config)
-    print(user_config)
     print(f"Trying to create database with config: {config}")
     vector_db = VectorStoreManager(config)
     if config["vectorstore"]["load_from_HF"]:
         if (
             config["vectorstore"]["db_option"]
-            in user_config["retriever"]["retriever_hf_paths"]
+            in config["retriever"]["retriever_hf_paths"]
         ):
             vector_db.load_from_HF(
-                HF_PATH=user_config["retriever"]["retriever_hf_paths"][
+                HF_PATH=config["retriever"]["retriever_hf_paths"][
                     config["vectorstore"]["db_option"]
                 ]
             )
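After the merge, `retriever_hf_paths` lives in the combined config, so the Hugging Face index path is looked up per `db_option`. A condensed sketch of that lookup (keys and values come from this diff; `VectorStoreManager` is stubbed out):

```python
# Sketch of the HF-path lookup after the config merge. Keys and values come
# from this diff; the real script calls vector_db.load_from_HF(HF_PATH=...).
config = {
    "vectorstore": {"load_from_HF": True, "db_option": "RAGatouille"},
    "retriever": {"retriever_hf_paths": {"RAGatouille": "XThomasBU/Colbert_Index"}},
}

if config["vectorstore"]["load_from_HF"]:
    db_option = config["vectorstore"]["db_option"]
    hf_paths = config["retriever"]["retriever_hf_paths"]
    if db_option in hf_paths:
        # vector_db.load_from_HF(HF_PATH=hf_paths[db_option]) in the real script
        print(f"would load {db_option} index from {hf_paths[db_option]}")
```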
docs/setup.md CHANGED
@@ -124,4 +124,4 @@ CHAINLIT_URL=<your_chainlit_url>
 # Configuration
 
 The configuration file `code/modules/config.yaml` contains the parameters that control the behaviour of your app.
-The configuration file `code/modules/user_config.yaml` contains user-defined parameters.
+The configuration file `code/modules/project_config.yaml` contains project-specific parameters.