Ethan Chang committed on
Commit
3b63120
2 Parent(s): dd677c3 30045eb

Merge branch 'dev_branch' into remove_tinyllama

Browse files
.github/workflows/push_to_hf_space_prototype.yml CHANGED
@@ -1,20 +1,21 @@
1
  name: Push Prototype to HuggingFace
2
 
3
  on:
4
- pull_request:
5
- branches:
6
- - dev_branch
7
-
 
8
 
9
  jobs:
10
- build:
11
  runs-on: ubuntu-latest
12
  steps:
13
- - name: Deploy Prototype to HuggingFace
14
- uses: nateraw/huggingface-sync-action@v0.0.4
15
- with:
16
- github_repo_id: DL4DS/dl4ds_tutor
17
- huggingface_repo_id: dl4ds/tutor_dev
18
- repo_type: space
19
- space_sdk: static
20
- hf_token: ${{ secrets.HF_TOKEN }}
 
1
  name: Push Prototype to HuggingFace
2
 
3
  on:
4
+ push:
5
+ branches: [dev_branch]
6
+
7
+ # run this workflow manually from the Actions tab
8
+ workflow_dispatch:
9
 
10
  jobs:
11
+ sync-to-hub:
12
  runs-on: ubuntu-latest
13
  steps:
14
+ - uses: actions/checkout@v4
15
+ with:
16
+ fetch-depth: 0
17
+ lfs: true
18
+ - name: Deploy Prototype to HuggingFace
19
+ env:
20
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
21
+ run: git push https://trgardos:$HF_TOKEN@huggingface.co/spaces/dl4ds/tutor_dev dev_branch:main
code/.chainlit/config.toml CHANGED
@@ -23,7 +23,7 @@ allow_origins = ["*"]
23
  unsafe_allow_html = false
24
 
25
  # Process and display mathematical expressions. This can clash with "$" characters in messages.
26
- latex = false
27
 
28
  # Automatically tag threads with the current chat profile (if a chat profile is used)
29
  auto_tag_thread = true
@@ -85,31 +85,34 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
85
  # custom_build = "./public/build"
86
 
87
  [UI.theme]
88
- default = "light"
89
  #layout = "wide"
90
  #font_family = "Inter, sans-serif"
91
  # Override default MUI light theme. (Check theme.ts)
92
  [UI.theme.light]
93
- background = "#FAFAFA"
94
- paper = "#FFFFFF"
95
 
96
  [UI.theme.light.primary]
97
- main = "#b22222" # Brighter shade of red
98
- dark = "#8b0000" # Darker shade of the brighter red
99
- light = "#ff6347" # Lighter shade of the brighter red
100
  [UI.theme.light.text]
101
- primary = "#212121"
102
- secondary = "#616161"
 
103
  # Override default MUI dark theme. (Check theme.ts)
104
  [UI.theme.dark]
105
- background = "#1C1C1C" # Slightly lighter dark background color
106
- paper = "#2A2A2A" # Slightly lighter dark paper color
107
 
108
  [UI.theme.dark.primary]
109
- main = "#89CFF0" # Primary color
110
- dark = "#3700B3" # Dark variant of primary color
111
- light = "#CFBCFF" # Lighter variant of primary color
112
-
 
 
113
 
114
  [meta]
115
- generated_by = "1.1.302"
 
23
  unsafe_allow_html = false
24
 
25
  # Process and display mathematical expressions. This can clash with "$" characters in messages.
26
+ latex = true
27
 
28
  # Automatically tag threads with the current chat profile (if a chat profile is used)
29
  auto_tag_thread = true
 
85
  # custom_build = "./public/build"
86
 
87
  [UI.theme]
88
+ default = "dark"
89
  #layout = "wide"
90
  #font_family = "Inter, sans-serif"
91
  # Override default MUI light theme. (Check theme.ts)
92
  [UI.theme.light]
93
+ #background = "#FAFAFA"
94
+ #paper = "#FFFFFF"
95
 
96
  [UI.theme.light.primary]
97
+ #main = "#F80061"
98
+ #dark = "#980039"
99
+ #light = "#FFE7EB"
100
  [UI.theme.light.text]
101
+ #primary = "#212121"
102
+ #secondary = "#616161"
103
+
104
  # Override default MUI dark theme. (Check theme.ts)
105
  [UI.theme.dark]
106
+ #background = "#FAFAFA"
107
+ #paper = "#FFFFFF"
108
 
109
  [UI.theme.dark.primary]
110
+ #main = "#F80061"
111
+ #dark = "#980039"
112
+ #light = "#FFE7EB"
113
+ [UI.theme.dark.text]
114
+ #primary = "#EEEEEE"
115
+ #secondary = "#BDBDBD"
116
 
117
  [meta]
118
+ generated_by = "1.1.304"
code/main.py CHANGED
@@ -173,4 +173,6 @@ async def main(message):
173
  answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
174
  processor._process(message.content, answer, sources_dict)
175
 
 
 
176
  await cl.Message(content=answer_with_sources, elements=source_elements).send()
 
173
  answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
174
  processor._process(message.content, answer, sources_dict)
175
 
176
+ answer_with_sources = answer_with_sources.replace("$$", "$")
177
+
178
  await cl.Message(content=answer_with_sources, elements=source_elements).send()
code/modules/config/config.yml CHANGED
@@ -3,11 +3,13 @@ log_chunk_dir: '../storage/logs/chunks' # str
3
  device: 'cuda' # str [cuda, cpu]
4
 
5
  vectorstore:
 
 
6
  embedd_files: False # bool
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
  expand_urls: True # bool
10
- db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille, RAPTOR]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
13
  search_top_k : 3 # int
 
3
  device: 'cuda' # str [cuda, cpu]
4
 
5
  vectorstore:
6
+ load_from_HF: True # bool
7
+ HF_path: "XThomasBU/Colbert_Index" # str
8
  embedd_files: False # bool
9
  data_path: '../storage/data' # str
10
  url_file_path: '../storage/data/urls.txt' # str
11
  expand_urls: True # bool
12
+ db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR]
13
  db_path : '../vectorstores' # str
14
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
15
  search_top_k : 3 # int
code/modules/config/constants.py CHANGED
@@ -6,6 +6,7 @@ load_dotenv()
6
  # API Keys - Loaded from the .env file
7
 
8
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
9
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
10
  LITERAL_API_KEY = os.getenv("LITERAL_API_KEY")
11
 
@@ -14,7 +15,10 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
14
  # Prompt Templates
15
 
16
  openai_prompt_template = """Use the following pieces of information to answer the user's question.
17
- If you don't know the answer, just say that you don't know.
 
 
 
18
 
19
  Context: {context}
20
  Question: {question}
@@ -24,7 +28,11 @@ Helpful answer:
24
  """
25
 
26
  openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
 
 
 
27
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
28
  Use the history to answer the question if you can.
29
  Chat History:
30
  {chat_history}
@@ -37,7 +45,7 @@ Helpful answer:
37
 
38
  tinyllama_prompt_template = """
39
  <|im_start|>system
40
- Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a breif and concise answer to the question. Use the history to answer the question if you can.
41
 
42
  Context:
43
  {context}
@@ -56,7 +64,7 @@ Question: {question}
56
 
57
  tinyllama_prompt_template_with_history = """
58
  <|im_start|>system
59
- Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a breif and concise answer to the question.
60
 
61
  Chat History:
62
  {chat_history}
 
6
  # API Keys - Loaded from the .env file
7
 
8
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
9
+ LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
10
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
11
  LITERAL_API_KEY = os.getenv("LITERAL_API_KEY")
12
 
 
15
  # Prompt Templates
16
 
17
  openai_prompt_template = """Use the following pieces of information to answer the user's question.
18
+ You are an intelligent chatbot designed to help students with questions regarding the course.
19
+ Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
20
+ Be sure to explain the parameters and variables in the equations.
21
+ If you don't know the answer, just say that you don't know.
22
 
23
  Context: {context}
24
  Question: {question}
 
28
  """
29
 
30
  openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
31
+ You are an intelligent chatbot designed to help students with questions regarding the course.
32
+ Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
33
+ Be sure to explain the parameters and variables in the equations.
34
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
35
+
36
  Use the history to answer the question if you can.
37
  Chat History:
38
  {chat_history}
 
45
 
46
  tinyllama_prompt_template = """
47
  <|im_start|>system
48
+ Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question. When asked for formulas, give a brief description of the formula and output math equations in LaTeX format between $ signs.
49
 
50
  Context:
51
  {context}
 
64
 
65
  tinyllama_prompt_template_with_history = """
66
  <|im_start|>system
67
+ Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question. Output math equations in LaTeX format between $ signs. Use the history to answer the question if you can.
68
 
69
  Chat History:
70
  {chat_history}
code/modules/dataloader/data_loader.py CHANGED
@@ -20,26 +20,79 @@ from langchain_community.llms import OpenAI
20
  from langchain import PromptTemplate
21
  import json
22
  from concurrent.futures import ThreadPoolExecutor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- from modules.dataloader.helpers import get_metadata
 
 
 
 
 
 
25
 
 
 
 
 
26
 
27
- class PDFReader:
28
- def __init__(self):
29
- pass
 
30
 
31
- def get_loader(self, pdf_path):
32
- loader = PyMuPDFLoader(pdf_path)
33
- return loader
34
 
35
- def get_documents(self, loader):
36
- return loader.load()
 
 
 
 
37
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  class FileReader:
40
- def __init__(self, logger):
41
- self.pdf_reader = PDFReader()
42
  self.logger = logger
 
 
 
 
 
 
 
43
 
44
  def extract_text_from_pdf(self, pdf_path):
45
  text = ""
@@ -51,20 +104,12 @@ class FileReader:
51
  text += page.extract_text()
52
  return text
53
 
54
- def download_pdf_from_url(self, pdf_url):
55
- response = requests.get(pdf_url)
56
- if response.status_code == 200:
57
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
58
- temp_file.write(response.content)
59
- temp_file_path = temp_file.name
60
- return temp_file_path
61
- else:
62
- self.logger.error(f"Failed to download PDF from URL: {pdf_url}")
63
- return None
64
-
65
  def read_pdf(self, temp_file_path: str):
66
- loader = self.pdf_reader.get_loader(temp_file_path)
67
- documents = self.pdf_reader.get_documents(loader)
 
 
 
68
  return documents
69
 
70
  def read_txt(self, temp_file_path: str):
@@ -179,7 +224,6 @@ class ChunkProcessor:
179
  "https://dl4ds.github.io/sp2024/lectures/",
180
  "https://dl4ds.github.io/sp2024/schedule/",
181
  ) # For any additional metadata
182
-
183
  with ThreadPoolExecutor() as executor:
184
  executor.map(
185
  self.process_file,
@@ -245,16 +289,17 @@ class ChunkProcessor:
245
  )
246
  self.document_chunks_full.extend(document_chunks)
247
 
 
248
  self.document_data[file_path] = file_data
249
  self.document_metadata[file_path] = file_metadata
250
 
251
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
252
  file_name = os.path.basename(file_path)
 
253
  if file_name in self.document_data:
254
  return
255
 
256
- file_type = file_name.split(".")[-1].lower()
257
- self.logger.info(f"Reading file {file_index + 1}: {file_path}")
258
 
259
  read_methods = {
260
  "pdf": file_reader.read_pdf,
@@ -269,6 +314,7 @@ class ChunkProcessor:
269
 
270
  try:
271
  documents = read_methods[file_type](file_path)
 
272
  self.process_documents(
273
  documents, file_path, file_type, "file", addl_metadata
274
  )
@@ -330,7 +376,7 @@ class ChunkProcessor:
330
 
331
  class DataLoader:
332
  def __init__(self, config, logger=None):
333
- self.file_reader = FileReader(logger=logger)
334
  self.chunk_processor = ChunkProcessor(config, logger=logger)
335
 
336
  def get_chunks(self, uploaded_files, weblinks):
@@ -348,13 +394,19 @@ if __name__ == "__main__":
348
  with open("../code/modules/config/config.yml", "r") as f:
349
  config = yaml.safe_load(f)
350
 
 
 
 
 
 
351
  data_loader = DataLoader(config, logger=logger)
352
  document_chunks, document_names, documents, document_metadata = (
353
  data_loader.get_chunks(
 
354
  [],
355
- ["https://dl4ds.github.io/sp2024/"],
356
  )
357
  )
358
 
359
- print(document_names)
360
  print(len(document_chunks))
 
 
20
  from langchain import PromptTemplate
21
  import json
22
  from concurrent.futures import ThreadPoolExecutor
23
+ from urllib.parse import urljoin
24
+ import html2text
25
+ import bs4
26
+ import tempfile
27
+ import PyPDF2
28
+ from modules.dataloader.pdf_readers.base import PDFReader
29
+ from modules.dataloader.pdf_readers.llama import LlamaParser
30
+
31
+ try:
32
+ from modules.dataloader.helpers import get_metadata, download_pdf_from_url
33
+ from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
34
+ except:
35
+ from dataloader.helpers import get_metadata, download_pdf_from_url
36
+ from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
37
+
38
+ logger = logging.getLogger(__name__)
39
+ BASE_DIR = os.getcwd()
40
+
41
+
42
+ class HTMLReader:
43
+ def __init__(self):
44
+ pass
45
 
46
+ def read_url(self, url):
47
+ response = requests.get(url)
48
+ if response.status_code == 200:
49
+ return response.text
50
+ else:
51
+ logger.warning(f"Failed to download HTML from URL: {url}")
52
+ return None
53
 
54
+ def check_links(self, base_url, html_content):
55
+ soup = bs4.BeautifulSoup(html_content, "html.parser")
56
+ for link in soup.find_all("a"):
57
+ href = link.get("href")
58
 
59
+ if not href or href.startswith("#"):
60
+ continue
61
+ elif not href.startswith("https"):
62
+ href = href.replace("http", "https")
63
 
64
+ absolute_url = urljoin(base_url, href)
65
+ link['href'] = absolute_url
 
66
 
67
+ resp = requests.head(absolute_url)
68
+ if resp.status_code != 200:
69
+ logger.warning(f"Link {absolute_url} is broken")
70
+ logger.warning(f"Status code: {resp.status_code}")
71
+
72
+ return str(soup)
73
 
74
+ def html_to_md(self, url, html_content):
75
+ html_processed = self.check_links(url, html_content)
76
+ markdown_content = html2text.html2text(html_processed)
77
+ return markdown_content
78
+
79
+ def read_html(self, url):
80
+ html_content = self.read_url(url)
81
+ if html_content:
82
+ return self.html_to_md(url, html_content)
83
+ else:
84
+ return None
85
 
86
  class FileReader:
87
+ def __init__(self, logger, kind):
 
88
  self.logger = logger
89
+ self.kind = kind
90
+ if kind == "llama":
91
+ self.pdf_reader = LlamaParser()
92
+ else:
93
+ self.pdf_reader = PDFReader()
94
+ self.web_reader = HTMLReader()
95
+
96
 
97
  def extract_text_from_pdf(self, pdf_path):
98
  text = ""
 
104
  text += page.extract_text()
105
  return text
106
 
 
 
 
 
 
 
 
 
 
 
 
107
  def read_pdf(self, temp_file_path: str):
108
+ if self.kind == "llama":
109
+ documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
110
+ else:
111
+ loader = self.pdf_reader.get_loader(temp_file_path)
112
+ documents = self.pdf_reader.get_documents(loader)
113
  return documents
114
 
115
  def read_txt(self, temp_file_path: str):
 
224
  "https://dl4ds.github.io/sp2024/lectures/",
225
  "https://dl4ds.github.io/sp2024/schedule/",
226
  ) # For any additional metadata
 
227
  with ThreadPoolExecutor() as executor:
228
  executor.map(
229
  self.process_file,
 
289
  )
290
  self.document_chunks_full.extend(document_chunks)
291
 
292
+ print(f"Processed {file_path}. File_data: {file_data}")
293
  self.document_data[file_path] = file_data
294
  self.document_metadata[file_path] = file_metadata
295
 
296
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
297
  file_name = os.path.basename(file_path)
298
+
299
  if file_name in self.document_data:
300
  return
301
 
302
+ file_type = file_name.split(".")[-1]
 
303
 
304
  read_methods = {
305
  "pdf": file_reader.read_pdf,
 
314
 
315
  try:
316
  documents = read_methods[file_type](file_path)
317
+
318
  self.process_documents(
319
  documents, file_path, file_type, "file", addl_metadata
320
  )
 
376
 
377
  class DataLoader:
378
  def __init__(self, config, logger=None):
379
+ self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
380
  self.chunk_processor = ChunkProcessor(config, logger=logger)
381
 
382
  def get_chunks(self, uploaded_files, weblinks):
 
394
  with open("../code/modules/config/config.yml", "r") as f:
395
  config = yaml.safe_load(f)
396
 
397
+ STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
398
+ uploaded_files = [
399
+ os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
400
+ ]
401
+
402
  data_loader = DataLoader(config, logger=logger)
403
  document_chunks, document_names, documents, document_metadata = (
404
  data_loader.get_chunks(
405
+ ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
406
  [],
 
407
  )
408
  )
409
 
410
+ print(document_names[:5])
411
  print(len(document_chunks))
412
+
code/modules/dataloader/helpers.py CHANGED
@@ -1,7 +1,7 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
- from tqdm import tqdm
4
-
5
 
6
  def get_urls_from_file(file_path: str):
7
  """
@@ -106,3 +106,23 @@ def get_metadata(lectures_url, schedule_url):
106
  continue
107
 
108
  return lecture_metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
+ from urllib.parse import urlparse
4
+ import tempfile
5
 
6
  def get_urls_from_file(file_path: str):
7
  """
 
106
  continue
107
 
108
  return lecture_metadata
109
+
110
+
111
+ def download_pdf_from_url(pdf_url):
112
+ """
113
+ Function to temporarily download a PDF file from a URL and return the local file path.
114
+
115
+ Args:
116
+ pdf_url (str): The URL of the PDF file to download.
117
+
118
+ Returns:
119
+ str: The local file path of the downloaded PDF file.
120
+ """
121
+ response = requests.get(pdf_url)
122
+ if response.status_code == 200:
123
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
124
+ temp_file.write(response.content)
125
+ temp_file_path = temp_file.name
126
+ return temp_file_path
127
+ else:
128
+ return None
code/modules/dataloader/pdf_readers/base.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyMuPDFLoader
2
+
3
+
4
+ class PDFReader:
5
+ def __init__(self):
6
+ pass
7
+
8
+ def get_loader(self, pdf_path):
9
+ loader = PyMuPDFLoader(pdf_path)
10
+ return loader
11
+
12
+ def parse(self, pdf_path):
13
+ loader = self.get_loader(pdf_path)
14
+ return loader.load()
code/modules/dataloader/pdf_readers/llama.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from llama_parse import LlamaParse
4
+ from langchain.schema import Document
5
+ from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
6
+ from modules.dataloader.helpers import download_pdf_from_url
7
+
8
+
9
+
10
+ class LlamaParser:
11
+ def __init__(self):
12
+ self.GPT_API_KEY = OPENAI_API_KEY
13
+ self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
14
+ self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
15
+ self.headers = {
16
+ 'Accept': 'application/json',
17
+ 'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
18
+ }
19
+ self.parser = LlamaParse(
20
+ api_key=LLAMA_CLOUD_API_KEY,
21
+ result_type="markdown",
22
+ verbose=True,
23
+ language="en",
24
+ gpt4o_mode=False,
25
+ # gpt4o_api_key=OPENAI_API_KEY,
26
+ parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
27
+ )
28
+
29
+ def parse(self, pdf_path):
30
+ if not os.path.exists(pdf_path):
31
+ pdf_path = download_pdf_from_url(pdf_path)
32
+
33
+ documents = self.parser.load_data(pdf_path)
34
+ document = [document.to_langchain_format() for document in documents][0]
35
+
36
+ content = document.page_content
37
+ pages = content.split("\n---\n")
38
+ pages = [page.strip() for page in pages]
39
+
40
+ documents = [
41
+ Document(
42
+ page_content=page,
43
+ metadata={"source": pdf_path, "page": i}
44
+ ) for i, page in enumerate(pages)
45
+ ]
46
+
47
+ return documents
48
+
49
+ def make_request(self, pdf_url):
50
+ payload = {
51
+ "gpt4o_mode": "false",
52
+ "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
53
+ }
54
+
55
+ files = [
56
+ ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
57
+ ]
58
+
59
+ response = requests.request(
60
+ "POST", self.parse_url, headers=self.headers, data=payload, files=files)
61
+
62
+ return response.json()['id'], response.json()['status']
63
+
64
+ async def get_result(self, job_id):
65
+ url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
66
+
67
+ response = requests.request("GET", url, headers=self.headers, data={})
68
+
69
+ return response.json()['markdown']
70
+
71
+ async def _parse(self, pdf_path):
72
+ job_id, status = self.make_request(pdf_path)
73
+
74
+ while status != "SUCCESS":
75
+ url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
76
+ response = requests.request("GET", url, headers=self.headers, data={})
77
+ status = response.json()["status"]
78
+
79
+ result = await self.get_result(job_id)
80
+
81
+ documents = [
82
+ Document(
83
+ page_content=result,
84
+ metadata={"source": pdf_path}
85
+ )
86
+ ]
87
+
88
+ return documents
89
+
90
+ async def _parse(self, pdf_path):
91
+ return await self._parse(pdf_path)
92
+
code/modules/dataloader/webpage_crawler.py CHANGED
@@ -66,7 +66,6 @@ class WebpageCrawler:
66
  )
67
  for link in unchecked_links:
68
  dict_links[link] = "Checked"
69
- print(f"Checked: {link}")
70
  dict_links.update(
71
  {
72
  link: "Not-checked"
 
66
  )
67
  for link in unchecked_links:
68
  dict_links[link] = "Checked"
 
69
  dict_links.update(
70
  {
71
  link: "Not-checked"
code/modules/vectorstore/store_manager.py CHANGED
@@ -143,6 +143,14 @@ class VectorStoreManager:
143
  self.logger.info("Loaded database")
144
  return self.loaded_vector_db
145
 
 
 
 
 
 
 
 
 
146
 
147
  if __name__ == "__main__":
148
  import yaml
@@ -152,7 +160,10 @@ if __name__ == "__main__":
152
  print(config)
153
  print(f"Trying to create database with config: {config}")
154
  vector_db = VectorStoreManager(config)
155
- vector_db.create_database()
 
 
 
156
  print("Created database")
157
 
158
  print(f"Trying to load the database")
 
143
  self.logger.info("Loaded database")
144
  return self.loaded_vector_db
145
 
146
+ def load_from_HF(self):
147
+ start_time = time.time() # Start time for loading database
148
+ self.vector_db._load_from_HF()
149
+ end_time = time.time()
150
+ self.logger.info(
151
+ f"Time taken to load database from Hugging Face: {end_time - start_time} seconds"
152
+ )
153
+
154
 
155
  if __name__ == "__main__":
156
  import yaml
 
160
  print(config)
161
  print(f"Trying to create database with config: {config}")
162
  vector_db = VectorStoreManager(config)
163
+ if config["vectorstore"]["load_from_HF"] and "HF_path" in config["vectorstore"]:
164
+ vector_db.load_from_HF()
165
+ else:
166
+ vector_db.create_database()
167
  print("Created database")
168
 
169
  print(f"Trying to load the database")
code/modules/vectorstore/vectorstore.py CHANGED
@@ -2,6 +2,9 @@ from modules.vectorstore.faiss import FaissVectorStore
2
  from modules.vectorstore.chroma import ChromaVectorStore
3
  from modules.vectorstore.colbert import ColbertVectorStore
4
  from modules.vectorstore.raptor import RAPTORVectoreStore
 
 
 
5
 
6
 
7
  class VectorStore:
@@ -50,6 +53,34 @@ class VectorStore:
50
  else:
51
  return self.vectorstore.load_database(embedding_model)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def _as_retriever(self):
54
  return self.vectorstore.as_retriever()
55
 
 
2
  from modules.vectorstore.chroma import ChromaVectorStore
3
  from modules.vectorstore.colbert import ColbertVectorStore
4
  from modules.vectorstore.raptor import RAPTORVectoreStore
5
+ from huggingface_hub import snapshot_download
6
+ import os
7
+ import shutil
8
 
9
 
10
  class VectorStore:
 
53
  else:
54
  return self.vectorstore.load_database(embedding_model)
55
 
56
+ def _load_from_HF(self):
57
+ # Download the snapshot from Hugging Face Hub
58
+ # Note: Download goes to the cache directory
59
+ snapshot_path = snapshot_download(
60
+ repo_id=self.config["vectorstore"]["HF_path"],
61
+ repo_type="dataset",
62
+ force_download=True,
63
+ )
64
+
65
+ # Move the downloaded files to the desired directory
66
+ target_path = os.path.join(
67
+ self.config["vectorstore"]["db_path"],
68
+ "db_" + self.config["vectorstore"]["db_option"],
69
+ )
70
+
71
+ # Create target path if it doesn't exist
72
+ os.makedirs(target_path, exist_ok=True)
73
+
74
+ # move all files and directories from snapshot_path to target_path
75
+ # target path is used while loading the database
76
+ for item in os.listdir(snapshot_path):
77
+ s = os.path.join(snapshot_path, item)
78
+ d = os.path.join(target_path, item)
79
+ if os.path.isdir(s):
80
+ shutil.copytree(s, d, dirs_exist_ok=True)
81
+ else:
82
+ shutil.copy2(s, d)
83
+
84
  def _as_retriever(self):
85
  return self.vectorstore.as_retriever()
86