Ethan Chang committed on
Commit
3b63120
2 Parent(s): dd677c3 30045eb

Merge branch 'dev_branch' into remove_tinyllama

Browse files
.github/workflows/push_to_hf_space_prototype.yml CHANGED
@@ -1,20 +1,21 @@
1
  name: Push Prototype to HuggingFace
2
 
3
  on:
4
- pull_request:
5
- branches:
6
- - dev_branch
7
-
 
8
 
9
  jobs:
10
- build:
11
  runs-on: ubuntu-latest
12
  steps:
13
- - name: Deploy Prototype to HuggingFace
14
- uses: nateraw/huggingface-sync-action@v0.0.4
15
- with:
16
- github_repo_id: DL4DS/dl4ds_tutor
17
- huggingface_repo_id: dl4ds/tutor_dev
18
- repo_type: space
19
- space_sdk: static
20
- hf_token: ${{ secrets.HF_TOKEN }}
 
1
  name: Push Prototype to HuggingFace
2
 
3
  on:
4
+ push:
5
+ branches: [dev_branch]
6
+
7
+ # run this workflow manually from the Actions tab
8
+ workflow_dispatch:
9
 
10
  jobs:
11
+ sync-to-hub:
12
  runs-on: ubuntu-latest
13
  steps:
14
+ - uses: actions/checkout@v4
15
+ with:
16
+ fetch-depth: 0
17
+ lfs: true
18
+ - name: Deploy Prototype to HuggingFace
19
+ env:
20
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
21
+ run: git push https://trgardos:$HF_TOKEN@huggingface.co/spaces/dl4ds/tutor_dev dev_branch:main
code/.chainlit/config.toml CHANGED
@@ -23,7 +23,7 @@ allow_origins = ["*"]
23
  unsafe_allow_html = false
24
 
25
  # Process and display mathematical expressions. This can clash with "$" characters in messages.
26
- latex = false
27
 
28
  # Automatically tag threads with the current chat profile (if a chat profile is used)
29
  auto_tag_thread = true
@@ -85,31 +85,34 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
85
  # custom_build = "./public/build"
86
 
87
  [UI.theme]
88
- default = "light"
89
  #layout = "wide"
90
  #font_family = "Inter, sans-serif"
91
  # Override default MUI light theme. (Check theme.ts)
92
  [UI.theme.light]
93
- background = "#FAFAFA"
94
- paper = "#FFFFFF"
95
 
96
  [UI.theme.light.primary]
97
- main = "#b22222" # Brighter shade of red
98
- dark = "#8b0000" # Darker shade of the brighter red
99
- light = "#ff6347" # Lighter shade of the brighter red
100
  [UI.theme.light.text]
101
- primary = "#212121"
102
- secondary = "#616161"
 
103
  # Override default MUI dark theme. (Check theme.ts)
104
  [UI.theme.dark]
105
- background = "#1C1C1C" # Slightly lighter dark background color
106
- paper = "#2A2A2A" # Slightly lighter dark paper color
107
 
108
  [UI.theme.dark.primary]
109
- main = "#89CFF0" # Primary color
110
- dark = "#3700B3" # Dark variant of primary color
111
- light = "#CFBCFF" # Lighter variant of primary color
112
-
 
 
113
 
114
  [meta]
115
- generated_by = "1.1.302"
 
23
  unsafe_allow_html = false
24
 
25
  # Process and display mathematical expressions. This can clash with "$" characters in messages.
26
+ latex = true
27
 
28
  # Automatically tag threads with the current chat profile (if a chat profile is used)
29
  auto_tag_thread = true
 
85
  # custom_build = "./public/build"
86
 
87
  [UI.theme]
88
+ default = "dark"
89
  #layout = "wide"
90
  #font_family = "Inter, sans-serif"
91
  # Override default MUI light theme. (Check theme.ts)
92
  [UI.theme.light]
93
+ #background = "#FAFAFA"
94
+ #paper = "#FFFFFF"
95
 
96
  [UI.theme.light.primary]
97
+ #main = "#F80061"
98
+ #dark = "#980039"
99
+ #light = "#FFE7EB"
100
  [UI.theme.light.text]
101
+ #primary = "#212121"
102
+ #secondary = "#616161"
103
+
104
  # Override default MUI dark theme. (Check theme.ts)
105
  [UI.theme.dark]
106
+ #background = "#FAFAFA"
107
+ #paper = "#FFFFFF"
108
 
109
  [UI.theme.dark.primary]
110
+ #main = "#F80061"
111
+ #dark = "#980039"
112
+ #light = "#FFE7EB"
113
+ [UI.theme.dark.text]
114
+ #primary = "#EEEEEE"
115
+ #secondary = "#BDBDBD"
116
 
117
  [meta]
118
+ generated_by = "1.1.304"
code/main.py CHANGED
@@ -173,4 +173,6 @@ async def main(message):
173
  answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
174
  processor._process(message.content, answer, sources_dict)
175
 
 
 
176
  await cl.Message(content=answer_with_sources, elements=source_elements).send()
 
173
  answer_with_sources, source_elements, sources_dict = get_sources(res, answer)
174
  processor._process(message.content, answer, sources_dict)
175
 
176
+ answer_with_sources = answer_with_sources.replace("$$", "$")
177
+
178
  await cl.Message(content=answer_with_sources, elements=source_elements).send()
code/modules/config/config.yml CHANGED
@@ -3,11 +3,13 @@ log_chunk_dir: '../storage/logs/chunks' # str
3
  device: 'cuda' # str [cuda, cpu]
4
 
5
  vectorstore:
 
 
6
  embedd_files: False # bool
7
  data_path: '../storage/data' # str
8
  url_file_path: '../storage/data/urls.txt' # str
9
  expand_urls: True # bool
10
- db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille, RAPTOR]
11
  db_path : '../vectorstores' # str
12
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
13
  search_top_k : 3 # int
 
3
  device: 'cuda' # str [cuda, cpu]
4
 
5
  vectorstore:
6
+ load_from_HF: True # bool
7
+ HF_path: "XThomasBU/Colbert_Index" # str
8
  embedd_files: False # bool
9
  data_path: '../storage/data' # str
10
  url_file_path: '../storage/data/urls.txt' # str
11
  expand_urls: True # bool
12
+ db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR]
13
  db_path : '../vectorstores' # str
14
  model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
15
  search_top_k : 3 # int
code/modules/config/constants.py CHANGED
@@ -6,6 +6,7 @@ load_dotenv()
6
  # API Keys - Loaded from the .env file
7
 
8
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
9
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
10
  LITERAL_API_KEY = os.getenv("LITERAL_API_KEY")
11
 
@@ -14,7 +15,10 @@ opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me question
14
  # Prompt Templates
15
 
16
  openai_prompt_template = """Use the following pieces of information to answer the user's question.
17
- If you don't know the answer, just say that you don't know.
 
 
 
18
 
19
  Context: {context}
20
  Question: {question}
@@ -24,7 +28,11 @@ Helpful answer:
24
  """
25
 
26
  openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
 
 
 
27
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
28
  Use the history to answer the question if you can.
29
  Chat History:
30
  {chat_history}
@@ -37,7 +45,7 @@ Helpful answer:
37
 
38
  tinyllama_prompt_template = """
39
  <|im_start|>system
40
- Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a breif and concise answer to the question. Use the history to answer the question if you can.
41
 
42
  Context:
43
  {context}
@@ -56,7 +64,7 @@ Question: {question}
56
 
57
  tinyllama_prompt_template_with_history = """
58
  <|im_start|>system
59
- Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a breif and concise answer to the question.
60
 
61
  Chat History:
62
  {chat_history}
 
6
  # API Keys - Loaded from the .env file
7
 
8
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
9
+ LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
10
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
11
  LITERAL_API_KEY = os.getenv("LITERAL_API_KEY")
12
 
 
15
  # Prompt Templates
16
 
17
  openai_prompt_template = """Use the following pieces of information to answer the user's question.
18
+ You are an intelligent chatbot designed to help students with questions regarding the course.
19
+ Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
20
+ Be sure to explain the parameters and variables in the equations.
21
+ If you don't know the answer, just say that you don't know.
22
 
23
  Context: {context}
24
  Question: {question}
 
28
  """
29
 
30
  openai_prompt_template_with_history = """Use the following pieces of information to answer the user's question.
31
+ You are an intelligent chatbot designed to help students with questions regarding the course.
32
+ Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context.
33
+ Be sure to explain the parameters and variables in the equations.
34
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
35
+
36
  Use the history to answer the question if you can.
37
  Chat History:
38
  {chat_history}
 
45
 
46
  tinyllama_prompt_template = """
47
  <|im_start|>system
48
+ Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question. When asked for formulas, give a brief description of the formula and output math equations in LaTeX format between $ signs.
49
 
50
  Context:
51
  {context}
 
64
 
65
  tinyllama_prompt_template_with_history = """
66
  <|im_start|>system
67
+ Assistant is an intelligent chatbot designed to help students with questions regarding the course. Only answer questions using the context below and if you're not sure of an answer, you can say "I don't know". Always give a brief and concise answer to the question. Output math equations in LaTeX format between $ signs. Use the history to answer the question if you can.
68
 
69
  Chat History:
70
  {chat_history}
code/modules/dataloader/data_loader.py CHANGED
@@ -20,26 +20,79 @@ from langchain_community.llms import OpenAI
20
  from langchain import PromptTemplate
21
  import json
22
  from concurrent.futures import ThreadPoolExecutor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- from modules.dataloader.helpers import get_metadata
 
 
 
 
 
 
25
 
 
 
 
 
26
 
27
- class PDFReader:
28
- def __init__(self):
29
- pass
 
30
 
31
- def get_loader(self, pdf_path):
32
- loader = PyMuPDFLoader(pdf_path)
33
- return loader
34
 
35
- def get_documents(self, loader):
36
- return loader.load()
 
 
 
 
37
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  class FileReader:
40
- def __init__(self, logger):
41
- self.pdf_reader = PDFReader()
42
  self.logger = logger
 
 
 
 
 
 
 
43
 
44
  def extract_text_from_pdf(self, pdf_path):
45
  text = ""
@@ -51,20 +104,12 @@ class FileReader:
51
  text += page.extract_text()
52
  return text
53
 
54
- def download_pdf_from_url(self, pdf_url):
55
- response = requests.get(pdf_url)
56
- if response.status_code == 200:
57
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
58
- temp_file.write(response.content)
59
- temp_file_path = temp_file.name
60
- return temp_file_path
61
- else:
62
- self.logger.error(f"Failed to download PDF from URL: {pdf_url}")
63
- return None
64
-
65
  def read_pdf(self, temp_file_path: str):
66
- loader = self.pdf_reader.get_loader(temp_file_path)
67
- documents = self.pdf_reader.get_documents(loader)
 
 
 
68
  return documents
69
 
70
  def read_txt(self, temp_file_path: str):
@@ -179,7 +224,6 @@ class ChunkProcessor:
179
  "https://dl4ds.github.io/sp2024/lectures/",
180
  "https://dl4ds.github.io/sp2024/schedule/",
181
  ) # For any additional metadata
182
-
183
  with ThreadPoolExecutor() as executor:
184
  executor.map(
185
  self.process_file,
@@ -245,16 +289,17 @@ class ChunkProcessor:
245
  )
246
  self.document_chunks_full.extend(document_chunks)
247
 
 
248
  self.document_data[file_path] = file_data
249
  self.document_metadata[file_path] = file_metadata
250
 
251
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
252
  file_name = os.path.basename(file_path)
 
253
  if file_name in self.document_data:
254
  return
255
 
256
- file_type = file_name.split(".")[-1].lower()
257
- self.logger.info(f"Reading file {file_index + 1}: {file_path}")
258
 
259
  read_methods = {
260
  "pdf": file_reader.read_pdf,
@@ -269,6 +314,7 @@ class ChunkProcessor:
269
 
270
  try:
271
  documents = read_methods[file_type](file_path)
 
272
  self.process_documents(
273
  documents, file_path, file_type, "file", addl_metadata
274
  )
@@ -330,7 +376,7 @@ class ChunkProcessor:
330
 
331
  class DataLoader:
332
  def __init__(self, config, logger=None):
333
- self.file_reader = FileReader(logger=logger)
334
  self.chunk_processor = ChunkProcessor(config, logger=logger)
335
 
336
  def get_chunks(self, uploaded_files, weblinks):
@@ -348,13 +394,19 @@ if __name__ == "__main__":
348
  with open("../code/modules/config/config.yml", "r") as f:
349
  config = yaml.safe_load(f)
350
 
 
 
 
 
 
351
  data_loader = DataLoader(config, logger=logger)
352
  document_chunks, document_names, documents, document_metadata = (
353
  data_loader.get_chunks(
 
354
  [],
355
- ["https://dl4ds.github.io/sp2024/"],
356
  )
357
  )
358
 
359
- print(document_names)
360
  print(len(document_chunks))
 
 
20
  from langchain import PromptTemplate
21
  import json
22
  from concurrent.futures import ThreadPoolExecutor
23
+ from urllib.parse import urljoin
24
+ import html2text
25
+ import bs4
26
+ import tempfile
27
+ import PyPDF2
28
+ from modules.dataloader.pdf_readers.base import PDFReader
29
+ from modules.dataloader.pdf_readers.llama import LlamaParser
30
+
31
+ try:
32
+ from modules.dataloader.helpers import get_metadata, download_pdf_from_url
33
+ from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
34
+ except:
35
+ from dataloader.helpers import get_metadata, download_pdf_from_url
36
+ from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
37
+
38
+ logger = logging.getLogger(__name__)
39
+ BASE_DIR = os.getcwd()
40
+
41
+
42
+ class HTMLReader:
43
+ def __init__(self):
44
+ pass
45
 
46
+ def read_url(self, url):
47
+ response = requests.get(url)
48
+ if response.status_code == 200:
49
+ return response.text
50
+ else:
51
+ logger.warning(f"Failed to download HTML from URL: {url}")
52
+ return None
53
 
54
+ def check_links(self, base_url, html_content):
55
+ soup = bs4.BeautifulSoup(html_content, "html.parser")
56
+ for link in soup.find_all("a"):
57
+ href = link.get("href")
58
 
59
+ if not href or href.startswith("#"):
60
+ continue
61
+ elif not href.startswith("https"):
62
+ href = href.replace("http", "https")
63
 
64
+ absolute_url = urljoin(base_url, href)
65
+ link['href'] = absolute_url
 
66
 
67
+ resp = requests.head(absolute_url)
68
+ if resp.status_code != 200:
69
+ logger.warning(f"Link {absolute_url} is broken")
70
+ logger.warning(f"Status code: {resp.status_code}")
71
+
72
+ return str(soup)
73
 
74
+ def html_to_md(self, url, html_content):
75
+ html_processed = self.check_links(url, html_content)
76
+ markdown_content = html2text.html2text(html_processed)
77
+ return markdown_content
78
+
79
+ def read_html(self, url):
80
+ html_content = self.read_url(url)
81
+ if html_content:
82
+ return self.html_to_md(url, html_content)
83
+ else:
84
+ return None
85
 
86
  class FileReader:
87
+ def __init__(self, logger, kind):
 
88
  self.logger = logger
89
+ self.kind = kind
90
+ if kind == "llama":
91
+ self.pdf_reader = LlamaParser()
92
+ else:
93
+ self.pdf_reader = PDFReader()
94
+ self.web_reader = HTMLReader()
95
+
96
 
97
  def extract_text_from_pdf(self, pdf_path):
98
  text = ""
 
104
  text += page.extract_text()
105
  return text
106
 
 
 
 
 
 
 
 
 
 
 
 
107
  def read_pdf(self, temp_file_path: str):
108
+ if self.kind == "llama":
109
+ documents = self.pdf_reader.parse(temp_file_path) # asyncio.run(self.pdf_reader.parse(temp_file_path)) if using async
110
+ else:
111
+ loader = self.pdf_reader.get_loader(temp_file_path)
112
+ documents = self.pdf_reader.get_documents(loader)
113
  return documents
114
 
115
  def read_txt(self, temp_file_path: str):
 
224
  "https://dl4ds.github.io/sp2024/lectures/",
225
  "https://dl4ds.github.io/sp2024/schedule/",
226
  ) # For any additional metadata
 
227
  with ThreadPoolExecutor() as executor:
228
  executor.map(
229
  self.process_file,
 
289
  )
290
  self.document_chunks_full.extend(document_chunks)
291
 
292
+ print(f"Processed {file_path}. File_data: {file_data}")
293
  self.document_data[file_path] = file_data
294
  self.document_metadata[file_path] = file_metadata
295
 
296
  def process_file(self, file_path, file_index, file_reader, addl_metadata):
297
  file_name = os.path.basename(file_path)
298
+
299
  if file_name in self.document_data:
300
  return
301
 
302
+ file_type = file_name.split(".")[-1]
 
303
 
304
  read_methods = {
305
  "pdf": file_reader.read_pdf,
 
314
 
315
  try:
316
  documents = read_methods[file_type](file_path)
317
+
318
  self.process_documents(
319
  documents, file_path, file_type, "file", addl_metadata
320
  )
 
376
 
377
  class DataLoader:
378
  def __init__(self, config, logger=None):
379
+ self.file_reader = FileReader(logger=logger, kind=config["llm_params"]["pdf_reader"])
380
  self.chunk_processor = ChunkProcessor(config, logger=logger)
381
 
382
  def get_chunks(self, uploaded_files, weblinks):
 
394
  with open("../code/modules/config/config.yml", "r") as f:
395
  config = yaml.safe_load(f)
396
 
397
+ STORAGE_DIR = os.path.join(BASE_DIR, config['vectorstore']["data_path"])
398
+ uploaded_files = [
399
+ os.path.join(STORAGE_DIR, file) for file in os.listdir(STORAGE_DIR) if file != "urls.txt"
400
+ ]
401
+
402
  data_loader = DataLoader(config, logger=logger)
403
  document_chunks, document_names, documents, document_metadata = (
404
  data_loader.get_chunks(
405
+ ["https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf"],
406
  [],
 
407
  )
408
  )
409
 
410
+ print(document_names[:5])
411
  print(len(document_chunks))
412
+
code/modules/dataloader/helpers.py CHANGED
@@ -1,7 +1,7 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
- from tqdm import tqdm
4
-
5
 
6
  def get_urls_from_file(file_path: str):
7
  """
@@ -106,3 +106,23 @@ def get_metadata(lectures_url, schedule_url):
106
  continue
107
 
108
  return lecture_metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
+ from urllib.parse import urlparse
4
+ import tempfile
5
 
6
  def get_urls_from_file(file_path: str):
7
  """
 
106
  continue
107
 
108
  return lecture_metadata
109
+
110
+
111
+ def download_pdf_from_url(pdf_url):
112
+ """
113
+ Function to temporarily download a PDF file from a URL and return the local file path.
114
+
115
+ Args:
116
+ pdf_url (str): The URL of the PDF file to download.
117
+
118
+ Returns:
119
+ str: The local file path of the downloaded PDF file.
120
+ """
121
+ response = requests.get(pdf_url)
122
+ if response.status_code == 200:
123
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
124
+ temp_file.write(response.content)
125
+ temp_file_path = temp_file.name
126
+ return temp_file_path
127
+ else:
128
+ return None
code/modules/dataloader/pdf_readers/base.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyMuPDFLoader
2
+
3
+
4
+ class PDFReader:
5
+ def __init__(self):
6
+ pass
7
+
8
+ def get_loader(self, pdf_path):
9
+ loader = PyMuPDFLoader(pdf_path)
10
+ return loader
11
+
12
+ def parse(self, pdf_path):
13
+ loader = self.get_loader(pdf_path)
14
+ return loader.load()
code/modules/dataloader/pdf_readers/llama.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from llama_parse import LlamaParse
4
+ from langchain.schema import Document
5
+ from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
6
+ from modules.dataloader.helpers import download_pdf_from_url
7
+
8
+
9
+
10
+ class LlamaParser:
11
+ def __init__(self):
12
+ self.GPT_API_KEY = OPENAI_API_KEY
13
+ self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
14
+ self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
15
+ self.headers = {
16
+ 'Accept': 'application/json',
17
+ 'Authorization': f'Bearer {LLAMA_CLOUD_API_KEY}'
18
+ }
19
+ self.parser = LlamaParse(
20
+ api_key=LLAMA_CLOUD_API_KEY,
21
+ result_type="markdown",
22
+ verbose=True,
23
+ language="en",
24
+ gpt4o_mode=False,
25
+ # gpt4o_api_key=OPENAI_API_KEY,
26
+ parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
27
+ )
28
+
29
+ def parse(self, pdf_path):
30
+ if not os.path.exists(pdf_path):
31
+ pdf_path = download_pdf_from_url(pdf_path)
32
+
33
+ documents = self.parser.load_data(pdf_path)
34
+ document = [document.to_langchain_format() for document in documents][0]
35
+
36
+ content = document.page_content
37
+ pages = content.split("\n---\n")
38
+ pages = [page.strip() for page in pages]
39
+
40
+ documents = [
41
+ Document(
42
+ page_content=page,
43
+ metadata={"source": pdf_path, "page": i}
44
+ ) for i, page in enumerate(pages)
45
+ ]
46
+
47
+ return documents
48
+
49
+ def make_request(self, pdf_url):
50
+ payload = {
51
+ "gpt4o_mode": "false",
52
+ "parsing_instruction": "The provided document is a PDF of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides and convert them to markdown format. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$. For images, give a description and if you can, a source.",
53
+ }
54
+
55
+ files = [
56
+ ('file', ('file', requests.get(pdf_url).content, 'application/octet-stream'))
57
+ ]
58
+
59
+ response = requests.request(
60
+ "POST", self.parse_url, headers=self.headers, data=payload, files=files)
61
+
62
+ return response.json()['id'], response.json()['status']
63
+
64
+ async def get_result(self, job_id):
65
+ url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
66
+
67
+ response = requests.request("GET", url, headers=self.headers, data={})
68
+
69
+ return response.json()['markdown']
70
+
71
+ async def _parse(self, pdf_path):
72
+ job_id, status = self.make_request(pdf_path)
73
+
74
+ while status != "SUCCESS":
75
+ url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}"
76
+ response = requests.request("GET", url, headers=self.headers, data={})
77
+ status = response.json()["status"]
78
+
79
+ result = await self.get_result(job_id)
80
+
81
+ documents = [
82
+ Document(
83
+ page_content=result,
84
+ metadata={"source": pdf_path}
85
+ )
86
+ ]
87
+
88
+ return documents
89
+
90
+ async def _parse(self, pdf_path):
91
+ return await self._parse(pdf_path)
92
+
code/modules/dataloader/webpage_crawler.py CHANGED
@@ -66,7 +66,6 @@ class WebpageCrawler:
66
  )
67
  for link in unchecked_links:
68
  dict_links[link] = "Checked"
69
- print(f"Checked: {link}")
70
  dict_links.update(
71
  {
72
  link: "Not-checked"
 
66
  )
67
  for link in unchecked_links:
68
  dict_links[link] = "Checked"
 
69
  dict_links.update(
70
  {
71
  link: "Not-checked"
code/modules/vectorstore/store_manager.py CHANGED
@@ -143,6 +143,14 @@ class VectorStoreManager:
143
  self.logger.info("Loaded database")
144
  return self.loaded_vector_db
145
 
 
 
 
 
 
 
 
 
146
 
147
  if __name__ == "__main__":
148
  import yaml
@@ -152,7 +160,10 @@ if __name__ == "__main__":
152
  print(config)
153
  print(f"Trying to create database with config: {config}")
154
  vector_db = VectorStoreManager(config)
155
- vector_db.create_database()
 
 
 
156
  print("Created database")
157
 
158
  print(f"Trying to load the database")
 
143
  self.logger.info("Loaded database")
144
  return self.loaded_vector_db
145
 
146
+ def load_from_HF(self):
147
+ start_time = time.time() # Start time for loading database
148
+ self.vector_db._load_from_HF()
149
+ end_time = time.time()
150
+ self.logger.info(
151
+ f"Time taken to load database from Hugging Face: {end_time - start_time} seconds"
152
+ )
153
+
154
 
155
  if __name__ == "__main__":
156
  import yaml
 
160
  print(config)
161
  print(f"Trying to create database with config: {config}")
162
  vector_db = VectorStoreManager(config)
163
+ if config["vectorstore"]["load_from_HF"] and "HF_path" in config["vectorstore"]:
164
+ vector_db.load_from_HF()
165
+ else:
166
+ vector_db.create_database()
167
  print("Created database")
168
 
169
  print(f"Trying to load the database")
code/modules/vectorstore/vectorstore.py CHANGED
@@ -2,6 +2,9 @@ from modules.vectorstore.faiss import FaissVectorStore
2
  from modules.vectorstore.chroma import ChromaVectorStore
3
  from modules.vectorstore.colbert import ColbertVectorStore
4
  from modules.vectorstore.raptor import RAPTORVectoreStore
 
 
 
5
 
6
 
7
  class VectorStore:
@@ -50,6 +53,34 @@ class VectorStore:
50
  else:
51
  return self.vectorstore.load_database(embedding_model)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def _as_retriever(self):
54
  return self.vectorstore.as_retriever()
55
 
 
2
  from modules.vectorstore.chroma import ChromaVectorStore
3
  from modules.vectorstore.colbert import ColbertVectorStore
4
  from modules.vectorstore.raptor import RAPTORVectoreStore
5
+ from huggingface_hub import snapshot_download
6
+ import os
7
+ import shutil
8
 
9
 
10
  class VectorStore:
 
53
  else:
54
  return self.vectorstore.load_database(embedding_model)
55
 
56
+ def _load_from_HF(self):
57
+ # Download the snapshot from Hugging Face Hub
58
+ # Note: Download goes to the cache directory
59
+ snapshot_path = snapshot_download(
60
+ repo_id=self.config["vectorstore"]["HF_path"],
61
+ repo_type="dataset",
62
+ force_download=True,
63
+ )
64
+
65
+ # Move the downloaded files to the desired directory
66
+ target_path = os.path.join(
67
+ self.config["vectorstore"]["db_path"],
68
+ "db_" + self.config["vectorstore"]["db_option"],
69
+ )
70
+
71
+ # Create target path if it doesn't exist
72
+ os.makedirs(target_path, exist_ok=True)
73
+
74
+ # move all files and directories from snapshot_path to target_path
75
+ # target path is used while loading the database
76
+ for item in os.listdir(snapshot_path):
77
+ s = os.path.join(snapshot_path, item)
78
+ d = os.path.join(target_path, item)
79
+ if os.path.isdir(s):
80
+ shutil.copytree(s, d, dirs_exist_ok=True)
81
+ else:
82
+ shutil.copy2(s, d)
83
+
84
  def _as_retriever(self):
85
  return self.vectorstore.as_retriever()
86