XThomasBU committed
Commit 6d056d5
1 Parent(s): 4dc8546

updates, added metadata to prompt

Dockerfile CHANGED
@@ -1,14 +1,12 @@
- FROM python:3.9
+ FROM python:3.11

  WORKDIR /code

  COPY ./requirements.txt /code/requirements.txt

- RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+ RUN pip install --upgrade pip

- RUN pip install --no-cache-dir transformers==4.36.2 torch==2.1.2
-
- RUN pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python==0.2.32
+ RUN pip install --no-cache-dir -r /code/requirements.txt

  COPY . /code
Dockerfile.dev ADDED
@@ -0,0 +1,27 @@
+ FROM python:3.11
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --upgrade pip
+
+ RUN pip install --no-cache-dir -r /code/requirements.txt
+
+ COPY . /code
+
+ RUN ls -R
+
+ # Change permissions to allow writing to the directory
+ RUN chmod -R 777 /code
+
+ # Create a logs directory and set permissions
+ RUN mkdir /code/logs && chmod 777 /code/logs
+
+ # Create a cache directory within the application's working directory
+ RUN mkdir /.cache && chmod -R 777 /.cache
+
+ # Expose the port the app runs on
+ EXPOSE 8051
+
+ CMD python code/modules/vector_db.py && chainlit run code/main.py --port 8051
code/config.yml CHANGED
@@ -2,18 +2,18 @@ embedding_options:
    embedd_files: False # bool
    data_path: 'storage/data' # str
    url_file_path: 'storage/data/urls.txt' # str
-   expand_urls: False # bool
+   expand_urls: True # bool
    db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille]
    db_path : 'vectorstores' # str
    model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002']
    search_top_k : 3 # int
    score_threshold : 0.2 # float
  llm_params:
-   use_history: False # bool
+   use_history: True # bool
    memory_window: 3 # int
-   llm_loader: 'local_llm' # str [local_llm, openai]
+   llm_loader: 'openai' # str [local_llm, openai]
    openai_params:
-     model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
+     model: 'gpt-3.5-turbo-1106' # str [gpt-3.5-turbo-1106, gpt-4]
    local_llm_params:
      model: "storage/models/llama-2-7b-chat.Q4_0.gguf"
      model_type: "llama"
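For context, these keys are read elsewhere in the repo through nested dictionary lookups (for example, config["llm_params"]["local_llm_params"]["model"] in code/main.py). A minimal sketch of loading the file with PyYAML, assuming it is run from the repo root; only the key names come from the file above, the branching is illustrative:

import yaml

with open("code/config.yml") as f:
    config = yaml.safe_load(f)

# Keys flipped in this commit: crawl sub-pages, keep chat history, use the OpenAI backend.
expand_urls = config["embedding_options"]["expand_urls"]   # now True
use_history = config["llm_params"]["use_history"]          # now True

if config["llm_params"]["llm_loader"] == "openai":
    model = config["llm_params"]["openai_params"]["model"]       # 'gpt-3.5-turbo-1106'
else:
    model = config["llm_params"]["local_llm_params"]["model"]    # local GGUF path
print(expand_urls, use_history, model)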
code/main.py CHANGED
@@ -38,10 +38,6 @@ logger.addHandler(file_handler)
  @cl.set_chat_profiles
  async def chat_profile():
      return [
-         cl.ChatProfile(
-             name="Llama",
-             markdown_description="Use the local LLM: **Tiny Llama**.",
-         ),
          # cl.ChatProfile(
          #     name="Mistral",
          #     markdown_description="Use the local LLM: **Mistral**.",

@@ -54,6 +50,10 @@ async def chat_profile():
              name="gpt-4",
              markdown_description="Use OpenAI API for **gpt-4**.",
          ),
+         cl.ChatProfile(
+             name="Llama",
+             markdown_description="Use the local LLM: **Tiny Llama**.",
+         ),
      ]

@@ -96,7 +96,7 @@ async def start():
      model = config["llm_params"]["local_llm_params"]["model"]
      msg = cl.Message(content=f"Starting the bot {model}...")
      await msg.send()
-     msg.content = f"Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!"
+     msg.content = opening_message
      await msg.update()

      cl.user_session.set("chain", chain)

@@ -119,6 +119,10 @@ async def main(message):
      answer = res["result"]
      print(f"answer: {answer}")

+     logger.info(f"Question: {res['question']}")
+     logger.info(f"History: {res['chat_history']}")
+     logger.info(f"Answer: {answer}\n")
+
      answer_with_sources, source_elements = get_sources(res, answer)

      await cl.Message(content=answer_with_sources, elements=source_elements).send()
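The new logger.info calls rely on the module-level logger and the file_handler visible in the first hunk header above. A rough reconstruction of that kind of setup, for context only (the log path and format string are assumptions, not the repository's exact values):

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# File handler so each question/history/answer triple lands in a persistent log file.
file_handler = logging.FileHandler("logs/chat.log")  # path is an assumption; Dockerfile.dev creates /code/logs
file_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger.addHandler(file_handler)

logger.info("Question: What is the late policy?")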
code/modules/constants.py CHANGED
@@ -6,7 +6,9 @@ load_dotenv()
  # API Keys - Loaded from the .env file

  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

+ opening_message = f"Hey, What Can I Help You With?\n\nYou can ask me questions about the course logistics, course content, about the final project, or anything else!"

  # Prompt Templates

code/modules/data_loader.py CHANGED
@@ -14,17 +14,15 @@ from llama_parse import LlamaParse
  from langchain.schema import Document
  import logging
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_experimental.text_splitter import SemanticChunker
- from langchain_openai.embeddings import OpenAIEmbeddings
  from ragatouille import RAGPretrainedModel
  from langchain.chains import LLMChain
  from langchain.llms import OpenAI
  from langchain import PromptTemplate

  try:
-     from modules.helpers import get_lecture_metadata
+     from modules.helpers import get_metadata
  except:
-     from helpers import get_lecture_metadata
+     from helpers import get_metadata

  logger = logging.getLogger(__name__)

@@ -96,6 +94,14 @@ class FileReader:
          loader = WebBaseLoader(url)
          return loader.load()

+     def read_tex_from_url(self, tex_url):
+         response = requests.get(tex_url)
+         if response.status_code == 200:
+             return [Document(page_content=response.text)]
+         else:
+             print("Failed to fetch .tex file from URL:", tex_url)
+             return None
+

  class ChunkProcessor:
      def __init__(self, config):

@@ -120,17 +126,6 @@ class ChunkProcessor:
          self.splitter = None
          logger.info("ChunkProcessor instance created")

-     # def extract_metadata(self, document_content):
-
-     #     llm = OpenAI()
-     #     prompt_template = PromptTemplate(
-     #         input_variables=["document_content"],
-     #         template="Extract metadata for this document:\n\n{document_content}\n\nMetadata:",
-     #     )
-     #     chain = LLMChain(llm=llm, prompt=prompt_template)
-     #     metadata = chain.run(document_content=document_content)
-     #     return metadata
-
      def remove_delimiters(self, document_chunks: list):
          for chunk in document_chunks:
              for delimiter in self.config["splitter_options"]["delimiters_to_remove"]:

@@ -151,7 +146,12 @@ class ChunkProcessor:
          self, documents, file_type="txt", source="", page=0, metadata={}
      ):
          documents = [Document(page_content=documents, source=source, page=page)]
-         if file_type == "txt":
+         if (
+             file_type == "txt"
+             or file_type == "docx"
+             or file_type == "srt"
+             or file_type == "tex"
+         ):
              document_chunks = self.splitter.split_documents(documents)
          elif file_type == "pdf":
              document_chunks = documents  # Full page for now

@@ -179,58 +179,54 @@ class ChunkProcessor:
          self.documents = []
          self.document_metadata = []

-         lecture_metadata = get_lecture_metadata(
+         addl_metadata = get_metadata(
              "https://dl4ds.github.io/sp2024/lectures/",
              "https://dl4ds.github.io/sp2024/schedule/",
-         )  # TODO: Use more efficiently
+         )  # For any additional metadata

          for file_index, file_path in enumerate(uploaded_files):
              file_name = os.path.basename(file_path)
-             file_type = file_name.split(".")[-1].lower()
-
-             # try:
-             if file_type == "pdf":
-                 documents = file_reader.read_pdf(file_path)
-             elif file_type == "txt":
-                 documents = file_reader.read_txt(file_path)
-             elif file_type == "docx":
-                 documents = file_reader.read_docx(file_path)
-             elif file_type == "srt":
-                 documents = file_reader.read_srt(file_path)
-             else:
-                 logger.warning(f"Unsupported file type: {file_type}")
-                 continue
-
-             # full_text = ""
-             # for doc in documents:
-             #     full_text += doc.page_content
-             #     break  # getting only first page for now
-
-             # extracted_metadata = self.extract_metadata(full_text)
-
-             for doc in documents:
-                 page_num = doc.metadata.get("page", 0)
-                 self.documents.append(doc.page_content)
-                 self.document_metadata.append({"source": file_path, "page": page_num})
-                 if "lecture" in file_path.lower():
-                     metadata = lecture_metadata.get(file_path, {})
-                     metadata["source_type"] = "lecture"
-                     self.document_metadata[-1].update(metadata)
-                 else:
-                     metadata = {"source_type": "other"}
-
-                 self.child_document_names.append(f"{file_name}_{page_num}")
-
-                 self.parent_document_names.append(file_name)
-                 if self.config["embedding_options"]["db_option"] not in ["RAGatouille"]:
-                     document_chunks = self.process_chunks(
-                         self.documents[-1],
-                         file_type,
-                         source=file_path,
-                         page=page_num,
-                         metadata=metadata,
-                     )
-                     self.document_chunks_full.extend(document_chunks)
+             if file_name not in self.parent_document_names:
+                 file_type = file_name.split(".")[-1].lower()
+
+                 # try:
+                 if file_type == "pdf":
+                     documents = file_reader.read_pdf(file_path)
+                 elif file_type == "txt":
+                     documents = file_reader.read_txt(file_path)
+                 elif file_type == "docx":
+                     documents = file_reader.read_docx(file_path)
+                 elif file_type == "srt":
+                     documents = file_reader.read_srt(file_path)
+                 elif file_type == "tex":
+                     documents = file_reader.read_tex_from_url(file_path)
+                 else:
+                     logger.warning(f"Unsupported file type: {file_type}")
+                     continue
+
+                 for doc in documents:
+                     page_num = doc.metadata.get("page", 0)
+                     self.documents.append(doc.page_content)
+                     self.document_metadata.append(
+                         {"source": file_path, "page": page_num}
+                     )
+                     metadata = addl_metadata.get(file_path, {})
+                     self.document_metadata[-1].update(metadata)
+
+                     self.child_document_names.append(f"{file_name}_{page_num}")
+
+                     self.parent_document_names.append(file_name)
+                     if self.config["embedding_options"]["db_option"] not in [
+                         "RAGatouille"
+                     ]:
+                         document_chunks = self.process_chunks(
+                             self.documents[-1],
+                             file_type,
+                             source=file_path,
+                             page=page_num,
+                             metadata=metadata,
+                         )
+                         self.document_chunks_full.extend(document_chunks)

          # except Exception as e:
          #     logger.error(f"Error processing file {file_name}: {str(e)}")

@@ -252,37 +248,38 @@ class ChunkProcessor:
          logger.info(f"Splitting weblinks: total of {len(weblinks)}")

          for link_index, link in enumerate(weblinks):
-             try:
-                 logger.info(f"\tSplitting link {link_index+1} : {link}")
-                 if "youtube" in link:
-                     documents = file_reader.read_youtube_transcript(link)
-                 else:
-                     documents = file_reader.read_html(link)
-
-                 for doc in documents:
-                     page_num = doc.metadata.get("page", 0)
-                     self.documents.append(doc.page_content)
-                     self.document_metadata.append(
-                         {"source": link, "page": page_num}
-                     )
-                     self.child_document_names.append(f"{link}")
-
-                     self.parent_document_names.append(link)
-                     if self.config["embedding_options"]["db_option"] not in [
-                         "RAGatouille"
-                     ]:
-                         document_chunks = self.process_chunks(
-                             self.documents[-1],
-                             "txt",
-                             source=link,
-                             page=0,
-                             metadata={"source_type": "webpage"},
-                         )
-                         self.document_chunks_full.extend(document_chunks)
-             except Exception as e:
-                 logger.error(
-                     f"Error splitting link {link_index+1} : {link}: {str(e)}"
-                 )
+             if link not in self.parent_document_names:
+                 try:
+                     logger.info(f"\tSplitting link {link_index+1} : {link}")
+                     if "youtube" in link:
+                         documents = file_reader.read_youtube_transcript(link)
+                     else:
+                         documents = file_reader.read_html(link)
+
+                     for doc in documents:
+                         page_num = doc.metadata.get("page", 0)
+                         self.documents.append(doc.page_content)
+                         self.document_metadata.append(
+                             {"source": link, "page": page_num}
+                         )
+                         self.child_document_names.append(f"{link}")
+
+                         self.parent_document_names.append(link)
+                         if self.config["embedding_options"]["db_option"] not in [
+                             "RAGatouille"
+                         ]:
+                             document_chunks = self.process_chunks(
+                                 self.documents[-1],
+                                 "txt",
+                                 source=link,
+                                 page=0,
+                                 metadata={"source_type": "webpage"},
+                             )
+                             self.document_chunks_full.extend(document_chunks)
+                 except Exception as e:
+                     logger.error(
+                         f"Error splitting link {link_index+1} : {link}: {str(e)}"
+                     )


  class DataLoader:
code/modules/embedding_model_loader.py CHANGED
@@ -24,8 +24,12 @@ class EmbeddingModelLoader:
              )
          else:
              embedding_model = HuggingFaceEmbeddings(
-                 model_name="sentence-transformers/all-MiniLM-L6-v2",
-                 model_kwargs={"device": "cpu"},
+                 model_name=self.config["embedding_options"]["model"],
+                 model_kwargs={
+                     "device": "cpu",
+                     "token": f"{HUGGINGFACE_TOKEN}",
+                     "trust_remote_code": True,
+                 },
              )
          # embedding_model = LlamaCppEmbeddings(
          #     model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
code/modules/helpers.py CHANGED
@@ -1,11 +1,15 @@
  import requests
  from bs4 import BeautifulSoup
  from tqdm import tqdm
- from urllib.parse import urlparse
  import chainlit as cl
  from langchain import PromptTemplate
  import requests
  from bs4 import BeautifulSoup
+ from urllib.parse import urlparse, urljoin, urldefrag
+ import asyncio
+ import aiohttp
+ from aiohttp import ClientSession
+ from typing import Dict, Any, List

  try:
      from modules.constants import *

@@ -19,82 +23,112 @@ Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113

  class WebpageCrawler:
      def __init__(self):
-         pass
+         self.dict_href_links = {}

-     def getdata(self, url):
-         r = requests.get(url)
-         return r.text
+     async def fetch(self, session: ClientSession, url: str) -> str:
+         async with session.get(url) as response:
+             try:
+                 return await response.text()
+             except UnicodeDecodeError:
+                 return await response.text(encoding="latin1")

-     def url_exists(self, url):
+     def url_exists(self, url: str) -> bool:
          try:
              response = requests.head(url)
              return response.status_code == 200
          except requests.ConnectionError:
              return False

-     def get_links(self, website_link, base_url=None):
-         if base_url is None:
-             base_url = website_link
-         html_data = self.getdata(website_link)
+     async def get_links(self, session: ClientSession, website_link: str, base_url: str):
+         html_data = await self.fetch(session, website_link)
          soup = BeautifulSoup(html_data, "html.parser")
          list_links = []
          for link in soup.find_all("a", href=True):
-             # clean the link
-             # remove empty spaces
-             link["href"] = link["href"].strip()
-             # Append to list if new link contains original link
-             if str(link["href"]).startswith((str(website_link))):
-                 list_links.append(link["href"])
-
-             # Include all href that do not start with website link but with "/"
-             if str(link["href"]).startswith("/"):
-                 if link["href"] not in self.dict_href_links:
-                     print(link["href"])
-                     self.dict_href_links[link["href"]] = None
-                     link_with_www = base_url + link["href"][1:]
-                     if self.url_exists(link_with_www):
-                         print("adjusted link =", link_with_www)
-                         list_links.append(link_with_www)
-
-         # Convert list of links to dictionary and define keys as the links and the values as "Not-checked"
-         dict_links = dict.fromkeys(list_links, "Not-checked")
-         return dict_links
-
-     def get_subpage_links(self, l, base_url):
-         for link in tqdm(l):
-             print("checking link:", link)
-             if not link.endswith("/"):
-                 l[link] = "Checked"
-                 dict_links_subpages = {}
-             else:
-                 # If not crawled through this page start crawling and get links
-                 if l[link] == "Not-checked":
-                     dict_links_subpages = self.get_links(link, base_url)
-                     # Change the dictionary value of the link to "Checked"
-                     l[link] = "Checked"
-                 else:
-                     # Create an empty dictionary in case every link is checked
-                     dict_links_subpages = {}
-             # Add new dictionary to old dictionary
-             l = {**dict_links_subpages, **l}
-         return l
-
-     def get_all_pages(self, url, base_url):
-         dict_links = {url: "Not-checked"}
-         self.dict_href_links = {}
-         counter, counter2 = None, 0
-         while counter != 0:
-             counter2 += 1
-             dict_links2 = self.get_subpage_links(dict_links, base_url)
-             # Count number of non-values and set counter to 0 if there are no values within the dictionary equal to the string "Not-checked"
-             # https://stackoverflow.com/questions/48371856/count-the-number-of-occurrences-of-a-certain-value-in-a-dictionary-in-python
-             counter = sum(value == "Not-checked" for value in dict_links2.values())
-             dict_links = dict_links2
-         checked_urls = [
-             url for url, status in dict_links.items() if status == "Checked"
-         ]
-         return checked_urls
+             href = link["href"].strip()
+             full_url = urljoin(base_url, href)
+             normalized_url = self.normalize_url(full_url)  # sections removed
+             if (
+                 normalized_url not in self.dict_href_links
+                 and self.is_child_url(normalized_url, base_url)
+                 and self.url_exists(normalized_url)
+             ):
+                 self.dict_href_links[normalized_url] = None
+                 list_links.append(normalized_url)
+
+         return list_links
+
+     async def get_subpage_links(
+         self, session: ClientSession, urls: list, base_url: str
+     ):
+         tasks = [self.get_links(session, url, base_url) for url in urls]
+         results = await asyncio.gather(*tasks)
+         all_links = [link for sublist in results for link in sublist]
+         return all_links
+
+     async def get_all_pages(self, url: str, base_url: str):
+         async with aiohttp.ClientSession() as session:
+             dict_links = {url: "Not-checked"}
+             counter = None
+             while counter != 0:
+                 unchecked_links = [
+                     link
+                     for link, status in dict_links.items()
+                     if status == "Not-checked"
+                 ]
+                 if not unchecked_links:
+                     break
+                 new_links = await self.get_subpage_links(
+                     session, unchecked_links, base_url
+                 )
+                 for link in unchecked_links:
+                     dict_links[link] = "Checked"
+                     print(f"Checked: {link}")
+                 dict_links.update(
+                     {
+                         link: "Not-checked"
+                         for link in new_links
+                         if link not in dict_links
+                     }
+                 )
+                 counter = len(
+                     [
+                         status
+                         for status in dict_links.values()
+                         if status == "Not-checked"
+                     ]
+                 )
+
+             checked_urls = [
+                 url for url, status in dict_links.items() if status == "Checked"
+             ]
+             return checked_urls
+
+     def is_webpage(self, url: str) -> bool:
+         try:
+             response = requests.head(url, allow_redirects=True)
+             content_type = response.headers.get("Content-Type", "").lower()
+             return "text/html" in content_type
+         except requests.RequestException:
+             return False
+
+     def clean_url_list(self, urls):
+         files, webpages = [], []
+
+         for url in urls:
+             if self.is_webpage(url):
+                 webpages.append(url)
+             else:
+                 files.append(url)
+
+         return files, webpages
+
+     def is_child_url(self, url, base_url):
+         return url.startswith(base_url)
+
+     def normalize_url(self, url: str):
+         # Strip the fragment identifier
+         defragged_url, _ = urldefrag(url)
+         return defragged_url


  def get_urls_from_file(file_path: str):

@@ -183,40 +217,38 @@ def get_sources(res, answer):

          name = f"Source {idx + 1} Text\n"
          full_answer += name
-         source_elements.append(cl.Text(name=name, content=source_data["text"]))
+         source_elements.append(
+             cl.Text(name=name, content=source_data["text"], display="side")
+         )

          # Add a PDF element if the source is a PDF file
          if source_data["url"].lower().endswith(".pdf"):
              name = f"Source {idx + 1} PDF\n"
              full_answer += name
              pdf_url = f"{source_data['url']}#page={source_data['page']+1}"
-             source_elements.append(cl.Pdf(name=name, url=pdf_url))
-
-     # Finally, include lecture metadata for each unique source
-     # displayed_urls = set()
-     # full_answer += "\n**Metadata:**\n"
-     # for url_name, source_data in source_dict.items():
-     #     if source_data["url"] not in displayed_urls:
-     #         full_answer += f"\nSource: {source_data['url']}\n"
-     #         full_answer += f"Type: {source_data['source_type']}\n"
-     #         full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
-     #         full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
-     #         full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
-     #         displayed_urls.add(source_data["url"])
+             source_elements.append(cl.Pdf(name=name, url=pdf_url, display="side"))
+
      full_answer += "\n**Metadata:**\n"
-     for url_name, source_data in source_dict.items():
-         full_answer += f"\nSource: {source_data['url']}\n"
-         full_answer += f"Page: {source_data['page']}\n"
-         full_answer += f"Type: {source_data['source_type']}\n"
-         full_answer += f"Date: {source_data['date']}\n"
-         full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
-         full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
-         full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
+     for idx, (url_name, source_data) in enumerate(source_dict.items()):
+         full_answer += f"\nSource {idx + 1} Metadata:\n"
+         source_elements.append(
+             cl.Text(
+                 name=f"Source {idx + 1} Metadata",
+                 content=f"Source: {source_data['url']}\n"
+                 f"Page: {source_data['page']}\n"
+                 f"Type: {source_data['source_type']}\n"
+                 f"Date: {source_data['date']}\n"
+                 f"TL;DR: {source_data['lecture_tldr']}\n"
+                 f"Lecture Recording: {source_data['lecture_recording']}\n"
+                 f"Suggested Readings: {source_data['suggested_readings']}\n",
+                 display="side",
+             )
+         )

      return full_answer, source_elements


- def get_lecture_metadata(lectures_url, schedule_url):
+ def get_metadata(lectures_url, schedule_url):
      """
      Function to get the lecture metadata from the lectures and schedule URLs.
      """
code/modules/llm_tutor.py CHANGED
@@ -5,18 +5,99 @@ from langchain_community.embeddings import OpenAIEmbeddings
  from langchain.vectorstores import FAISS
  from langchain.chains import RetrievalQA, ConversationalRetrievalChain
  from langchain.llms import CTransformers
- from langchain.memory import ConversationBufferWindowMemory
+ from langchain.memory import ConversationBufferWindowMemory, ConversationSummaryBufferMemory
  from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
  import os
  from modules.constants import *
  from modules.helpers import get_prompt
  from modules.chat_model_loader import ChatModelLoader
  from modules.vector_db import VectorDB, VectorDBScore
+ from typing import Dict, Any, Optional
+ from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
+ import inspect
+ from langchain.chains.conversational_retrieval.base import _get_chat_history
+
+
+ class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
+     async def _acall(
+         self,
+         inputs: Dict[str, Any],
+         run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
+     ) -> Dict[str, Any]:
+         _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
+         question = inputs["question"]
+         get_chat_history = self.get_chat_history or _get_chat_history
+         chat_history_str = get_chat_history(inputs["chat_history"])
+         print(f"chat_history_str: {chat_history_str}")
+         if chat_history_str:
+             callbacks = _run_manager.get_child()
+             new_question = await self.question_generator.arun(
+                 question=question, chat_history=chat_history_str, callbacks=callbacks
+             )
+         else:
+             new_question = question
+         accepts_run_manager = (
+             "run_manager" in inspect.signature(self._aget_docs).parameters
+         )
+         if accepts_run_manager:
+             docs = await self._aget_docs(new_question, inputs, run_manager=_run_manager)
+         else:
+             docs = await self._aget_docs(new_question, inputs)  # type: ignore[call-arg]
+
+         output: Dict[str, Any] = {}
+         if self.response_if_no_docs_found is not None and len(docs) == 0:
+             output[self.output_key] = self.response_if_no_docs_found
+         else:
+             new_inputs = inputs.copy()
+             if self.rephrase_question:
+                 new_inputs["question"] = new_question
+             new_inputs["chat_history"] = chat_history_str
+
+             # Prepare the final prompt with metadata
+             context = "\n\n".join(
+                 [
+                     f"Document content: {doc.page_content}\nMetadata: {doc.metadata}"
+                     for doc in docs
+                 ]
+             )
+             final_prompt = f"""
+             You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Use the following pieces of information to answer the user's question.
+             If you don't know the answer, just say that you don't know—don't try to make up an answer.
+             Use the chat history to answer the question only if it's relevant; otherwise, ignore it. The context for the answer will be under "Document context:".
+             Use the metadata from each document to guide the user to the correct sources.
+             The context is ordered by relevance to the question. Give more weight to the most relevant documents.
+             Talk in a friendly and personalized manner, similar to how you would speak to a friend who needs help. Make the conversation engaging and avoid sounding repetitive or robotic.
+
+             Chat History:
+             {chat_history_str}
+
+             Context:
+             {context}
+
+             Question: {new_question}
+             AI Tutor:
+             """
+
+             new_inputs["input"] = final_prompt
+             new_inputs["question"] = final_prompt
+             output["final_prompt"] = final_prompt
+
+             answer = await self.combine_docs_chain.arun(
+                 input_documents=docs, callbacks=_run_manager.get_child(), **new_inputs
+             )
+             output[self.output_key] = answer
+
+         if self.return_source_documents:
+             output["source_documents"] = docs
+         if self.return_generated_question:
+             output["generated_question"] = new_question
+         return output


  class LLMTutor:
      def __init__(self, config, logger=None):
          self.config = config
+         self.llm = self.load_llm()
          self.vector_db = VectorDB(config, logger=logger)
          if self.config["embedding_options"]["embedd_files"]:
              self.vector_db.create_database()

@@ -36,26 +117,28 @@ class LLMTutor:
          if self.config["embedding_options"]["db_option"] in ["FAISS", "Chroma"]:
              retriever = VectorDBScore(
                  vectorstore=db,
-                 search_type="similarity_score_threshold",
-                 search_kwargs={
-                     "score_threshold": self.config["embedding_options"][
-                         "score_threshold"
-                     ],
-                     "k": self.config["embedding_options"]["search_top_k"],
-                 },
+                 # search_type="similarity_score_threshold",
+                 # search_kwargs={
+                 #     "score_threshold": self.config["embedding_options"][
+                 #         "score_threshold"
+                 #     ],
+                 #     "k": self.config["embedding_options"]["search_top_k"],
+                 # },
              )
          elif self.config["embedding_options"]["db_option"] == "RAGatouille":
              retriever = db.as_langchain_retriever(
                  k=self.config["embedding_options"]["search_top_k"]
              )
          if self.config["llm_params"]["use_history"]:
-             memory = ConversationBufferWindowMemory(
+             memory = ConversationSummaryBufferMemory(
+                 llm=llm,
                  k=self.config["llm_params"]["memory_window"],
                  memory_key="chat_history",
                  return_messages=True,
                  output_key="answer",
+                 max_token_limit=128,
              )
-             qa_chain = ConversationalRetrievalChain.from_llm(
+             qa_chain = CustomConversationalRetrievalChain.from_llm(
                  llm=llm,
                  chain_type="stuff",
                  retriever=retriever,

@@ -82,7 +165,6 @@ class LLMTutor:
      # QA Model Function
      def qa_bot(self):
          db = self.vector_db.load_database()
-         self.llm = self.load_llm()
          qa_prompt = self.set_custom_prompt()
          qa = self.retrieval_qa_chain(self.llm, qa_prompt, db)
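The custom chain is the core of this commit: retrieved documents are injected into the final prompt together with their metadata. A minimal sketch of wiring it up outside the LLMTutor class (a plain FAISS retriever stands in for the repo's VectorDBScore, the import path and sample question are assumptions; only the chain, memory, and keyword arguments mirror the diff above):

import asyncio
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryBufferMemory
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from modules.llm_tutor import CustomConversationalRetrievalChain  # import path assumed

# Tiny stand-in vector store so the sketch is self-contained.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_texts(
    ["The final project report is due in the last week of class."], embeddings
)
retriever = db.as_retriever(search_kwargs={"k": 1})

llm = ChatOpenAI(model="gpt-3.5-turbo-1106")  # mirrors openai_params in config.yml
memory = ConversationSummaryBufferMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
    max_token_limit=128,
)
chain = CustomConversationalRetrievalChain.from_llm(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
)

# _acall() condenses the question against the history, retrieves documents, and builds
# a prompt that lists each document's content together with its metadata.
result = asyncio.run(chain.ainvoke({"question": "When is the final project due?"}))
print(result["answer"])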
 
code/modules/vector_db.py CHANGED
@@ -96,21 +96,17 @@ class VectorDB:
          if self.config["embedding_options"]["expand_urls"]:
              all_urls = []
              for url in urls:
-                 base_url = get_base_url(url)
-                 all_urls.extend(self.webpage_crawler.get_all_pages(url, base_url))
+                 loop = asyncio.get_event_loop()
+                 all_urls.extend(
+                     loop.run_until_complete(
+                         self.webpage_crawler.get_all_pages(
+                             url, url
+                         )  # only gets child URLs; to get all URLs, replace the second argument with the base URL
+                     )
+                 )
              urls = all_urls
          return files, urls

-     def clean_url_list(self, urls):
-         # get lecture pdf links
-         lecture_pdfs = [link for link in urls if link.endswith(".pdf")]
-         lecture_pdfs = [link for link in lecture_pdfs if "lecture" in link.lower()]
-         urls = [
-             link for link in urls if link.endswith("/")
-         ]  # only keep links that end with a '/'. Extract Files Seperately
-
-         return urls, lecture_pdfs
-
      def create_embedding_model(self):
          self.logger.info("Creating embedding function")
          self.embedding_model_loader = EmbeddingModelLoader(self.config)

@@ -158,12 +154,11 @@ class VectorDB:
          data_loader = DataLoader(self.config)
          self.logger.info("Loading data")
          files, urls = self.load_files()
-         urls, lecture_pdfs = self.clean_url_list(urls)
-         files += lecture_pdfs
+         files, webpages = self.webpage_crawler.clean_url_list(urls)
          if "storage/data/urls.txt" in files:
              files.remove("storage/data/urls.txt")
          document_chunks, document_names, documents, document_metadata = (
-             data_loader.get_chunks(files, urls)
+             data_loader.get_chunks(files, webpages)
          )
          self.logger.info("Completed loading data")
          self.initialize_database(
public/test.css CHANGED
@@ -1,3 +1,16 @@
  a[href*='https://github.com/Chainlit/chainlit'] {
      visibility: hidden;
+ }
+
+ .message-avatar .MuiAvatar-root {
+     background-color: transparent; /* Remove the background color */
+     color: #FFFFFF; /* Change this to your desired text color */
+     border: 0.25px solid #FFFFFF; /* Add a white border for the circle */
+     border-radius: 50%; /* Ensure the avatar remains circular */
+     background-image: url('http://localhost:8051/logo?theme=dark'); /* Path to your logo */
+     background-size: cover; /* Ensure the logo covers the entire avatar */
+     background-position: center; /* Center the logo */
+     background-repeat: no-repeat; /* Prevent the logo from repeating */
+     width: 38px; /* Adjust the width as needed */
+     height: 38px; /* Adjust the height as needed */
  }
requirements.txt CHANGED
@@ -1,20 +1,20 @@
- streamlit==1.29.0
- PyYAML==6.0.1
+ # Automatically generated by https://github.com/damnever/pigar.
+
+ beautifulsoup4==4.12.3
+ chainlit==1.1.202
+ langchain==0.1.20
+ langchain-community==0.0.38
+ langchain-core==0.1.52
+ llama-parse==0.4.4
  pysrt==1.1.2
- langchain==0.0.353
- tiktoken==0.5.2
- streamlit-chat==0.1.1
- pypdf==3.17.4
- sentence-transformers==2.2.2
- faiss-cpu==1.7.4
- ctransformers==0.2.27
- python-dotenv==1.0.0
- openai==1.6.1
- pymupdf==1.23.8
- chainlit==1.0.200
- beautifulsoup4==4.12.2
- fake-useragent==1.4.0
- git+https://github.com/huggingface/accelerate.git
- llama-cpp-python
- PyPDF2==3.0.1
- ragatouille==0.0.8.post2
+ python-dotenv==1.0.1
+ PyYAML==6.0.1
+ RAGatouille==0.0.8.post2
+ requests==2.32.3
+ torch==2.3.1
+ tqdm==4.66.4
+ transformers==4.41.2
+ llama-cpp-python==0.2.77
+ fake_useragent==1.5.1
+ chromadb==0.5.0
+ pymupdf==1.24.5