Rauhan committed on
Commit
d3176f4
1 Parent(s): c5522cd

UPDATE: functions

Files changed (2)
  1. app.py +20 -99
  2. functions.py +17 -10
app.py CHANGED
@@ -13,8 +13,7 @@ from src.api.speech_api import speech_translator_router
 from functions import client as supabase
 from urllib.parse import urlparse
 import nltk
-import time
-import uuid
+

 nltk.download('punkt_tab')

@@ -236,67 +235,34 @@ async def newChatbot(chatbotName: str, username: str):
     return createTable(tablename=chatbotName)


-@app.post("/addPDF")
+@app.post("/loadPDF")
 async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
         temp_file.write(pdf)
         temp_file_path = temp_file.name
-    start = time.time()
     text = extractTextFromPdf(temp_file_path)
-    textExtraction = time.time()
     os.remove(temp_file_path)
-    username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
-    df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
-    currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
-    limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
-        "tokenLimit"]
-    newCount = currentCount + len(text)
-    if newCount < int(limit):
-        supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
-            "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
-        output = addDocuments(text=text, source=source, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        newText = ("=" * 75 + "\n").join([timeTaken, uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
-        return output
-    else:
-        return {
-            "output": "DOCUMENT EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
-        }
+    return {
+        "output": text,
+        "source": source
+    }


-@app.post("/scanAndReturnText")
+@app.post("/loadImagePDF")
 async def returnText(pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
-    start = time.time()
     text = getTextFromImagePDF(pdfBytes=pdf)
-    end = time.time()
-    timeTaken = f"{end - start}s"
     return {
-        "source": source,
-        "extractionTime": timeTaken,
-        "output": text
+        "output": text,
+        "source": source
     }


 @app.post("/addText")
-async def addText(vectorstore: str, text: str, source: str | None = None):
+async def addText(vectorstore: str, text: str, source: str = "Text"):
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
     currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
@@ -306,22 +272,7 @@ async def addText(vectorstore: str, text: str, source: str | None = None):
     if newCount < int(limit):
         supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
             "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
         output = addDocuments(text=text, source=source, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        newText = ("=" * 75 + "\n").join([uploadTime, wordCount, tokenCount, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
         return output
     else:
         return {
@@ -354,44 +305,12 @@ async def addQAPairData(addQaPair: AddQAPair):
     }


-@app.post("/addWebsite")
+@app.post("/loadWebURLs")
 async def addWebsite(vectorstore: str, websiteUrls: list[str]):
-    start = time.time()
-    text = extractTextFromUrlList(urls=websiteUrls)
-    textExtraction = time.time()
-    username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
-    df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
-    currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
-    newCount = currentCount + len(text)
-    limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
-        "tokenLimit"]
-    if newCount < int(limit):
-        supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
-            "chatbotname", chatbotname).execute()
-        uploadStart = time.time()
-        output = addDocuments(text=text, source=urlparse(websiteUrls[0]).netloc, vectorstore=vectorstore)
-        uploadEnd = time.time()
-        uploadTime = f"VECTOR UPLOAD TIME: {uploadEnd - uploadStart}s" + "\n"
-        timeTaken = f"TEXT EXTRACTION TIME: {textExtraction - start}s" + "\n"
-        tokenCount = f"TOKEN COUNT: {len(text)}" + "\n"
-        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-        wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
-        links = "LINKS:\n" + "\n".join(websiteUrls) + "\n"
-        newText = ("=" * 75 + "\n").join(
-            [timeTaken, uploadTime, wordCount, tokenCount, links, "TEXT: \n" + text + "\n"])
-        fileId = str(uuid.uuid4())
-        with open(f"{fileId}.txt", "w") as file:
-            file.write(newText)
-        with open(f"{fileId}.txt", "rb") as f:
-            supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
-                                                       file_options={"content-type": "text/plain"})
-        os.remove(f"{fileId}.txt")
-        output["supabaseFileName"] = f"{fileId}.txt"
-        return output
-    else:
-        return {
-            "output": "WEBSITE EXCEEDING LIMITS, PLEASE TRY WITH A SMALLER DOCUMENT."
-        }
+    text = extractTextFromUrlList(urls=websiteUrls)
+    return {
+        "output": text
+    }


 @app.post("/answerQuery")
@@ -422,7 +341,8 @@ async def delete(username: str):
 @app.post("/getLinks")
 async def crawlUrl(baseUrl: str):
     return {
-        "urls": getLinks(url=baseUrl, timeout=30)
+        "urls": getLinks(url=baseUrl, timeout=30),
+        "source": urlparse(baseUrl).netloc
     }


@@ -436,9 +356,10 @@ async def getCount(vectorstore: str):


 @app.post("/getYoutubeTranscript")
-async def getYTTranscript(urls: str):
+async def getYTTranscript(urls: list[str]):
     return {
-        "transcript": getTranscript(urls=urls)
+        "output": getTranscript(urls=urls),
+        "source": "www.youtube.com"
     }

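For reference, a minimal client-side sketch of how the renamed routes above could be exercised. It is not part of the commit: the base URL, vectorstore name, and file name are hypothetical, and it assumes the `requests` package plus a locally running instance of this FastAPI app. Per the handler signatures, `vectorstore` travels as a query parameter, `pdf` as a multipart file field, and the bare `websiteUrls: list[str]` parameter is read by FastAPI from the JSON request body.

import requests

BASE_URL = "http://localhost:8000"  # hypothetical local dev server
VECTORSTORE = "ConversAI$demoUser$demoBot"  # hypothetical "$"-separated store name

# POST /loadPDF: vectorstore as a query parameter, the PDF as multipart form field "pdf"
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/loadPDF",
        params={"vectorstore": VECTORSTORE},
        files={"pdf": ("sample.pdf", f, "application/pdf")},
    )
print(resp.json())  # expected shape: {"output": <text keyed by page number>, "source": "sample.pdf"}

# POST /loadWebURLs: vectorstore as a query parameter, the URL list as the JSON body
resp = requests.post(
    f"{BASE_URL}/loadWebURLs",
    params={"vectorstore": VECTORSTORE},
    json=["https://example.com/docs"],
)
print(resp.json())  # expected shape: {"output": <text keyed by URL>}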
 
functions.py CHANGED
@@ -56,7 +56,7 @@ INSTRUCTIONS:
 2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
 3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
 4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
-Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Include sources to support your answers when possible.
+Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Make sure the user is always happy and satisfied with the outputs you return.
 CONTEXT:
 =====================================
 {context}
@@ -139,14 +139,19 @@ def addDocuments(text: str, source: str, vectorstore: str):


 def format_docs(docs: str):
+    global sources
+    sources = []
     context = ""
     for doc in docs:
-        print("METADATA ::: ", type(doc.metadata))
-        context += f"CONTENT: {doc.page_content}\nSOURCE: {doc.metadata} \n\n\n"
+        context += f"{doc.page_content}\n\n\n"
+        source = doc.metadata
+        source = source["source"]
+        sources.append(source)
     if context == "":
         context = "No context found"
     else:
         pass
+    sources = list(set(sources))
     return context


@@ -171,6 +176,7 @@ def trimMessages(chain_input):
 def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192") -> str:
     global prompt
     global client
+    global sources
     global vectorEmbeddings
     global sparseEmbeddings
     vectorStoreName = vectorstore
@@ -201,7 +207,8 @@ def answerQuery(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192")
         "output": chain.invoke(
             {"question": query},
             {"configurable": {"session_id": vectorStoreName}}
-        )
+        ),
+        "sources": sources
     }


@@ -271,13 +278,12 @@ def getTextFromImagePDF(pdfBytes):
         return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
     allImages = convert_from_bytes(pdfBytes)
     texts = [getText(image) for image in allImages]
-    return "\n\n\n".join(texts)
+    return {x + 1: y for x, y in enumerate(texts)}


 def getTranscript(urls: str):
-    urls = urls.split(",")
     texts = []
-    for url in urls:
+    for url in set(urls):
         try:
             loader = YoutubeLoader.from_youtube_url(
                 url, add_video_info=False
@@ -287,10 +293,11 @@ def getTranscript(urls: str):
         except:
             doc = ""
         texts.append(doc)
-    return "\n\n".join(texts)
+    return {x: y for x, y in zip(urls, texts)}


 def analyzeData(query, dataframe):
+    query += ". In case, you are to plot a chart, make sure the x-axis labels are 90 degree rotated"
     llm = ChatGroq(name="llama-3.1-8b-instant")
     df = SmartDataframe(dataframe, config={"llm": llm, "verbose": False})
     response = df.chat(query)
@@ -312,7 +319,7 @@ def extractTextFromPdf(pdf_path):
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromPage, pages))
     doc.close()
-    return '.'.join(texts)
+    return {x + 1: y for x, y in enumerate(texts)}


 def extractTextFromUrl(url):
@@ -326,4 +333,4 @@ def extractTextFromUrl(url):
 def extractTextFromUrlList(urls):
     with ThreadPoolExecutor() as executor:
         texts = list(executor.map(extractTextFromUrl, urls))
-    return '.'.join(texts)
+    return {x: y for x, y in zip(urls, texts)}
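The extraction helpers above now return dictionaries rather than pre-joined strings (page-number keys for PDFs, URL keys for web pages and transcripts), and answerQuery returns the collected sources alongside the model output. A small illustrative sketch of consuming those shapes, assuming this module is importable as `functions` (the file name and URL are placeholders):

from functions import extractTextFromPdf, getTranscript

pages = extractTextFromPdf("sample.pdf")  # e.g. {1: "page one text", 2: "page two text"}
full_text = "\n\n".join(pages[page] for page in sorted(pages))

transcripts = getTranscript(["https://www.youtube.com/watch?v=abc123"])  # dict keyed by URL
for url, transcript in transcripts.items():
    print(url, len(transcript))

One design note: `sources` is shared between format_docs and answerQuery through a module-level global, so concurrent calls to answerQuery could observe each other's source lists.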