Rauhan committed on
Commit
80cfec3
1 Parent(s): f176992

UPDATE: trainChatbot

Browse files
Files changed (2) hide show
  1. app.py +16 -2
  2. functions.py +12 -5
app.py CHANGED
@@ -320,7 +320,7 @@ async def loadText(addTextConfig: AddText):
320
  vectorstore, text = addTextConfig.vectorstore, addTextConfig.text
321
  username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
322
  dct = {
323
- "output": text,
324
  "source": "Text"
325
  }
326
  dct = json.dumps(dct, indent=1).encode("utf-8")
@@ -544,13 +544,27 @@ async def loadEditedJson(loadEditedJsonConfig: LoadEditedJson):
544
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
545
  .execute()
546
  )
547
-
548
  return {
549
  "output": "SUCCESS"
550
  }
551
 
552
 
553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
 
555
  class TrainChatbot(BaseModel):
556
  vectorstore: str
 
320
  vectorstore, text = addTextConfig.vectorstore, addTextConfig.text
321
  username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
322
  dct = {
323
+ "output": cleanText(text = text),
324
  "source": "Text"
325
  }
326
  dct = json.dumps(dct, indent=1).encode("utf-8")
 
544
  "sourceContentURL": os.path.join(os.environ["SUPABASE_PUBLIC_BASE_URL"], f"{fileName}_data.json")})
545
  .execute()
546
  )
 
547
  return {
548
  "output": "SUCCESS"
549
  }
550
 
551
 
552
 
553
@app.post("/publicOrPrivate")
async def publicOrPrivate(vectorstore: str, mode: str = "public"):
    """Set a chatbot's visibility flag in Supabase.

    Args:
        vectorstore: "$"-delimited identifier; field 1 is the user id and
            field 2 is the chatbot name (same convention as the other
            endpoints in this file).
        mode: Value written to the "public/private" column. Defaults to
            "public". NOTE(review): the value is not validated — confirm
            callers only ever send "public" or "private".

    Returns:
        {"output": "SUCCESS"} once the update has executed.
    """
    # Split once instead of re-splitting the same string twice.
    parts = vectorstore.split("$")
    username, chatbotName = parts[1], parts[2]
    # Result of .execute() was bound to an unused local in the original;
    # it is intentionally discarded here.
    (
        supabase.table("ConversAI_ChatbotInfo")
        .update({"public/private": mode})
        .eq("user_id", username)
        .eq("chatbotname", chatbotName)
        .execute()
    )
    return {
        "output": "SUCCESS"
    }
566
+
567
+
568
 
569
  class TrainChatbot(BaseModel):
570
  vectorstore: str
functions.py CHANGED
@@ -7,6 +7,7 @@ from langchain_qdrant import QdrantVectorStore
7
  from langchain_qdrant import RetrievalMode
8
  from langchain_core.prompts.chat import ChatPromptTemplate
9
  from uuid import uuid4
 
10
  from langchain_core.output_parsers import StrOutputParser
11
  from langchain.retrievers import ParentDocumentRetriever
12
  from langchain_core.runnables.history import RunnableWithMessageHistory
@@ -120,6 +121,10 @@ def createTable(tablename: str):
120
  "output": "SUCCESS"
121
  }
122
 
 
 
 
 
123
 
124
  def addDocuments(texts: list[tuple[str]], vectorstore: str):
125
  global vectorEmbeddings
@@ -288,7 +293,8 @@ def getLinks(url: str, timeout=30):
288
  def getTextFromImagePDF(pdfBytes):
289
  def getText(image):
290
  global reader
291
- return "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
 
292
 
293
  allImages = convert_from_bytes(pdfBytes)
294
  texts = [base64.b64encode(getText(image).encode("utf-8")).decode("utf-8") for image in allImages]
@@ -303,7 +309,7 @@ def getTranscript(urls: str):
303
  url, add_video_info=False
304
  )
305
  doc = " ".join([x.page_content for x in loader.load()])
306
- texts.append(doc)
307
  except:
308
  doc = ""
309
  texts.append(doc)
@@ -325,7 +331,7 @@ def analyzeData(query, dataframe):
325
 
326
 
327
  def extractTextFromPage(page):
328
- text = page.get_text()
329
  return base64.b64encode(text.encode("utf-8")).decode("utf-8")
330
 
331
 
@@ -343,7 +349,7 @@ def extractTextFromUrl(url):
343
  response.raise_for_status()
344
  html = response.text
345
  soup = BeautifulSoup(html, 'lxml')
346
- text = soup.get_text(separator=' ', strip=True)
347
  return base64.b64encode(text.encode("utf-8")).decode("utf-8")
348
 
349
 
@@ -361,4 +367,5 @@ def createDataSourceName(sourceName):
361
  i = 1
362
  while True:
363
  sourceName = sourceName + "-" + str(i)
364
- return createDataSourceName(sourceName)
 
 
7
  from langchain_qdrant import RetrievalMode
8
  from langchain_core.prompts.chat import ChatPromptTemplate
9
  from uuid import uuid4
10
+ import nltk
11
  from langchain_core.output_parsers import StrOutputParser
12
  from langchain.retrievers import ParentDocumentRetriever
13
  from langchain_core.runnables.history import RunnableWithMessageHistory
 
121
  "output": "SUCCESS"
122
  }
123
 
124
# Characters to delete: all ASCII punctuation except ".".
# The table is invariant, so build it once at import time instead of on
# every call (the original rebuilt it per invocation).
_PUNCT_TABLE = str.maketrans('', '', string.punctuation.replace(".", ""))


def cleanText(text: str) -> str:
    """Normalise raw extracted text for indexing.

    Collapses newlines into single spaces and strips ASCII punctuation,
    keeping "." so sentence boundaries survive.

    Args:
        text: Raw text (e.g. OCR output, scraped HTML text, transcripts).

    Returns:
        The cleaned, single-line string.
    """
    return text.replace("\n", " ").translate(_PUNCT_TABLE)
128
 
129
  def addDocuments(texts: list[tuple[str]], vectorstore: str):
130
  global vectorEmbeddings
 
293
  def getTextFromImagePDF(pdfBytes):
294
  def getText(image):
295
  global reader
296
+ text = "\n".join([text[1] for text in reader.readtext(np.array(image), paragraph=True)])
297
+ return cleanText(text = text)
298
 
299
  allImages = convert_from_bytes(pdfBytes)
300
  texts = [base64.b64encode(getText(image).encode("utf-8")).decode("utf-8") for image in allImages]
 
309
  url, add_video_info=False
310
  )
311
  doc = " ".join([x.page_content for x in loader.load()])
312
+ texts.append(cleanText(text = doc))
313
  except:
314
  doc = ""
315
  texts.append(doc)
 
331
 
332
 
333
  def extractTextFromPage(page):
334
+ text = cleanText(text = page.get_text())
335
  return base64.b64encode(text.encode("utf-8")).decode("utf-8")
336
 
337
 
 
349
  response.raise_for_status()
350
  html = response.text
351
  soup = BeautifulSoup(html, 'lxml')
352
+ text = cleanText(text = soup.get_text(separator=' ', strip=True))
353
  return base64.b64encode(text.encode("utf-8")).decode("utf-8")
354
 
355
 
 
367
  i = 1
368
  while True:
369
  sourceName = sourceName + "-" + str(i)
370
+ return createDataSourceName(sourceName)
371
+