Rauhan committed on
Commit
c4684f9
1 Parent(s): e6cb545

DEBUG: WEB CRAWLER

Browse files
Files changed (1) hide show
  1. functions.py +32 -25
functions.py CHANGED
@@ -19,7 +19,7 @@ from supabase.client import create_client
19
  from qdrant_client import QdrantClient
20
  from langchain_groq import ChatGroq
21
  from bs4 import BeautifulSoup
22
- from urllib.parse import urlparse
23
  from supabase import create_client
24
  from dotenv import load_dotenv
25
  import os
@@ -258,29 +258,36 @@ def listTables(username: str):
258
  }
259
 
260
 
 
261
  def getLinks(url: str, timeout = 30):
262
- start = time.time()
263
- def getLinksFromPage(url: str):
264
- response = requests.get(url)
265
- htmlContent = response.content
266
- soup = BeautifulSoup(htmlContent, "lxml")
267
- anchorTags = soup.find_all("a")
268
- allLinks = []
269
- for tag in anchorTags:
270
- if "href" in tag.attrs:
271
- if urlparse(tag.attrs["href"]).netloc == urlparse(url).netloc:
272
- allLinks.append(tag.attrs["href"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  else:
274
- continue
275
- else:
276
- continue
277
- return allLinks
278
- links = getLinksFromPage(url)
279
- uniqueLinks = set()
280
- for link in links:
281
- now = time.time()
282
- if now - start > timeout:
283
- break
284
- else:
285
- uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
286
- return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
 
19
  from qdrant_client import QdrantClient
20
  from langchain_groq import ChatGroq
21
  from bs4 import BeautifulSoup
22
+ from urllib.parse import urlparse, urljoin
23
  from supabase import create_client
24
  from dotenv import load_dotenv
25
  import os
 
258
  }
259
 
260
 
261
+
262
def getLinks(url: str, timeout = 30):
    """Crawl `url` one level deep and return same-domain links.

    Fetches the seed page, collects its same-domain anchor links, then
    fetches each of those pages and collects their same-domain links,
    stopping once `timeout` seconds of wall-clock time have elapsed.

    Args:
        url: Seed page to crawl.
        timeout: Soft time budget in seconds for the second-level crawl loop.

    Returns:
        De-duplicated list of absolute same-domain URLs with any single
        trailing slash removed.
    """
    start = time.monotonic()  # monotonic clock: immune to system-time jumps
    baseNetloc = urlparse(url).netloc

    def getLinksFromPage(pageUrl: str):
        """Return absolute same-domain links found on `pageUrl` (best effort)."""
        try:
            # Per-request timeout so one slow server cannot blow the overall budget.
            response = requests.get(pageUrl, timeout = 10)
        except requests.RequestException:
            return []  # best-effort crawl: skip unreachable pages
        soup = BeautifulSoup(response.content, "lxml")
        allLinks = []
        for tag in soup.find_all("a", href = True):
            # urljoin resolves relative hrefs (paths, queries, fragments) against
            # the current page and passes absolute hrefs through unchanged.
            # The original rebuilt relative URLs with os.path.join, which glues
            # components with the OS path separator ("\\" on Windows) and loses
            # the "?" and "#" delimiters for query and fragment.
            fullUrl = urljoin(pageUrl, tag["href"])
            if urlparse(fullUrl).netloc == baseNetloc:
                allLinks.append(fullUrl)
        return allLinks

    links = getLinksFromPage(url)
    # Seed the result with the first-level links themselves; the original only
    # kept links re-discovered on subpages.
    uniqueLinks = set(links)
    for link in links:
        if time.monotonic() - start > timeout:
            break
        uniqueLinks.update(getLinksFromPage(link))
    # Normalize: strip one trailing slash; endswith-guard avoids IndexError on "".
    return list({x[:-1] if x.endswith("/") else x for x in uniqueLinks})