Rauhan committed on
Commit
c4684f9
1 Parent(s): e6cb545

DEBUG: WEB CRAWLER

Browse files
Files changed (1) hide show
  1. functions.py +32 -25
functions.py CHANGED
@@ -19,7 +19,7 @@ from supabase.client import create_client
19
  from qdrant_client import QdrantClient
20
  from langchain_groq import ChatGroq
21
  from bs4 import BeautifulSoup
22
- from urllib.parse import urlparse
23
  from supabase import create_client
24
  from dotenv import load_dotenv
25
  import os
@@ -258,29 +258,36 @@ def listTables(username: str):
258
  }
259
 
260
 
 
261
  def getLinks(url: str, timeout = 30):
262
- start = time.time()
263
- def getLinksFromPage(url: str):
264
- response = requests.get(url)
265
- htmlContent = response.content
266
- soup = BeautifulSoup(htmlContent, "lxml")
267
- anchorTags = soup.find_all("a")
268
- allLinks = []
269
- for tag in anchorTags:
270
- if "href" in tag.attrs:
271
- if urlparse(tag.attrs["href"]).netloc == urlparse(url).netloc:
272
- allLinks.append(tag.attrs["href"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  else:
274
- continue
275
- else:
276
- continue
277
- return allLinks
278
- links = getLinksFromPage(url)
279
- uniqueLinks = set()
280
- for link in links:
281
- now = time.time()
282
- if now - start > timeout:
283
- break
284
- else:
285
- uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
286
- return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
 
19
  from qdrant_client import QdrantClient
20
  from langchain_groq import ChatGroq
21
  from bs4 import BeautifulSoup
22
+ from urllib.parse import urlparse, urljoin
23
  from supabase import create_client
24
  from dotenv import load_dotenv
25
  import os
 
258
  }
259
 
260
 
261
+
262
def getLinks(url: str, timeout = 30):
    """Crawl `url` one level deep and return same-domain links.

    Fetches the seed page, collects its same-domain anchor links, then
    fetches each of those pages and collects their same-domain links,
    stopping once `timeout` seconds of wall-clock time have elapsed.

    Args:
        url: Seed page to crawl.
        timeout: Soft time budget in seconds for the second-level crawl loop.

    Returns:
        De-duplicated list of absolute same-domain URLs with any single
        trailing slash removed.
    """
    start = time.monotonic()  # monotonic clock: immune to system-time jumps
    baseNetloc = urlparse(url).netloc

    def getLinksFromPage(pageUrl: str):
        """Return absolute same-domain links found on `pageUrl` (best effort)."""
        try:
            # Per-request timeout so one slow server cannot blow the overall budget.
            response = requests.get(pageUrl, timeout = 10)
        except requests.RequestException:
            return []  # best-effort crawl: skip unreachable pages
        soup = BeautifulSoup(response.content, "lxml")
        allLinks = []
        for tag in soup.find_all("a", href = True):
            # urljoin resolves relative hrefs (paths, queries, fragments) against
            # the current page and passes absolute hrefs through unchanged.
            # The original rebuilt relative URLs with os.path.join, which glues
            # components with the OS path separator ("\\" on Windows) and loses
            # the "?" and "#" delimiters for query and fragment.
            fullUrl = urljoin(pageUrl, tag["href"])
            if urlparse(fullUrl).netloc == baseNetloc:
                allLinks.append(fullUrl)
        return allLinks

    links = getLinksFromPage(url)
    # Seed the result with the first-level links themselves; the original only
    # kept links re-discovered on subpages.
    uniqueLinks = set(links)
    for link in links:
        if time.monotonic() - start > timeout:
            break
        uniqueLinks.update(getLinksFromPage(link))
    # Normalize: strip one trailing slash; endswith-guard avoids IndexError on "".
    return list({x[:-1] if x.endswith("/") else x for x in uniqueLinks})