pvanand committed on
Commit
2d9fd73
1 Parent(s): f87e22f

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +11 -16
helper_functions_api.py CHANGED
@@ -4,7 +4,7 @@ from mistune.plugins.table import table
4
  from jinja2 import Template
5
  import re
6
  import os
7
- import requests
8
 
9
  def md_to_html(md_text):
10
  renderer = mistune.HTMLRenderer()
@@ -183,20 +183,16 @@ def rephrase_content(data_format, content, query):
183
  max_tokens=500,
184
  )
185
 
186
- class Scraper:
187
- def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
188
- self.session = requests.Session()
189
- self.session.headers.update({"User-Agent": user_agent})
190
 
191
- @retry(tries=3, delay=1)
192
- def fetch_content(self, url):
193
- try:
194
- response = self.session.get(url, timeout=2)
195
- if response.status_code == 200:
196
- return response.text
197
- except requests.exceptions.RequestException as e:
198
- print(f"Error fetching page content for {url}: {e}")
199
- return None
200
 
201
  def extract_main_content(html):
202
  extracted = trafilatura.extract(
@@ -215,8 +211,7 @@ def extract_main_content(html):
215
  return ""
216
 
217
  def process_content(data_format, url, query):
218
- scraper = Scraper()
219
- html_content = scraper.fetch_content(url)
220
  if html_content:
221
  content = extract_main_content(html_content)
222
  if content:
 
4
  from jinja2 import Template
5
  import re
6
  import os
7
+ import hrequests
8
 
9
  def md_to_html(md_text):
10
  renderer = mistune.HTMLRenderer()
 
183
  max_tokens=500,
184
  )
185
 
 
 
 
 
186
 
187
+ @retry(tries=3, delay=1)
188
+ def fetch_content(url):
189
+ try:
190
+ response = hrequests.get(url)
191
+ if response.status_code == 200:
192
+ return response.text
193
+ except Exception as e:
194
+ print(f"Error fetching page content for {url}: {e}")
195
+ return None
196
 
197
  def extract_main_content(html):
198
  extracted = trafilatura.extract(
 
211
  return ""
212
 
213
  def process_content(data_format, url, query):
214
+ html_content = fetch_content(url)
 
215
  if html_content:
216
  content = extract_main_content(html_content)
217
  if content: