pvanand committed on
Commit
98f779e
1 Parent(s): 7703d1f

Update helper_functions_api.py

Files changed (1)
  1. helper_functions_api.py +41 -60
helper_functions_api.py CHANGED
@@ -4,8 +4,6 @@ from mistune.plugins.table import table
 from jinja2 import Template
 import re
 import os
-from urllib.parse import urlparse
-from typing import Dict, Any, List, Tuple

 def md_to_html(md_text):
     renderer = mistune.HTMLRenderer()
@@ -182,77 +180,60 @@ def rephrase_content(data_format, content, query):
         max_tokens=500,
     )

-def extract_main_content(url):
-    if url:
-        try:
-            result = urlparse(url)
-            if all([result.scheme, result.netloc]):
-                # Prepare query parameters
-                params = {
-                    "url": url,
-                    "favor_precision": False,
-                    "favor_recall": False,
-                    "output_format": "markdown",
-                    "target_language": "en",
-                    "include_tables": True,
-                    "include_images": False,
-                    "include_links": False,
-                    "deduplicate": True,
-                }
-
-                # Make request to FastAPI endpoint
-                response = requests.get("https://pvanand-web-scraping.hf.space/extract-article", params=params)
-
-                if response.status_code == 200:
-                    return response.json()["article"]
-                else:
-                    return ""
-        except:
-            return ""
+class Scraper:
+    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
+        self.session = requests.Session()
+        self.session.headers.update({"User-Agent": user_agent})
+
+    @retry(tries=3, delay=1)
+    def fetch_content(self, url):
+        try:
+            response = self.session.get(url, timeout=2)
+            if response.status_code == 200:
+                return response.text
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching page content for {url}: {e}")
+        return None
+
+def extract_main_content(html):
+    if html:
+        plain_text = ""
+        soup = BeautifulSoup(html, 'lxml')
+        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
+            plain_text += element.get_text(separator=" ", strip=True) + "\n"
+        return plain_text
     return ""

 def process_content(data_format, url, query):
-    content = extract_main_content(url)
-    if content:
-        rephrased_content = rephrase_content(
-            data_format=data_format,
-            content=limit_tokens(content, token_limit=4000),
-            query=query,
-        )
-        return rephrased_content, url
+    scraper = Scraper()
+    html_content = scraper.fetch_content(url)
+    if html_content:
+        content = extract_main_content(html_content)
+        if content:
+            rephrased_content = rephrase_content(
+                data_format=data_format,
+                content=limit_tokens(remove_stopwords(content), token_limit=1000),
+                query=query,
+            )
+            return rephrased_content, url
     return "", url

-def fetch_and_extract_content(
-    data_format: str, query: str, urls: List[str], num_refrences: int = 6
-) -> List[Tuple[str | None, str]]:
-    """
-    Asynchronously makeing request to urls and doing further process
-    """
-    all_text_with_urls = []
-    start_url = 0
-    while (len(all_text_with_urls) != num_refrences) and (start_url < len(urls)):
-        end_url = start_url + (num_refrences - len(all_text_with_urls))
-        urls_subset = urls[start_url:end_url]
-        with ThreadPoolExecutor(max_workers=len(urls_subset)) as executor:
-            future_to_url = {
-                executor.submit(process_content, data_format, url, query): url
-                for url in urls_subset
-            }
-            all_text_with_urls += [
-                future.result()
-                for future in as_completed(future_to_url)
-                if future.result()[0] != ""
-            ]
-        start_url = end_url
+def fetch_and_extract_content(data_format, urls, query):
+    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
+        future_to_url = {
+            executor.submit(process_content, data_format, url, query): url
+            for url in urls
+        }
+        all_text_with_urls = [future.result() for future in as_completed(future_to_url)]

     return all_text_with_urls


 @retry(tries=3, delay=0.25)
 def search_brave(query, num_results=5):
-    cleaned_query = query #re.sub(r'[^a-zA-Z0-9]+', '', query)
+    cleaned_query = re.sub(r'[^a-zA-Z0-9]+', '', query)
     search_query = together_response(cleaned_query, model=llm_default_small, SysPrompt=SysPromptSearch, max_tokens = 25).strip()
-    cleaned_search_query = search_query #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
+    cleaned_search_query = re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
     brave = Brave(BRAVE_API_KEY)
     search_results = brave.search(q=cleaned_search_query, count=num_results)
     return [url.__str__() for url in search_results.urls],cleaned_search_query
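For anyone trying the new scraping path outside this Space, here is a self-contained sketch. The Scraper and extract_main_content bodies mirror the diff above; the imports, the example URL, and the assumption that @retry comes from the PyPI "retry" package (its tries/delay signature matches the usage in the file) are mine, not part of the commit.

import requests
from bs4 import BeautifulSoup
from retry import retry  # assumed source of @retry; tries/delay matches the PyPI "retry" package

class Scraper:
    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
        # one Session per scraper so the custom User-Agent is reused across requests
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

    @retry(tries=3, delay=1)
    def fetch_content(self, url):
        try:
            response = self.session.get(url, timeout=2)
            if response.status_code == 200:
                return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
        return None

def extract_main_content(html):
    # keep only heading, paragraph, and table text; everything else is dropped
    if html:
        plain_text = ""
        soup = BeautifulSoup(html, 'lxml')
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
            plain_text += element.get_text(separator=" ", strip=True) + "\n"
        return plain_text
    return ""

if __name__ == "__main__":
    html = Scraper().fetch_content("https://example.com")  # example URL, not from the commit
    print(extract_main_content(html))

One design note: fetch_content catches RequestException itself, so @retry only re-fires on exceptions that escape the try block; connection errors and timeouts are logged and yield None rather than being retried.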
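The rewritten fetch_and_extract_content drops the old top-up loop (which kept pulling spare URLs until num_refrences non-empty results were collected) in favor of a single fan-out: every input URL contributes exactly one, possibly empty, (text, url) tuple. A minimal sketch of that pattern, with a stub standing in for the real process_content (the stub, the URLs, and the query are made up):

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_content(data_format, url, query):
    # stub standing in for the real process_content (scrape + rephrase)
    return f"summary of {url}", url

urls = ["https://example.com/a", "https://example.com/b"]  # made-up URLs

# one worker per URL; results arrive in completion order, not input order
with ThreadPoolExecutor(max_workers=len(urls)) as executor:
    future_to_url = {
        executor.submit(process_content, "markdown", url, "example query"): url
        for url in urls
    }
    all_text_with_urls = [future.result() for future in as_completed(future_to_url)]

print(all_text_with_urls)

As in the diff, max_workers=len(urls) assumes a non-empty URL list; an empty list would raise ValueError from ThreadPoolExecutor.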
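The search_brave change re-enables the previously commented-out query cleaning. Worth noting: neither character class includes the space character, so multi-word input is fused into a single token before it reaches the LLM, while the second, search-side pattern additionally preserves '*'. A quick check with made-up inputs:

import re

print(re.sub(r'[^a-zA-Z0-9]+', '', "best open source LLMs 2024"))
# -> bestopensourceLLMs2024  (spaces and punctuation removed, words fused)
print(re.sub(r'[^a-zA-Z0-9*]+', '', "llm* rankings 2024"))
# -> llm*rankings2024  ('*' survives the search-side pattern)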