pvanand committed on
Commit
a2e6e86
1 Parent(s): ec971eb

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +19 -35
helper_functions_api.py CHANGED
@@ -196,43 +196,27 @@ class Scraper:
196
  print(f"Error fetching page content for {url}: {e}")
197
  return None
198
 
199
def extract_main_content(url):
    """Fetch the main article content of *url* as markdown text.

    Validates that *url* is an absolute URL, then delegates extraction to
    the external article-extraction FastAPI endpoint.

    Returns the extracted article string, or "" when the URL is invalid,
    the request fails, or the service responds with an error.
    """
    if not url:
        return ""
    try:
        parsed = urlparse(url)
        # Only proceed for absolute URLs (scheme + host present).
        if not all([parsed.scheme, parsed.netloc]):
            return ""
        params = {
            "url": url,
            "favor_precision": False,
            "favor_recall": False,
            "output_format": "markdown",
            "target_language": "en",
            "include_tables": True,
            "include_images": False,
            "include_links": False,
            "deduplicate": True,
        }
        # Explicit timeout so a hung extraction service cannot block the
        # caller indefinitely (the original call had none).
        response = requests.get(
            "https://pvanand-web-scraping.hf.space/extract-article",
            params=params,
            timeout=30,
        )
        if response.status_code == 200:
            return response.json()["article"]
        return ""
    except (requests.RequestException, ValueError, KeyError):
        # Network failure, non-JSON body, or missing "article" key —
        # best-effort: return empty content rather than raising.
        return ""
226
 
227
def process_content(data_format, url, query):
    """Extract the article at *url* and rephrase it to answer *query*.

    Returns a ``(text, url)`` tuple; ``text`` is "" when nothing could be
    extracted from the page.
    """
    article_text = extract_main_content(url)
    if not article_text:
        return "", url
    condensed = limit_tokens(remove_stopwords(article_text), token_limit=4000)
    rephrased = rephrase_content(
        data_format=data_format,
        content=condensed,
        query=query,
    )
    return rephrased, url
237
 
238
  def fetch_and_extract_content(data_format, urls, query):
 
196
  print(f"Error fetching page content for {url}: {e}")
197
  return None
198
 
199
def extract_main_content(html):
    """Extract readable text from an HTML document.

    Pulls the text of headings (h1-h6), paragraphs, and tables, one
    element per line (each line terminated by "\n").

    Returns the concatenated text, or "" for empty/None input or when no
    matching elements are found.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, 'lxml')
    # Build all fragments first and join once — avoids the quadratic
    # cost of repeated string += in a loop.
    fragments = [
        element.get_text(separator=" ", strip=True) + "\n"
        for element in soup.find_all(
            ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']
        )
    ]
    return "".join(fragments)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
def process_content(data_format, url, query):
    """Scrape *url*, extract its main text, and rephrase it for *query*.

    Returns a ``(text, url)`` tuple; ``text`` is "" when the page could
    not be fetched or yielded no extractable content.
    """
    page_html = Scraper().fetch_content(url)
    if not page_html:
        return "", url
    main_text = extract_main_content(page_html)
    if not main_text:
        return "", url
    condensed = limit_tokens(remove_stopwords(main_text), token_limit=1000)
    rephrased = rephrase_content(
        data_format=data_format,
        content=condensed,
        query=query,
    )
    return rephrased, url
221
 
222
  def fetch_and_extract_content(data_format, urls, query):