pvanand committed on
Commit
d143039
1 Parent(s): 26e0ddc

Update helper_functions_api.py

Browse files
Files changed (1)
  1. helper_functions_api.py +213 -234
helper_functions_api.py CHANGED
@@ -1,4 +1,3 @@
- # !pip install mistune
  import mistune
  from mistune.plugins.table import table
  from jinja2 import Template
@@ -9,97 +8,18 @@ import markdown
  from bs4 import BeautifulSoup
  from lxml import etree
  import markdown
-
- def md_to_html(md_text):
-     html_content = markdown.markdown(md_text,extensions=["extra"])
-     return html_content.replace('\n', '')
-
- def has_tables(html_string):
-     try:
-         # Use BeautifulSoup with lxml parser
-         soup = BeautifulSoup(html_string, 'lxml')
-
-         # First, try BeautifulSoup's find_all method
-         if soup.find_all('table'):
-             return True
-
-         # If no tables found, try a more aggressive search using lxml's XPath
-         tree = etree.HTML(str(soup))
-         return len(tree.xpath('//table')) > 0
-
-     except Exception as e:
-         # Log the exception if needed
-         print(f"An error occurred: {str(e)}")
-         return False
-
- def extract_data_from_tag(input_string, tag):
-     # Create the regex pattern
-     pattern = f'<{tag}.*?>(.*?)</{tag}>'
-
-     # Find all matches
-     matches = re.findall(pattern, input_string, re.DOTALL)
-
-     # If matches are found, return them joined by newlines
-     if matches:
-         out = '\n'.join(match.strip() for match in matches)
-         # Check for incorrect tagging
-         if len(out) > 0.8*len(input_string):
-             return out
-         else:
-             return input_string
-
-     # If no matches are found, return the original string
-     return input_string
-
- ####------------------------------ OPTIONAL--> User id and persistant data storage-------------------------------------####
  from datetime import datetime
  import psycopg2
-
- from dotenv import load_dotenv, find_dotenv
-
- # Load environment variables from .env file
- load_dotenv("keys.env")
-
- TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
- BRAVE_API_KEY = os.getenv('BRAVE_API_KEY')
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
- HELICON_API_KEY = os.getenv("HELICON_API_KEY")
- SUPABASE_USER = os.environ['SUPABASE_USER']
- SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']
- OPENROUTER_API_KEY = "sk-or-v1-"+os.environ['OPENROUTER_API_KEY']
-
- def insert_data(user_id, user_query, subtopic_query, response, html_report):
-     # Connect to your database
-     conn = psycopg2.connect(
-         dbname="postgres",
-         user=SUPABASE_USER,
-         password=SUPABASE_PASSWORD,
-         host="aws-0-us-west-1.pooler.supabase.com",
-         port="5432"
-     )
-     cur = conn.cursor()
-     insert_query = """
-     INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
-     VALUES (%s, %s, %s, %s, %s, %s);
-     """
-     cur.execute(insert_query, (user_id,user_query, subtopic_query, response, html_report, datetime.now()))
-     conn.commit()
-     cur.close()
-     conn.close()
-
- ####-----------------------------------------------------END----------------------------------------------------------####
-
-
  import ast
  from fpdf import FPDF
- import re
  import pandas as pd
  import nltk
  import requests
  import json
  from retry import retry
  from concurrent.futures import ThreadPoolExecutor, as_completed
- from bs4 import BeautifulSoup
  from nltk.corpus import stopwords
  from nltk.tokenize import word_tokenize
  from brave import Brave
@@ -109,21 +29,28 @@ from openai import OpenAI
  from together import Together
  from urllib.parse import urlparse
  import trafilatura

- llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
- llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"

- # SysPromptData = """You are expert in information extraction from the given context.
- # Steps to follow:
- # 1. Check if relevant factual data regarding <USER QUERY> is present in the <SCRAPED DATA>.
- # - IF YES, extract the maximum relevant factual information related to <USER QUERY> from the <SCRAPED DATA>.
- # - IF NO, then return "N/A"
-
- # Rules to follow:
- # - Return N/A if information is not present in the scraped data.
- # - FORGET EVERYTHING YOU KNOW, Only output information that is present in the scraped data, DO NOT MAKE UP INFORMATION
- # """
- SysPromptData = """
  You are an AI assistant tasked with extracting relevant information from scraped website data based on a given query. Your goal is to provide accurate and concise information that directly relates to the query, using only the data provided.
  Guidelines for extraction:
  1. Only use information present in the scraped data.
@@ -131,66 +58,108 @@ Guidelines for extraction:
  3. If there is no relevant information in the scraped data, state that clearly.
  4. Do not make assumptions or add information not present in the data.
  5. If the query is ambiguous, interpret it in the most reasonable way based on the available data.
- """

- SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
- SysPromptSearch = """You are a search query generator, create a concise Google search query, focusing only on the main topic and omitting additional redundant details, include year if necessory, 2024, Do not add any additional comments. OUTPUT ONLY THE SEARCH QUERY
- #Additional instructions:
- ##Use the following search operator if necessory
- OR #to cover multiple topics"""

- import tiktoken # Used to limit tokens
- encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # Instead of Llama3 using available option/ replace if found anything better
-
- def limit_tokens(input_string, token_limit=7500):
-     """
-     Limit tokens sent to the model
-     """
-     return encoding.decode(encoding.encode(input_string)[:token_limit])

  together_client = OpenAI(
-     api_key=TOGETHER_API_KEY,
-     base_url="https://together.hconeai.com/v1",
-     default_headers={ "Helicone-Auth": f"Bearer {HELICON_API_KEY}"})

  groq_client = OpenAI(
-     api_key=GROQ_API_KEY,
-     base_url="https://groq.hconeai.com/openai/v1",
-     default_headers={ "Helicone-Auth": f"Bearer {HELICON_API_KEY}"})

  or_client = OpenAI(
      base_url="https://openrouter.ai/api/v1",
      api_key=OPENROUTER_API_KEY)

- # Groq model names
- llm_default_small = "llama3-8b-8192"
- llm_default_medium = "llama3-70b-8192"

- # Together Model names (fallback)
- llm_fallback_small = "meta-llama/Llama-3-8b-chat-hf"
- llm_fallback_medium = "meta-llama/Llama-3-70b-chat-hf"

- ### ------END OF LLM CONFIG-------- ###

- def together_response(message, model = llm_default_small, SysPrompt = SysPromptDefault, temperature=0.2, frequency_penalty =0.1, max_tokens= 2000):
-
-     messages=[{"role": "system", "content": SysPrompt},{"role": "user", "content": message}]
      params = {
-         "model": model,
-         "messages": messages,
-         "temperature": temperature,
-         "frequency_penalty": frequency_penalty,
-         "max_tokens": max_tokens
      }
      try:
-         response = groq_client.chat.completions.create(**params)
-         return response.choices[0].message.content
-
      except Exception as e:
-         print(f"Error calling GROQ API: {e}")
-         params["model"] = llm_fallback_small if model == llm_default_small else llm_fallback_medium
-         response = together_client.chat.completions.create(**params)
-         return response.choices[0].message.content

  def openrouter_response(messages, model="meta-llama/llama-3-70b-instruct:nitro"):
      try:
@@ -199,45 +168,47 @@ def openrouter_response(messages, model="meta-llama/llama-3-70b-instruct:nitro")
              messages=messages,
              max_tokens=4096,
          )
-
-         response_message = response.choices[0].message.content
-         return response_message
      except Exception as e:
-         print(f"An error occurred: {str(e)}")
          return None

  def openrouter_response_stream(messages, model="meta-llama/llama-3-70b-instruct:nitro"):
-     response = or_client.chat.completions.create(
-         model=model,
-         messages=messages,
-         max_tokens=4096,
-         stream=True
-     )
-
-     for chunk in response:
-         if chunk.choices[0].delta.content is not None:
-             yield chunk.choices[0].delta.content

  def json_from_text(text):
-     """
-     Extracts JSON from text using regex and fuzzy JSON loading.
-     """
      try:
-         return json.loads(text)
-     except:
-         match = re.search(r'\{[\s\S]*\}', text)
-         if match:
-             json_out = match.group(0)
-         else:
-             json_out = text
-         # Use Fuzzy JSON loading
-         return loads(json_out)

  def remove_stopwords(text):
-     stop_words = set(stopwords.words('english'))
-     words = word_tokenize(text)
-     filtered_text = [word for word in words if word.lower() not in stop_words]
-     return ' '.join(filtered_text)

  def rephrase_content(data_format, content, query):
      try:
@@ -245,101 +216,109 @@ def rephrase_content(data_format, content, query):
              return together_response(
                  f"""return only the relevant information regarding the query: {{{query}}}. Output should be concise chunks of \
  paragraphs or tables or both, extracted from the following scraped context {{{limit_tokens(content,token_limit=2000)}}}""",
-                 SysPrompt=SysPromptData,
                  max_tokens=900,
              )
          elif data_format == "Quantitative data":
              return together_response(
                  f"return only the numerical or quantitative data regarding the query: {{{query}}} structured into .md tables, using the scraped context:{{{limit_tokens(content,token_limit=2000)}}}",
-                 SysPrompt=SysPromptData,
                  max_tokens=500,
              )
          else:
              return together_response(
                  f"return only the relevant information regarding the query: {{{query}}} using the scraped context:{{{limit_tokens(content,token_limit=2000)}}}",
-                 SysPrompt=SysPromptData,
                  max_tokens=500,
              )
      except Exception as e:
-         print(f"An error occurred: {str(e)}")
-         return limit_tokens(content,token_limit=500)

  def fetch_content(url):
      try:
          response = hrequests.get(url, timeout=5)
          if response.status_code == 200:
              return response.text
      except Exception as e:
-         print(f"Error fetching page content for {url}: {e}")
      return None

  def extract_main_content(html):
-     extracted = trafilatura.extract(
-         html,
-         output_format="markdown",
-         target_language="en",
-         include_tables=True,
-         include_images=False,
-         include_links=False,
-         deduplicate=True,
-     )
-
-     if extracted:
-         return trafilatura.utils.sanitize(extracted)
-     else:
          return ""

  def process_content(data_format, url, query):
-     html_content = fetch_content(url)
-     if html_content:
-         content = extract_main_content(html_content)
-         if content:
-             rephrased_content = rephrase_content(
-                 data_format=data_format,
-                 content=limit_tokens(remove_stopwords(content), token_limit=4000),
-                 query=query,
-             )
-             return rephrased_content, url
      return "", url

  def fetch_and_extract_content(data_format, urls, query):
-     with ThreadPoolExecutor(max_workers=len(urls)) as executor:
-         future_to_url = {
-             executor.submit(process_content, data_format, url, query): url
-             for url in urls
-         }
-         all_text_with_urls = [future.result() for future in as_completed(future_to_url)]
-
-         return all_text_with_urls

  def search_brave(query, num_results=5):
-     """Fetch search results from Brave's API."""
-
-     cleaned_query = query #re.sub(r'[^a-zA-Z0-9]+', '', query)
-     search_query = together_response(cleaned_query, model=llm_default_small, SysPrompt=SysPromptSearch, max_tokens = 25).strip()
-     cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
-
-     url = "https://api.search.brave.com/res/v1/web/search"
-     headers = {
-         "Accept": "application/json",
-         "Accept-Encoding": "gzip",
-         "X-Subscription-Token": BRAVE_API_KEY
-     }
-     params = {"q": cleaned_search_query}
-
-     response = requests.get(url, headers=headers, params=params)
-
-     if response.status_code == 200:
-         result = response.json() # Return the JSON response if successful
-         return [item["url"] for item in result["web"]["results"]][:num_results],cleaned_search_query, result
-     else:
-         return [],cleaned_search_query # Return error code if not successful
-
- # #@retry(tries=3, delay=0.25)
- # def search_brave(query, num_results=5):
- #     cleaned_query = query #re.sub(r'[^a-zA-Z0-9]+', '', query)
- #     search_query = together_response(cleaned_query, model=llm_default_small, SysPrompt=SysPromptSearch, max_tokens = 25).strip()
- #     cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip() #re.sub(r'[^a-zA-Z0-9*]+', '', search_query)
- #     brave = Brave(BRAVE_API_KEY)
- #     search_results = brave.search(q=cleaned_search_query, count=num_results)
- #     return [url.__str__() for url in search_results.urls],cleaned_search_query

  import mistune
  from mistune.plugins.table import table
  from jinja2 import Template
  from bs4 import BeautifulSoup
  from lxml import etree
  import markdown
+ import logging
  from datetime import datetime
  import psycopg2
+ from dotenv import load_dotenv
  import ast
  from fpdf import FPDF
  import pandas as pd
  import nltk
  import requests
  import json
  from retry import retry
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from nltk.corpus import stopwords
  from nltk.tokenize import word_tokenize
  from brave import Brave
  from together import Together
  from urllib.parse import urlparse
  import trafilatura
+ import tiktoken

+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

+ # Load environment variables
+ load_dotenv("keys.env")
+ TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
+ BRAVE_API_KEY = os.getenv('BRAVE_API_KEY')
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+ HELICON_API_KEY = os.getenv("HELICON_API_KEY")
+ SUPABASE_USER = os.environ['SUPABASE_USER']
+ SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']
+ OPENROUTER_API_KEY = "sk-or-v1-" + os.environ['OPENROUTER_API_KEY']
+
+ # Define constants
+ LLM_DEFAULT_SMALL = "llama3-8b-8192"
+ LLM_DEFAULT_MEDIUM = "llama3-70b-8192"
+ LLM_FALLBACK_SMALL = "meta-llama/Llama-3-8b-chat-hf"
+ LLM_FALLBACK_MEDIUM = "meta-llama/Llama-3-70b-chat-hf"
+
+ SYS_PROMPT_DATA = """
  You are an AI assistant tasked with extracting relevant information from scraped website data based on a given query. Your goal is to provide accurate and concise information that directly relates to the query, using only the data provided.
  Guidelines for extraction:
  1. Only use information present in the scraped data.
  3. If there is no relevant information in the scraped data, state that clearly.
  4. Do not make assumptions or add information not present in the data.
  5. If the query is ambiguous, interpret it in the most reasonable way based on the available data.
+ """

+ SYS_PROMPT_DEFAULT = "You are an expert AI, complete the given task. Do not add any additional comments."
+ SYS_PROMPT_SEARCH = """You are a search query generator, create a concise Google search query, focusing only on the main topic and omitting additional redundant details, include year if necessary, 2024, Do not add any additional comments. OUTPUT ONLY THE SEARCH QUERY
+ #Additional instructions:
+ ##Use the following search operator if necessary
+ OR #to cover multiple topics"""

+ # Initialize API clients
+ encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

  together_client = OpenAI(
+     api_key=TOGETHER_API_KEY,
+     base_url="https://together.hconeai.com/v1",
+     default_headers={"Helicone-Auth": f"Bearer {HELICON_API_KEY}"})

  groq_client = OpenAI(
+     api_key=GROQ_API_KEY,
+     base_url="https://groq.hconeai.com/openai/v1",
+     default_headers={"Helicone-Auth": f"Bearer {HELICON_API_KEY}"})

  or_client = OpenAI(
      base_url="https://openrouter.ai/api/v1",
      api_key=OPENROUTER_API_KEY)

+ def md_to_html(md_text):
+     try:
+         html_content = markdown.markdown(md_text, extensions=["extra"])
+         return html_content.replace('\n', '')
+     except Exception as e:
+         logging.error(f"Error converting markdown to HTML: {e}")
+         return md_text
+
+ def has_tables(html_string):
+     try:
+         soup = BeautifulSoup(html_string, 'lxml')
+         if soup.find_all('table'):
+             return True
+         tree = etree.HTML(str(soup))
+         return len(tree.xpath('//table')) > 0
+     except Exception as e:
+         logging.error(f"Error checking for tables: {e}")
+         return False
+
+ def extract_data_from_tag(input_string, tag):
+     try:
+         pattern = f'<{tag}.*?>(.*?)</{tag}>'
+         matches = re.findall(pattern, input_string, re.DOTALL)
+         if matches:
+             out = '\n'.join(match.strip() for match in matches)
+             return out if len(out) <= 0.8 * len(input_string) else input_string
+         return input_string
+     except Exception as e:
+         logging.error(f"Error extracting data from tag: {e}")
+         return input_string

+ def insert_data(user_id, user_query, subtopic_query, response, html_report):
+     try:
+         with psycopg2.connect(
+             dbname="postgres",
+             user=SUPABASE_USER,
+             password=SUPABASE_PASSWORD,
+             host="aws-0-us-west-1.pooler.supabase.com",
+             port="5432"
+         ) as conn:
+             with conn.cursor() as cur:
+                 insert_query = """
+                 INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
+                 VALUES (%s, %s, %s, %s, %s, %s);
+                 """
+                 cur.execute(insert_query, (user_id, user_query, subtopic_query, response, html_report, datetime.now()))
+     except Exception as e:
+         logging.error(f"Error inserting data into database: {e}")

+ def limit_tokens(input_string, token_limit=7500):
+     try:
+         return encoding.decode(encoding.encode(input_string)[:token_limit])
+     except Exception as e:
+         logging.error(f"Error limiting tokens: {e}")
+         return input_string[:token_limit]  # Fallback to simple string slicing

+ def together_response(message, model=LLM_DEFAULT_SMALL, SysPrompt=SYS_PROMPT_DEFAULT, temperature=0.2, frequency_penalty=0.1, max_tokens=2000):
+     messages = [{"role": "system", "content": SysPrompt}, {"role": "user", "content": message}]
      params = {
+         "model": model,
+         "messages": messages,
+         "temperature": temperature,
+         "frequency_penalty": frequency_penalty,
+         "max_tokens": max_tokens
      }
      try:
+         response = groq_client.chat.completions.create(**params)
+         return response.choices[0].message.content
      except Exception as e:
+         logging.error(f"Error calling GROQ API: {e}")
+         try:
+             params["model"] = LLM_FALLBACK_SMALL if model == LLM_DEFAULT_SMALL else LLM_FALLBACK_MEDIUM
+             response = together_client.chat.completions.create(**params)
+             return response.choices[0].message.content
+         except Exception as e:
+             logging.error(f"Error calling Together API: {e}")
+             return "An error occurred while processing your request."

  def openrouter_response(messages, model="meta-llama/llama-3-70b-instruct:nitro"):
      try:
              messages=messages,
              max_tokens=4096,
          )
+         return response.choices[0].message.content
      except Exception as e:
+         logging.error(f"Error calling OpenRouter API: {e}")
          return None

  def openrouter_response_stream(messages, model="meta-llama/llama-3-70b-instruct:nitro"):
+     try:
+         response = or_client.chat.completions.create(
+             model=model,
+             messages=messages,
+             max_tokens=4096,
+             stream=True
+         )
+         for chunk in response:
+             if chunk.choices[0].delta.content is not None:
+                 yield chunk.choices[0].delta.content
+     except Exception as e:
+         logging.error(f"Error streaming response from OpenRouter API: {e}")
+         yield "An error occurred while streaming the response."

  def json_from_text(text):
      try:
+         return json.loads(text)
+     except json.JSONDecodeError:
+         try:
+             match = re.search(r'\{[\s\S]*\}', text)
+             json_out = match.group(0) if match else text
+             return loads(json_out)
+         except Exception as e:
+             logging.error(f"Error parsing JSON from text: {e}")
+             return {}

  def remove_stopwords(text):
+     try:
+         stop_words = set(stopwords.words('english'))
+         words = word_tokenize(text)
+         filtered_text = [word for word in words if word.lower() not in stop_words]
+         return ' '.join(filtered_text)
+     except Exception as e:
+         logging.error(f"Error removing stopwords: {e}")
+         return text

  def rephrase_content(data_format, content, query):
      try:
          return together_response(
              f"""return only the relevant information regarding the query: {{{query}}}. Output should be concise chunks of \
  paragraphs or tables or both, extracted from the following scraped context {{{limit_tokens(content,token_limit=2000)}}}""",
+             SysPrompt=SYS_PROMPT_DATA,
              max_tokens=900,
          )
      elif data_format == "Quantitative data":
          return together_response(
              f"return only the numerical or quantitative data regarding the query: {{{query}}} structured into .md tables, using the scraped context:{{{limit_tokens(content,token_limit=2000)}}}",
+             SysPrompt=SYS_PROMPT_DATA,
              max_tokens=500,
          )
      else:
          return together_response(
              f"return only the relevant information regarding the query: {{{query}}} using the scraped context:{{{limit_tokens(content,token_limit=2000)}}}",
+             SysPrompt=SYS_PROMPT_DATA,
              max_tokens=500,
          )
      except Exception as e:
+         logging.error(f"Error rephrasing content: {e}")
+         return limit_tokens(content, token_limit=500)

  def fetch_content(url):
      try:
          response = hrequests.get(url, timeout=5)
          if response.status_code == 200:
              return response.text
+         else:
+             logging.warning(f"Failed to fetch content from {url}. Status code: {response.status_code}")
      except Exception as e:
+         logging.error(f"Error fetching page content for {url}: {e}")
      return None

  def extract_main_content(html):
+     try:
+         extracted = trafilatura.extract(
+             html,
+             output_format="markdown",
+             target_language="en",
+             include_tables=True,
+             include_images=False,
+             include_links=False,
+             deduplicate=True,
+         )
+         return trafilatura.utils.sanitize(extracted) if extracted else ""
+     except Exception as e:
+         logging.error(f"Error extracting main content: {e}")
          return ""

  def process_content(data_format, url, query):
+     try:
+         html_content = fetch_content(url)
+         if html_content:
+             content = extract_main_content(html_content)
+             if content:
+                 rephrased_content = rephrase_content(
+                     data_format=data_format,
+                     content=limit_tokens(remove_stopwords(content), token_limit=4000),
+                     query=query,
+                 )
+                 return rephrased_content, url
+     except Exception as e:
+         logging.error(f"Error processing content for {url}: {e}")
      return "", url

  def fetch_and_extract_content(data_format, urls, query):
+     try:
+         with ThreadPoolExecutor(max_workers=len(urls)) as executor:
+             future_to_url = {
+                 executor.submit(process_content, data_format, url, query): url
+                 for url in urls
+             }
+             all_text_with_urls = [future.result() for future in as_completed(future_to_url)]
+             return all_text_with_urls
+     except Exception as e:
+         logging.error(f"Error fetching and extracting content: {e}")
+         return []

  def search_brave(query, num_results=5):
+     try:
+         cleaned_query = query
+         search_query = together_response(cleaned_query, model=LLM_DEFAULT_SMALL, SysPrompt=SYS_PROMPT_SEARCH, max_tokens=25).strip()
+         cleaned_search_query = re.sub(r'[^\w\s]', '', search_query).strip()
+
+         url = "https://api.search.brave.com/res/v1/web/search"
+         headers = {
+             "Accept": "application/json",
+             "Accept-Encoding": "gzip",
+             "X-Subscription-Token": BRAVE_API_KEY
+         }
+         params = {"q": cleaned_search_query}
+
+         response = requests.get(url, headers=headers, params=params)
+
+         if response.status_code == 200:
+             result = response.json()
+             return [item["url"] for item in result["web"]["results"]][:num_results], cleaned_search_query, result
+         else:
+             logging.warning(f"Brave search API returned status code {response.status_code}")
+             return [], cleaned_search_query, None
+     except Exception as e:
+         logging.error(f"Error in Brave search: {e}")
+         return [], query, None
+
+ # Main execution
+ if __name__ == "__main__":
+     logging.info("Script started")
+     # Add your main execution logic here
+     logging.info("Script completed")
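
For reference, a minimal sketch of how the refactored helpers might be exercised end to end. It assumes keys.env supplies the Together/Groq/Brave/Helicone/Supabase/OpenRouter credentials and that the module is importable as helper_functions_api; the query string is purely illustrative, while "Quantitative data" matches one of the data_format branches in rephrase_content.

# Hypothetical usage sketch -- not part of the commit.
from helper_functions_api import search_brave, fetch_and_extract_content, md_to_html

query = "lithium ion battery market size 2024"  # illustrative query
urls, cleaned_query, raw = search_brave(query, num_results=3)
extracted = fetch_and_extract_content("Quantitative data", urls, cleaned_query)
for text, url in extracted:
    print(url)
    print(md_to_html(text))

Note that the refactored search_brave always returns a three-item tuple: (urls, cleaned_query, result) on success, ([], cleaned_query, None) on a non-200 response, and ([], query, None) on an exception, so the unpacking above is stable across all paths, unlike the removed version whose error branch returned only two values.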