# !pip install mistune
import mistune
from mistune.plugins.table import table
from jinja2 import Template
import re
import os


def md_to_html(md_text):
    renderer = mistune.HTMLRenderer()
    markdown_renderer = mistune.Markdown(renderer, plugins=[table])
    html_content = markdown_renderer(md_text)
    return html_content.replace('\n', '')


####------------------------------ OPTIONAL --> User id and persistent data storage ------------------------------####
from datetime import datetime
import psycopg2
from dotenv import load_dotenv, find_dotenv

# Load environment variables from .env file
load_dotenv("keys.env")

TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
BRAVE_API_KEY = os.getenv('BRAVE_API_KEY')
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HELICON_API_KEY = os.getenv("HELICON_API_KEY")
SUPABASE_USER = os.environ['SUPABASE_USER']
SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']


def insert_data(user_id, user_query, subtopic_query, response, html_report):
    # Connect to the Supabase Postgres database
    conn = psycopg2.connect(
        dbname="postgres",
        user=SUPABASE_USER,
        password=SUPABASE_PASSWORD,
        host="aws-0-us-west-1.pooler.supabase.com",
        port="5432",
    )
    cur = conn.cursor()
    insert_query = """
    INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
    VALUES (%s, %s, %s, %s, %s, %s);
    """
    cur.execute(insert_query, (user_id, user_query, subtopic_query, response, html_report, datetime.now()))
    conn.commit()
    cur.close()
    conn.close()


####---------------------------------------------------- END ----------------------------------------------------####

import ast
from fpdf import FPDF
import re
import pandas as pd
import nltk
import requests
import json
from retry import retry
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from brave import Brave
from fuzzy_json import loads
from half_json.core import JSONFixer
from openai import OpenAI
from together import Together

llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"

SysPromptData = "You are an information retriever and summarizer, return only the factual information regarding the user query"
SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
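# --- Usage sketch (illustrative only, not part of the original pipeline) ---
# Shows how md_to_html and the imported jinja2 Template are expected to compose: markdown,
# including tables via the mistune table plugin, is rendered to flat HTML and dropped into a
# page template. The function name _demo_md_to_html and the template string are hypothetical
# placeholders introduced here for illustration.
def _demo_md_to_html():
    sample_md = "# Report\n\n| Metric | Value |\n| --- | --- |\n| Revenue | 10M |"
    sample_html = md_to_html(sample_md)  # e.g. "<h1>Report</h1><table>...</table>"
    page = Template("<html><body>{{ body }}</body></html>").render(body=sample_html)
    return page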
import tiktoken  # Used to limit tokens

# Llama 3's tokenizer is not available in tiktoken, so the GPT-3.5 encoding is used as an
# approximation; replace it if a better option is found.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def limit_tokens(input_string, token_limit=7500):
    """
    Limit tokens sent to the model
    """
    return encoding.decode(encoding.encode(input_string)[:token_limit])


together_client = OpenAI(
    api_key=TOGETHER_API_KEY,
    base_url="https://together.hconeai.com/v1",
    default_headers={"Helicone-Auth": f"Bearer {HELICON_API_KEY}"},
)

groq_client = OpenAI(
    api_key=GROQ_API_KEY,
    base_url="https://groq.hconeai.com/openai/v1",
    default_headers={"Helicone-Auth": f"Bearer {HELICON_API_KEY}"},
)

# Groq model names
llm_default_small = "llama3-8b-8192"
llm_default_medium = "llama3-70b-8192"

# Together model names (fallback)
llm_fallback_small = "meta-llama/Llama-3-8b-chat-hf"
llm_fallback_medium = "meta-llama/Llama-3-70b-chat-hf"

### ------ END OF LLM CONFIG ------ ###


def together_response(message, model=llm_default_small, SysPrompt=SysPromptDefault, temperature=0.2, frequency_penalty=0.1, max_tokens=2000):
    messages = [{"role": "system", "content": SysPrompt}, {"role": "user", "content": message}]
    params = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "frequency_penalty": frequency_penalty,
        "max_tokens": max_tokens,
    }
    try:
        # Prefer Groq; fall back to the equivalent Together AI model if the call fails.
        response = groq_client.chat.completions.create(**params)
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error calling GROQ API: {e}")
        params["model"] = llm_fallback_small if model == llm_default_small else llm_fallback_medium
        response = together_client.chat.completions.create(**params)
        return response.choices[0].message.content


def json_from_text(text):
    """
    Extracts JSON from text using regex and fuzzy JSON loading.
    """
    try:
        return json.loads(text)
    except Exception:
        match = re.search(r'\{[\s\S]*\}', text)
        json_out = match.group(0) if match else text
        # Use fuzzy JSON loading
        return loads(json_out)


def remove_stopwords(text):
    # Requires the NLTK 'stopwords' and 'punkt' data to be available.
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)


def rephrase_content(data_format, content, query):
    if data_format == "Structured data":
        return together_response(
            f"return only the factual information regarding the query: {{{query}}}. Output should be concise chunks of "
            f"paragraphs or tables or both, using the scraped context:{{{limit_tokens(content)}}}",
            SysPrompt=SysPromptData,
            max_tokens=500,
        )
    elif data_format == "Quantitative data":
        return together_response(
            f"return only the numerical or quantitative data regarding the query: {{{query}}} structured into .md tables, using the scraped context:{{{limit_tokens(content, token_limit=1000)}}}",
            SysPrompt=SysPromptData,
            max_tokens=500,
        )
    else:
        return together_response(
            f"return only the factual information regarding the query: {{{query}}} using the scraped context:{{{limit_tokens(content, token_limit=1000)}}}",
            SysPrompt=SysPromptData,
            max_tokens=500,
        )


class Scraper:
    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})

    @retry(tries=3, delay=1)
    def fetch_content(self, url):
        try:
            response = self.session.get(url, timeout=2)
            if response.status_code == 200:
                return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
        return None


def extract_main_content(html):
    if html:
        plain_text = ""
        soup = BeautifulSoup(html, 'lxml')
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
            plain_text += element.get_text(separator=" ", strip=True) + "\n"
        return plain_text
    return ""


def process_content(data_format, url, query):
    scraper = Scraper()
    html_content = scraper.fetch_content(url)
    if html_content:
        content = extract_main_content(html_content)
        if content:
            rephrased_content = rephrase_content(
                data_format=data_format,
                content=limit_tokens(remove_stopwords(content), token_limit=1000),
                query=query,
            )
            return rephrased_content, url
    return "", url


def fetch_and_extract_content(data_format, urls, query):
    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        future_to_url = {
            executor.submit(process_content, data_format, url, query): url
            for url in urls
        }
        all_text_with_urls = [future.result() for future in as_completed(future_to_url)]
    return all_text_with_urls


def search_brave(query, num_results=5):
    brave = Brave(BRAVE_API_KEY)
    search_results = brave.search(q=query, count=num_results)
    return [str(url) for url in search_results.urls]
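

# --- End-to-end usage sketch (assumption: this driver block is not in the original module) ---
# It chains the helpers above: Brave search -> parallel scrape and LLM summarisation of each
# result -> a combined markdown report -> flat HTML via md_to_html. The query string is
# illustrative; "Structured data" is one of the format values handled by rephrase_content.
if __name__ == "__main__":
    example_query = "latest developments in solid-state batteries"
    urls = search_brave(example_query, num_results=3)
    texts_with_urls = fetch_and_extract_content("Structured data", urls, example_query)
    report_md = "\n\n".join(f"{text}\n\nSource: {url}" for text, url in texts_with_urls if text)
    print(md_to_html(report_md))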