import gradio as gr
import pdfplumber
from transformers import pipeline
from io import BytesIO
import re

# Shared extractive question-answering pipeline. It is created once at import
# time so every call to answer_questions() reuses the same loaded model.
qa_pipeline = pipeline(
    task="question-answering",
    model="deepset/gelectra-large-germanquad",
)

def extract_text_from_pdf(file_obj):
    """Return the text of all pages of a PDF as one space-separated string.

    Args:
        file_obj: The PDF source; it is passed unchanged to
            ``pdfplumber.open``.

    Returns:
        The per-page results of ``page.extract_text()`` in page order, joined
        with a single space. Pages whose extraction result is falsy (``None``
        or an empty string) are skipped, so the result is ``""`` when no page
        yields any text.
    """
    with pdfplumber.open(file_obj) as document:
        # Lazily extract each page; the join below consumes the generator
        # while the PDF is still open.
        extracted = (page.extract_text() for page in document.pages)
        return " ".join(chunk for chunk in extracted if chunk)

def answer_questions(context):
    """Answer a fixed set of German questions against the given context.

    Args:
        context: The text the answers are extracted from.

    Returns:
        A dict mapping each predefined question (in definition order) to the
        ``'answer'`` string returned by ``qa_pipeline`` for that question.
        When ``context`` is empty or contains only whitespace, every question
        maps to ``""`` and the model is not invoked.
    """
    questions = [
        "Welches ist das Titel des Moduls?",
        "Welches ist das Sektor oder das Kernthema?",
        "Welches ist das Land?",
        "Zu welchem Program oder EZ-Programm gehört das Projekt?"
    ]

    # The question-answering pipeline rejects an empty context with a
    # ValueError. That situation is reachable whenever the source document
    # yields no text, so report "no answer" instead of crashing.
    if not context or not context.strip():
        return {question: "" for question in questions}

    return {
        question: qa_pipeline(question=question, context=context)['answer']
        for question in questions
    }

def process_pdf(file):
    """Extract the text of an uploaded PDF and answer the predefined questions.

    Args:
        file: The value delivered by the Gradio file input. Accepted forms:
            ``None`` (nothing uploaded), raw ``bytes``, a filesystem path
            string, or a file-like object (optionally exposing the on-disk
            path as a string ``name`` attribute, as temporary-file wrappers
            do).

    Returns:
        One ``"<question>: <answer>"`` line per predefined question, joined
        with newlines; or a short explanatory message when no file was
        uploaded or the PDF contains no extractable text.
    """
    if file is None:
        return "Please upload a PDF file."

    if isinstance(file, (bytes, bytearray)):
        source = BytesIO(file)
    else:
        # A path string has no usable ``name`` and is passed through as is.
        # For file-like objects the on-disk path is preferred over the open
        # handle, because the handle's read position is not guaranteed to be
        # at the start of the file.
        path = getattr(file, "name", None)
        source = path if isinstance(path, str) and hasattr(file, "read") else file

    text = extract_text_from_pdf(source)

    # Image-only (scanned) PDFs yield no text; the QA model cannot work on an
    # empty context, so tell the user instead of failing.
    if not text.strip():
        return "No extractable text was found in the PDF."

    results = answer_questions(text)
    return "\n".join(f"{q}: {a}" for q, a in results.items())

# Define the Gradio interface.
# Components come from the top-level namespace (gr.File / gr.Textbox): the
# legacy gr.inputs / gr.outputs namespaces are deprecated in Gradio 3 and
# removed in Gradio 4, and the legacy File input has no "pdf" type. The upload
# is restricted to PDFs through file_types instead.
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload your PDF file", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Extracted Information and Answers"),
    title="PDF Text Extractor and Question Answerer",
    description="Upload a PDF file to extract text and answer predefined questions based on the content."
)

# Start the Gradio web UI only when this file is run as a script, so that
# importing the module does not launch a server.
if __name__ == "__main__":
    iface.launch()