andreeabodea
/

Extraction

Model card Files Files and versions Community

Extraction / app.py

andreeabodea's picture

Create app.py

e993c2b verified 6 months ago

history blame contribute delete

No virus

1.88 kB


	import gradio as gr
	import pdfplumber
	from transformers import pipeline
	from io import BytesIO
	import re

	# Build the extractive question-answering pipeline once, at import time, so
	# that answer_questions() can reuse the already-loaded German QA model.
	qa_pipeline = pipeline(
	    task="question-answering",
	    model="deepset/gelectra-large-germanquad",
	)

	def extract_text_from_pdf(file_obj):
	    """Return the text of all pages of a PDF as one space-joined string.

	    Args:
	        file_obj: Whatever ``pdfplumber.open`` accepts as the PDF source;
	            it is forwarded unchanged.

	    Returns:
	        The per-page texts joined with a single space. Pages for which
	        ``extract_text()`` yields nothing (``None`` or an empty string) are
	        left out, so the result is ``""`` when no page has any text.
	    """
	    with pdfplumber.open(file_obj) as document:
	        # Extract lazily, one page at a time, and keep only non-empty results.
	        per_page = (page.extract_text() for page in document.pages)
	        chunks = [chunk for chunk in per_page if chunk]
	    return " ".join(chunks)

	def answer_questions(context):
	    """Run every predefined German question against ``context``.

	    Args:
	        context: The text in which the question-answering pipeline looks for
	            the answers.

	    Returns:
	        A dict mapping each question string to the ``'answer'`` field of the
	        pipeline's result for that question, in the order the questions are
	        listed below.
	    """
	    prompts = (
	        "Welches ist das Titel des Moduls?",
	        "Welches ist das Sektor oder das Kernthema?",
	        "Welches ist das Land?",
	        "Zu welchem Program oder EZ-Programm gehört das Projekt?",
	    )
	    replies = {}
	    for prompt in prompts:
	        # One pipeline call per question; only the answer text is kept.
	        prediction = qa_pipeline(question=prompt, context=context)
	        replies[prompt] = prediction['answer']
	    return replies

	def process_pdf(file):
	    """Extract the text of an uploaded PDF and answer the predefined questions.

	    The shape of ``file`` depends on the Gradio version and on the File
	    component's ``type`` setting, so several shapes are accepted.

	    Args:
	        file: The upload as delivered by Gradio. One of:
	            * ``None`` - nothing was uploaded;
	            * ``str`` - a filesystem path to the uploaded PDF;
	            * ``bytes`` / ``bytearray`` - the raw PDF content;
	            * a file-like object - if its ``name`` attribute is the path of
	              an existing file, that path is opened (Gradio 3's temp-file
	              wrapper exposes the upload this way); otherwise the object
	              itself is read.

	    Returns:
	        One ``"question: answer"`` line per question, joined by newlines, or a
	        short explanatory message when there is no file or no extractable
	        text.
	    """
	    import os  # local import: only needed to validate a handle's ``name``

	    if file is None:
	        return "No PDF file was uploaded."

	    if isinstance(file, (bytes, bytearray)):
	        pdf_source = BytesIO(file)
	    elif isinstance(file, str):
	        pdf_source = file
	    else:
	        path = getattr(file, "name", None)
	        if isinstance(path, str) and os.path.isfile(path):
	            # Prefer the on-disk file: the handle itself may be empty or
	            # already positioned at its end.
	            pdf_source = path
	        else:
	            with file as handle:
	                pdf_source = BytesIO(handle.read())

	    text = extract_text_from_pdf(pdf_source)
	    if not text.strip():
	        # Image-only / scanned PDFs have no text layer; the QA pipeline
	        # rejects an empty context, so report it instead of crashing.
	        return "No extractable text was found in the PDF."

	    results = answer_questions(text)
	    return "\n".join(f"{q}: {a}" for q, a in results.items())

	# Define the Gradio interface.
	# Components come from the top-level `gr` namespace: the legacy `gr.inputs` /
	# `gr.outputs` modules are gone in Gradio 4, and "pdf" is not an accepted
	# value for the File component's `type`. Restricting uploads to PDFs is done
	# with `file_types` instead; `type` is left at the installed version's default.
	iface = gr.Interface(
	    fn=process_pdf,
	    inputs=gr.File(file_types=[".pdf"], label="Upload your PDF file"),
	    outputs=gr.Textbox(label="Extracted Information and Answers"),
	    title="PDF Text Extractor and Question Answerer",
	    description="Upload a PDF file to extract text and answer predefined questions based on the content.",
	)

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()