Hugging Face's logo Hugging Face Search models, datasets, users... Models Datasets Spaces Posts Docs Solutions Pricing Spaces: andreeabodea / Extract_Project_Report_Section_1 like 0 Logs App Files Community Settings Extract_Project_Report_Section_1 / app.py andreeabodea's picture andreeabodea Update app.py 536f374 VERIFIED about 2 hours ago raw history blame edit delete No virus 5.51 kB import os import pdfplumber import re import gradio as gr from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer from io import BytesIO import torch """ Extract the text from a section of a PDF file between 'wanted_section' and 'next_section'. Parameters: - path (str): The file path to the PDF file. - wanted_section (str): The section to start extracting text from. - next_section (str): The section to stop extracting text at. Returns: - text (str): The extracted text from the specified section range. """ def get_section(path, wanted_section, next_section): print(wanted_section) # Open the PDF file doc = pdfplumber.open(BytesIO(path)) start_page = [] end_page = [] # Find the all the pages for the specified sections for page in range(len(doc.pages)): if len(doc.pages[page].search(wanted_section, return_chars=False, case=False)) > 0: start_page.append(page) if len(doc.pages[page].search(next_section, return_chars=False, case=False)) > 0: end_page.append(page) # Extract the text between the start and end page of the wanted section text = [] for page_num in range(max(start_page), max(end_page)+1): page = doc.pages[page_num] text.append(page.extract_text()) text = " ".join(text) final_text = text.replace("\n", " ") return final_text def extract_between(big_string, start_string, end_string): # Use a non-greedy match for content between start_string and end_string pattern = re.escape(start_string) + '(.*?)' + re.escape(end_string) match = re.search(pattern, big_string, re.DOTALL) if match: # Return the content without the start and end strings return match.group(1) else: # Return None if the pattern is not found return None def format_section1(section1_text): result_section1_dict = {} result_section1_dict['TOPIC'] = extract_between(section1_text, "Sektor", "EZ-Programm") result_section1_dict['PROGRAM'] = extract_between(section1_text, "Sektor", "EZ-Programm") result_section1_dict['PROJECT DESCRIPTION'] = extract_between(section1_text, "EZ-Programmziel", "Datum der letzten BE") result_section1_dict['PROJECT NAME'] = extract_between(section1_text, "Modul", "Modulziel") result_section1_dict['OBJECTIVE'] = extract_between(section1_text, "Modulziel", "Berichtszeitraum") result_section1_dict['PROGRESS'] = extract_between(section1_text, "Zielerreichung des Moduls", "Massnahme im Zeitplan") result_section1_dict['STATUS'] = extract_between(section1_text, "Massnahme im Zeitplan", "Risikoeinschätzung") result_section1_dict['RECOMMENDATIONS'] = extract_between(section1_text, "Vorschläge zur Modulanpas-", "Voraussichtliche") return result_section1_dict def answer_questions(text,language="de"): # Initialize the zero-shot classification pipeline model_name = "deepset/gelectra-large-germanquad" model = AutoModelForQuestionAnswering.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) # Initialize the QA pipeline qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer) questions = [ "Welches ist das Titel des Moduls?", "Welches ist das Sektor oder das Kernthema?", "Welches ist das Land?", "Zu welchem Program oder EZ-Programm gehort das Projekt?" #"Welche Durchführungsorganisation aus den 4 Varianten 'giz', 'kfw', 'ptb' und 'bgr' implementiert das Projekt?" # "In dem Dokument was steht bei Sektor?", # "In dem Dokument was steht von 'EZ-Programm' bis 'EZ-Programmziel'?", # "In dem Dokument was steht bei EZ-Programmziel?", # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Modul?", # "In dem Dokument was steht bei Zielerreichung des Moduls?", # "In dem Dokument in dem Abschnitt '1. Kurzbeschreibung' was steht bei Maßnahme im Zeitplan?", # "In dem Dokument was steht bei Vorschläge zur Modulanpassung?", # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als erstes Datum?", # "In dem Dokument in dem Abschnitt 'Anlage 1: Wirkungsmatrix des Moduls' was steht unter Laufzeit als zweites Datum?" ] # Iterate over each question and get answers answers_dict = {} for question in questions: result = qa_pipeline(question=question, context=text) # print(f"Question: {question}") # print(f"Answer: {result['answer']}\n") answers_dict[question] = result['answer'] return answers_dict def process_pdf(path): results_dict = {} results_dict["1. Kurzbeschreibung"] = \ get_section(path, "1. Kurzbeschreibung", "2. Einordnung des Moduls") answers = answer_questions(results_dict["1. Kurzbeschreibung"]) return answers def get_first_page_text(file_data): doc = pdfplumber.open(BytesIO(file_data)) if len(doc.pages): return doc.pages[0].extract_text() if __name__ == "__main__": # Define the Gradio interface # iface = gr.Interface(fn=process_pdf, demo = gr.Interface(fn=process_pdf, inputs=gr.File(type="binary", label="Upload PDF"), outputs=gr.Textbox(label="Extracted Text"), title="PDF Text Extractor", description="Upload a PDF file to extract.") demo.launch()