File size: 7,812 Bytes
22894bf 2af6ea9 22894bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
"""
This module provides functions for working with PDF files and URLs. It uses the urllib.request library
to download files from URLs, and the fitz library to extract text from PDF files. And GPT3 modules to generate
text completions.
"""
import fitz
import re
import numpy as np
import tensorflow_hub as hub
import openai
import gradio as gr
import os
from sklearn.neighbors import NearestNeighbors
from pdfQuestions import dataFromPDF
def preprocess(text):
text = text.replace('\n', ' ')
text = re.sub('\s+', ' ', text)
return text
def pdf_to_text(path, start_page=1, end_page=None):
doc = fitz.open(path)
total_pages = doc.page_count
if end_page is None:
end_page = total_pages
text_list = []
for i in range(start_page-1, end_page):
text = doc.load_page(i).get_text("text")
text = preprocess(text)
text_list.append(text)
doc.close()
return text_list
def text_to_chunks(texts, word_length=150, start_page=1):
text_toks = [t.split(' ') for t in texts]
page_nums = []
chunks = []
for idx, words in enumerate(text_toks):
for i in range(0, len(words), word_length):
chunk = words[i:i+word_length]
if (i+word_length) > len(words) and (len(chunk) < word_length) and (
len(text_toks) != (idx+1)):
text_toks[idx+1] = chunk + text_toks[idx+1]
continue
chunk = ' '.join(chunk).strip()
chunk = f'[{idx+start_page}]' + ' ' + '"' + chunk + '"'
chunks.append(chunk)
return chunks
class SemanticSearch:
def __init__(self):
self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
self.fitted = False
def fit(self, data, batch=1000, n_neighbors=5):
self.data = data
self.embeddings = self.get_text_embedding(data, batch=batch)
n_neighbors = min(n_neighbors, len(self.embeddings))
self.nn = NearestNeighbors(n_neighbors=n_neighbors)
self.nn.fit(self.embeddings)
self.fitted = True
def __call__(self, text, return_data=True):
inp_emb = self.use([text])
neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
if return_data:
return [self.data[i] for i in neighbors]
else:
return neighbors
def get_text_embedding(self, texts, batch=1000):
embeddings = []
for i in range(0, len(texts), batch):
text_batch = texts[i:(i+batch)]
emb_batch = self.use(text_batch)
embeddings.append(emb_batch)
embeddings = np.vstack(embeddings)
return embeddings
def load_recommender(path, start_page=1):
global recommender
texts = pdf_to_text(path, start_page=start_page)
chunks = text_to_chunks(texts, start_page=start_page)
recommender.fit(chunks)
return 'Corpus Loaded.'
def generate_text(openAI_key,prompt, engine="text-davinci-003"):
openai.api_key = openAI_key
completions = openai.Completion.create(
engine=engine,
prompt=prompt,
max_tokens=512,
n=1,
stop=None,
temperature=0.7,
)
message = completions.choices[0].text
return message
def generate_answer(question,openAI_key):
topn_chunks = recommender(question)
prompt = ""
prompt += 'search results:\n\n'
for c in topn_chunks:
prompt += c + '\n\n'
prompt += "Instructions: Compose a comprehensive reply to the query using the search results given. "\
"Cite each reference using [ Page Number] notation (every result has this number at the beginning). "\
"Citation should be done at the end of each sentence. If the search results mention multiple subjects "\
"with the same name, create separate answers for each. Only include information found in the results and "\
"don't add any additional information. Make sure the answer is correct and don't output false content. "\
"If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "\
"search results which has nothing to do with the question. Only answer what is asked. The "\
"answer should be short and concise. Answer step-by-step. \n\nQuery: {question}\nAnswer: "
prompt += f"Query: {question}\nAnswer:"
answer = generate_text(openAI_key, prompt,"text-davinci-003")
return answer
def question_answer(file,question,openAI_key):
if openAI_key.strip()=='':
return '[ERROR]: Please enter you Open AI Key. Get your key here : https://platform.openai.com/account/api-keys'
file_paths = ["./Smart Protect Goal Brochure.pdf", "./Future Wealth Gain Brochure.pdf"]
for path in file_paths:
if file in path:
load_recommender(path)
print(path)
if question.strip() == '':
return '[ERROR]: Question field is empty'
return generate_answer(question,openAI_key)
def allQuestion(file):
questionOptions1 = ["What are the various options under Life Cover Variant?","What are the Add-on covers options under Variant description Life Cover?","What is the total claim covered under Minor and Major CI?","What is Waiver of Premium Benefit on CI?","What is Annualized Premium under Life Cover Variant?","Does the ROP include GST?","Under what condition is Add-on Covers applicable?","What is the duration period of premiums for CIB & WOPBI?","What is the maximum maturity age with ROP under the variant?","What is the maximum maturity age with Whole Life under the variant?"]
questionOptions2 = ["What is Future Wealth Gain plan?","If the customer has done a partial withdrawls, is he eligible for the Loyalty Additions/Fund Boosters?","What are the steps to select the plan?","What are the maturity benefits available in the wealth plus variant of this plan?","How can one revive a discontinued policy?",'What are the tax benefit options available under this policy?','What are the features under "Wealth Plus" & "Wealth Plus Care" Variant?',"Can I switch between the funds?"]
if file == None:
return '[ERROR]: Provide select atleast one option.'
if file == "Smart Protect Goal Brochure":
question = questionOptions1
if file == "Future Wealth Gain Brochure":
question = questionOptions2
return gr.Dropdown.update(choices=question)
recommender = SemanticSearch()
title = 'Madgical Chatbots - PDF GPT'
description = """ This chatbot will generate answer from PDF file :"""
options = ['Smart Protect Goal Brochure', 'Future Wealth Gain Brochure']
newOption = None
with gr.Blocks() as demo:
gr.Markdown(f'<center><h1>{title}</h1></center>')
gr.Markdown(f'<center><h4>{description}</h4></center>')
with gr.Row():
with gr.Group():
openAI_key=gr.Textbox(label='Enter your OpenAI API key here')
file = gr.Dropdown(options, label="Select PDF file from below options")
btn2 = gr.Button(value='Submit')
question = gr.Dropdown(newOption,label="Select questions from below options")
btn = gr.Button(value='Submit')
btn.style(full_width=True)
btn2.style(full_width=True)
with gr.Group():
answer = gr.Textbox(label='Medgical Chatbot Answer :')
with gr.Group():
pdfData = gr.Textbox(label='Answer from PDF :')
btn.click(question_answer, inputs=[file, question,openAI_key], outputs=[answer])
btn.click(dataFromPDF, inputs=[question], outputs=[pdfData])
btn2.click(allQuestion, inputs=[file], outputs=[question])
if __name__ == "__main__":
demo.launch() |