samarthagarwal23 committed on
Commit
442d312
1 Parent(s): df1cdb5

Create app.py

Files changed (1)
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
+ import gradio as gr
+ import string
+ import numpy as np
+ from rank_bm25 import BM25Okapi
+ from sklearn.feature_extraction import _stop_words
+ from transformers import pipeline
+ from pdfminer.high_level import extract_text
+
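+ # PDF question answering in two stages: BM25 retrieves candidate text
+ # chunks from the uploaded PDF, then an extractive QA model picks the
+ # best answer span from those candidates.
+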
+ def read_pdf(file):
+     text = extract_text(file)
+     # Split text into smaller, overlapping docs
+     len_doc = 400
+     overlap = 50
+     docs = []
+
+     i = 0
+     while i < len(text):
+         docs.append(text[i:i+len_doc])
+         i = i + len_doc - overlap
+     return docs
+
+ # We use BM25 as the retriever, which does a first round of candidate
+ # filtering based on word-level matching.
+
+ def bm25_tokenizer(text):
+     tokenized_doc = []
+     for token in text.lower().split():
+         token = token.strip(string.punctuation)
+
+         if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
+             tokenized_doc.append(token)
+     return tokenized_doc
+
+ def retrieval(query, top_k_retriever, docs):
+     # Build the BM25 index at query time, since each request may come
+     # with a different uploaded file
+     tokenized_corpus = [bm25_tokenizer(doc) for doc in docs]
+     bm25 = BM25Okapi(tokenized_corpus)
+
+     bm25_scores = bm25.get_scores(bm25_tokenizer(query))
+     top_n = np.argsort(bm25_scores)[::-1][:top_k_retriever]
+     bm25_hits = [{'corpus_id': idx,
+                   'score': bm25_scores[idx],
+                   'docs': docs[idx]} for idx in top_n if bm25_scores[idx] > 0]
+     bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
+
+     return bm25_hits
+
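+ # Second stage: an extractive QA model re-scores the retrieved chunks
+ # and extracts the most likely answer span from each.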
+ qa_model = pipeline("question-answering",
+                     model="deepset/roberta-base-squad2")
+
+ def qa_ranker(query, docs_, top_k_ranker):
+     ans = []
+     for doc in docs_:
+         answer = qa_model(question=query,
+                           context=doc)
+         answer['doc'] = doc
+         ans.append(answer)
+     return sorted(ans, key=lambda x: x['score'], reverse=True)[:top_k_ranker]
+
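+ # Glue code for Gradio: retrieve candidates, rank them, and return the
+ # single best answer with its confidence score.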
+ def final_qa_pipeline(file, query):
+     docs = read_pdf(file)
+     top_k_retriever, top_k_ranker = 10, 1
+     lvl1 = retrieval(query, top_k_retriever, docs)
+
+     if len(lvl1) > 0:
+         fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker)
+         return (fnl_rank[0]["answer"], fnl_rank[0]["score"])
+     else:
+         return ("No match", 0)
+
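+ # Gradio UI: upload a PDF, type a question, and get back the answer
+ # span and its confidence score.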
+ iface = gr.Interface(
+     fn=final_qa_pipeline,
+     inputs=[gr.inputs.File(label="input pdf file"), gr.inputs.Textbox(label="Question:")],
+     outputs=[gr.outputs.HTML(label="Answer"), gr.outputs.HTML(label="Score")]
+ )
+ iface.launch()