File size: 2,860 Bytes
442d312
472a794
 
bcbee82
472a794
442d312
 
 
 
bcbee82
442d312
 
 
472a794
442d312
 
981b258
442d312
 
 
 
 
 
 
 
 
 
 
 
 
 
d1391ee
442d312
 
 
 
d1391ee
442d312
 
 
a3e1a90
442d312
a3e1a90
442d312
 
 
 
 
 
 
 
 
dbd16a5
442d312
 
 
 
 
 
 
 
 
 
 
 
89b0019
 
 
 
 
 
442d312
a3e1a90
442d312
 
 
 
 
 
 
 
 
 
 
 
7ef436d
7e61444
442d312
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
import os

os.system("pip install pdfminer.six rank_bm25 torch transformers")

from gradio.mix import Series
import re
from rank_bm25 import BM25Okapi
import string 
import torch
from transformers import pipeline
import pdfminer
from pdfminer.high_level import extract_text
#from termcolor import colored

def read_pdf(file):
    """Extract the text of a PDF and cut it into overlapping character windows.

    Args:
        file: Object whose ``name`` attribute is the path passed to
            pdfminer's ``extract_text``.

    Returns:
        A list of strings, each at most 400 characters long. Consecutive
        windows start 350 characters apart, so neighbours share 50
        characters. Empty text yields an empty list.
    """
    window = 400
    shared = 50
    stride = window - shared

    full_text = extract_text(file.name)
    return [
        full_text[start:start + window]
        for start in range(0, len(full_text), stride)
    ]
  
# BM25 serves as the retriever: it performs the first round of candidate filtering using word-based matching.

def bm25_tokenizer(text):
    """Turn raw text into the lower-cased word tokens used for BM25 matching.

    The text is lower-cased and split on whitespace; each piece then has
    leading and trailing ASCII punctuation stripped. Pieces that end up
    empty, or that are one of a small set of stop words, are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        The surviving tokens, in their original order.
    """
    ignored = {'a', 'the', 'am', 'is', 'are', 'who', 'how', 'where', 'when', 'why'}
    cleaned = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [piece for piece in cleaned if piece and piece not in ignored]

def retrieval(query, top_k_retriver, docs, bm25_):
    """Return the best BM25 matches for *query*, highest score first.

    Args:
        query: Question text; it is tokenized with ``bm25_tokenizer``.
        top_k_retriver: Maximum number of hits to return.
        docs: Sequence of text chunks, indexed by the same positions as the
            scores returned by ``bm25_``.
        bm25_: Object exposing ``get_scores(tokens)`` that returns one score
            per entry of ``docs``.

    Returns:
        A list of dicts with keys ``'corpus_id'``, ``'score'`` and ``'docs'``,
        ordered by descending score. Only chunks with a strictly positive
        score are included, so the list may be shorter than
        ``top_k_retriver`` or empty.
    """
    # numpy is not imported at module level, so the original `np.argsort`
    # call raised NameError; import it here to keep the function self-contained.
    import numpy as np

    bm25_scores = bm25_.get_scores(bm25_tokenizer(query))
    # Reversing the ascending argsort gives indices in descending score
    # order, so no further sorting of the hits is needed.
    top_n = np.argsort(bm25_scores)[::-1][:top_k_retriver]
    return [
        {'corpus_id': idx, 'score': bm25_scores[idx], 'docs': docs[idx]}
        for idx in top_n
        if bm25_scores[idx] > 0
    ]

# Extractive question-answering pipeline shared by qa_ranker(). It is built
# once at import time, so the model is not re-created on every request.
qa_model = pipeline("question-answering", 
                    model = "deepset/roberta-base-squad2")
                    
def qa_ranker(query, docs_, top_k_ranker):
    """Run the QA model over each passage and keep the best-scoring answers.

    Args:
        query: The question to answer.
        docs_: Iterable of passages; each one is used as the model context.
        top_k_ranker: Maximum number of answers to return.

    Returns:
        Up to ``top_k_ranker`` results from ``qa_model``, ordered by
        descending ``'score'``. Each result additionally carries the passage
        it was extracted from under the ``'doc'`` key.
    """
    def answer_from(passage):
        # Tag the model output with its source passage.
        result = qa_model(question=query, context=passage)
        result['doc'] = passage
        return result

    candidates = [answer_from(passage) for passage in docs_]
    candidates.sort(key=lambda item: item['score'], reverse=True)
    return candidates[:top_k_ranker]
    
def final_qa_pipeline(file, query):
    """Answer *query* from the contents of an uploaded PDF.

    The PDF text is split into chunks by ``read_pdf``, the chunks are
    narrowed to the top BM25 candidates by ``retrieval``, and the candidates
    are re-ranked by ``qa_ranker``.

    Args:
        file: Uploaded file object, passed to ``read_pdf``.
        query: The question to answer.

    Returns:
        A ``(answer, score)`` tuple for the best-ranked answer, or
        ``("No match", 0)`` when the PDF yields no text chunks or no chunk
        has a positive BM25 score for the query.
    """
    top_k_retriver, top_k_ranker = 10, 1

    docs = read_pdf(file)
    tokenized_corpus = [bm25_tokenizer(doc) for doc in docs]

    # BM25Okapi divides by the corpus size when computing the average
    # document length, so an empty corpus (e.g. a PDF with no extractable
    # text) would raise ZeroDivisionError. Report it as "no match" instead.
    if not tokenized_corpus:
        return ("No match", 0)

    bm25 = BM25Okapi(tokenized_corpus)
    lvl1 = retrieval(query, top_k_retriver, docs, bm25)
    if not lvl1:
        return ("No match", 0)

    fnl_rank = qa_ranker(query, [hit["docs"] for hit in lvl1], top_k_ranker)
    best = fnl_rank[0]
    return (best["answer"], best["score"])
        
# Gradio UI: one PDF upload and one question textbox feed final_qa_pipeline,
# whose (answer, score) tuple is shown in the two output components.
# NOTE(review): gr.inputs / gr.outputs look like the legacy component
# namespaces — confirm they exist in the Gradio version this app runs on.
iface = gr.Interface(
   fn = final_qa_pipeline,
   inputs = [gr.inputs.File(label="input pdf file"), gr.inputs.Textbox(label="Question:")],
   outputs = [gr.outputs.HTML(label="Answer"), gr.outputs.HTML(label="Score")]
   )
# Starts the web app as soon as this module is executed.
iface.launch()