File size: 3,731 Bytes
14cac88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pke
from sense2vec import Sense2Vec
import time
import gradio as gr
from transformers import AutoTokenizer
import os
from pathlib import Path
from FastT5 import get_onnx_runtime_sessions, OnnxT5

# commands = [
#     "curl -LO https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz",
#     "tar -xvf  s2v_reddit_2015_md.tar.gz",
# ]

# for command in commands:
#     return_code = os.system(command)
#     if return_code == 0:
#         print(f"Command '{command}' executed successfully")
#     else:
#         print(f"Command '{command}' failed with return code {return_code}")

# Sense2vec vectors used to find distractor words for each keyphrase.
s2v = Sense2Vec().from_disk("s2v_old")

trained_model_path = './t5_squad_v1/'
pretrained_model_name = Path(trained_model_path).stem

# Quantized ONNX exports: (encoder, decoder, init-decoder) — order matters
# for get_onnx_runtime_sessions.
model_paths = tuple(
    os.path.join(trained_model_path,
                 f"{pretrained_model_name}-{part}_quantized.onnx")
    for part in ("encoder", "decoder", "init-decoder")
)
model_sessions = get_onnx_runtime_sessions(model_paths)
model = OnnxT5(trained_model_path, model_sessions)

tokenizer = AutoTokenizer.from_pretrained(trained_model_path)


def get_question(sentence, answer, mdl, tknizer):
    """Generate a question whose answer is `answer`, grounded in `sentence`.

    Args:
        sentence: Context passage the question should be answerable from.
        answer: Target answer span/keyword for the question.
        mdl: Seq2seq model exposing `generate` (ONNX-backed T5 here).
        tknizer: Tokenizer matching `mdl`.

    Returns:
        The generated question string with any leading "question:" label
        removed and surrounding whitespace stripped.
    """
    text = f"context: {sentence} answer: {answer}"
    print(text)
    max_len = 256
    # `padding=False` replaces the deprecated `pad_to_max_length=False`;
    # behavior is identical: truncate to max_len, add no padding.
    encoding = tknizer.encode_plus(
        text, max_length=max_len, padding=False, truncation=True,
        return_tensors="pt")
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    # Beam search with a no-repeat-ngram constraint keeps output fluent.
    outs = mdl.generate(input_ids=input_ids,
                        attention_mask=attention_mask,
                        early_stopping=True,
                        num_beams=5,
                        num_return_sequences=1,
                        no_repeat_ngram_size=2,
                        max_length=300)

    dec = [tknizer.decode(ids, skip_special_tokens=True) for ids in outs]

    # The model emits "question: <text>"; drop the label.
    question = dec[0].replace("question:", "").strip()
    return question


def generate_question(context, answer):
    """Generate a question for (context, answer) and log wall-clock latency."""
    started = time.time()
    question = get_question(context, answer, model, tokenizer)
    elapsed = time.time() - started
    print(f"Latency: {elapsed} seconds")
    return question


def generate_mcq(context):
    """Build MCQ items (question, keyword answer, two distractors) from text.

    Keyphrases are extracted with TopicRank; distractors are the two nearest
    sense2vec neighbours of each keyphrase.

    Args:
        context: Free-form passage to generate questions from.

    Returns:
        List of dicts with keys "Question", "Keyword", "Distractor1",
        "Distractor2". Keyphrases with no sense2vec sense, or with fewer
        than two neighbours, are skipped.
    """
    extractor = pke.unsupervised.TopicRank()
    extractor.load_document(input=context, language='en')
    extractor.candidate_selection(pos={"NOUN", "PROPN", "ADJ"})
    extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=10)

    results = []

    for keyword, _ in keyphrases:
        original_keyword = keyword
        # sense2vec keys are lowercase with underscores, e.g. "new_york".
        keyword = original_keyword.lower().replace(" ", "_")
        sense = s2v.get_best_sense(keyword)
        if sense is None:
            continue

        most_similar = s2v.most_similar(sense, n=2)
        distractors = [word.split("|")[0].lower().replace("_", " ")
                       for word, _ in most_similar]
        # Guard: most_similar may return fewer than n neighbours; the
        # original indexed distractors[1] unconditionally (IndexError).
        if len(distractors) < 2:
            continue

        question = generate_question(context, original_keyword)

        results.append({
            "Question": question,
            "Keyword": original_keyword,
            "Distractor1": distractors[0],
            "Distractor2": distractors[1],
        })

    return results


# Gradio UI: one context textbox in, generated MCQ list out as JSON.
iface = gr.Interface(
    fn=generate_mcq,
    inputs=gr.Textbox(label="Context", type='text'),
    # Explicit empty-list default. The original passed `value=list` (the
    # class object); Gradio calls callables to obtain defaults, yielding
    # the same [] — this states the intent directly.
    outputs=gr.JSON(value=[]),
    title="Questgen AI",
    description="Enter a context to generate MCQs for keywords."
)

iface.launch()