ViXuan committed
Commit 14cac88 • 1 Parent(s): 94a2f2d

Improved Inference
.gitignore CHANGED
@@ -3,4 +3,5 @@ venv
  s2v_reddit_2015_md.tar.gz
  __pycache__
  s2v_old
- ._s2v_old
+ ._s2v_old
+ %Projects%School%questgen
app.py CHANGED
@@ -1,11 +1,25 @@
- import pke
- from sense2vec import Sense2Vec
- import time
  import gradio as gr
- from transformers import AutoTokenizer
+ import time
+ from pprint import pprint
+ import numpy
  import os
  from pathlib import Path
- from FastT5 import get_onnx_runtime_sessions, OnnxT5
+ from FastT5 import OnnxT5, get_onnx_runtime_sessions
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer
+ from flashtext import KeywordProcessor
+ from nltk.tokenize import sent_tokenize
+ from similarity.normalized_levenshtein import NormalizedLevenshtein
+ from nltk.corpus import brown
+ from nltk.corpus import stopwords
+ from nltk import FreqDist
+ import nltk
+ import pke
+ import string
+ from collections import OrderedDict
+ from sense2vec import Sense2Vec
+ import spacy
+ import random
+ import torch

  commands = [
      "curl -LO https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz",
@@ -19,96 +33,619 @@ for command in commands:
      else:
          print(f"Command '{command}' failed with return code {return_code}")

- s2v = Sense2Vec().from_disk("s2v_old")
-
- trained_model_path = './t5_squad_v1/'
-
- pretrained_model_name = Path(trained_model_path).stem
-
- encoder_path = os.path.join(
-     trained_model_path, f"{pretrained_model_name}-encoder_quantized.onnx")
- decoder_path = os.path.join(
-     trained_model_path, f"{pretrained_model_name}-decoder_quantized.onnx")
- init_decoder_path = os.path.join(
-     trained_model_path, f"{pretrained_model_name}-init-decoder_quantized.onnx")
-
- model_paths = encoder_path, decoder_path, init_decoder_path
- model_sessions = get_onnx_runtime_sessions(model_paths)
- model = OnnxT5(trained_model_path, model_sessions)
-
- tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
-
-
- def get_question(sentence, answer, mdl, tknizer):
-     text = "context: {} answer: {}".format(sentence, answer)
-     print(text)
-     max_len = 256
-     encoding = tknizer.encode_plus(
-         text, max_length=max_len, pad_to_max_length=False, truncation=True, return_tensors="pt")
-     input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
-     outs = mdl.generate(input_ids=input_ids,
-                         attention_mask=attention_mask,
-                         early_stopping=True,
-                         num_beams=5,
-                         num_return_sequences=1,
-                         no_repeat_ngram_size=2,
-                         max_length=300)
-
-     dec = [tknizer.decode(ids, skip_special_tokens=True) for ids in outs]
-
-     Question = dec[0].replace("question:", "")
-     Question = Question.strip()
-     return Question
-
-
- def generate_question(context, answer):
-     start_time = time.time()  # Record the start time
-     result = get_question(context, answer, model, tokenizer)
-     end_time = time.time()  # Record the end time
-     latency = end_time - start_time  # Calculate latency
-     print(f"Latency: {latency} seconds")
-     return result
-
-
- def generate_mcq(context):
-     extractor = pke.unsupervised.TopicRank()
-     extractor.load_document(input=context, language='en')
-     extractor.candidate_selection(pos={"NOUN", "PROPN", "ADJ"})
-     extractor.candidate_weighting()
+
+ def greedy_decoding(inp_ids, attn_mask, model, tokenizer):
+     greedy_output = model.generate(
+         input_ids=inp_ids, attention_mask=attn_mask, max_length=256)
+     Question = tokenizer.decode(
+         greedy_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+     return Question.strip().capitalize()
+
+
+ def beam_search_decoding(inp_ids, attn_mask, model, tokenizer):
+     beam_output = model.generate(input_ids=inp_ids,
+                                  attention_mask=attn_mask,
+                                  max_length=256,
+                                  num_beams=10,
+                                  num_return_sequences=3,
+                                  no_repeat_ngram_size=2,
+                                  early_stopping=True
+                                  )
+     Questions = [tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True) for out in
+                  beam_output]
+     return [Question.strip().capitalize() for Question in Questions]
+
+
+ def topkp_decoding(inp_ids, attn_mask, model, tokenizer):
+     topkp_output = model.generate(input_ids=inp_ids,
+                                   attention_mask=attn_mask,
+                                   max_length=256,
+                                   do_sample=True,
+                                   top_k=40,
+                                   top_p=0.80,
+                                   num_return_sequences=3,
+                                   no_repeat_ngram_size=2,
+                                   early_stopping=True
+                                   )
+     Questions = [tokenizer.decode(
+         out, skip_special_tokens=True, clean_up_tokenization_spaces=True) for out in topkp_output]
+     return [Question.strip().capitalize() for Question in Questions]
+
+
+ nltk.download('brown')
+ nltk.download('stopwords')
+ nltk.download('popular')
+
+
+ def MCQs_available(word, s2v):
+     word = word.replace(" ", "_")
+     sense = s2v.get_best_sense(word)
+     return sense is not None
+
+
+ def edits(word):
+     "All edits that are one edit away from `word`."
+     letters = f'abcdefghijklmnopqrstuvwxyz {string.punctuation}'
+     splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+     deletes = [L + R[1:] for L, R in splits if R]
+     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
+     replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+     inserts = [L + c + R for L, R in splits for c in letters]
+     return set(deletes + transposes + replaces + inserts)
+
+
+ def sense2vec_get_words(word, s2v):
+     output = []
+
+     word_preprocessed = word.translate(
+         word.maketrans("", "", string.punctuation))
+     word_preprocessed = word_preprocessed.lower()
+
+     word_edits = edits(word_preprocessed)
+
+     word = word.replace(" ", "_")
+
+     sense = s2v.get_best_sense(word)
+     most_similar = s2v.most_similar(sense, n=15)
+
+     compare_list = [word_preprocessed]
+     for each_word in most_similar:
+         append_word = each_word[0].split("|")[0].replace("_", " ")
+         append_word = append_word.strip()
+         append_word_processed = append_word.lower()
+         append_word_processed = append_word_processed.translate(
+             append_word_processed.maketrans("", "", string.punctuation))
+         if append_word_processed not in compare_list and word_preprocessed not in append_word_processed and append_word_processed not in word_edits:
+             output.append(append_word.title())
+             compare_list.append(append_word_processed)
+
+     return list(OrderedDict.fromkeys(output))
+
+
+ def get_options(answer, s2v):
+     distractors = []
+
+     try:
+         distractors = sense2vec_get_words(answer, s2v)
+         if len(distractors) > 0:
+             print(" Sense2vec_distractors successful for word : ", answer)
+             return distractors, "sense2vec"
+     except Exception:
+         print(" Sense2vec_distractors failed for word : ", answer)
+
+     return distractors, "None"
+
+
+ def tokenize_sentences(text):
+     sentences = [sent_tokenize(text)]
+     sentences = [y for x in sentences for y in x]
+     return [sentence.strip() for sentence in sentences if len(sentence) > 20]
+
+
+ def get_sentences_for_keyword(keywords, sentences):
+     keyword_processor = KeywordProcessor()
+     keyword_sentences = {}
+     for word in keywords:
+         word = word.strip()
+         keyword_sentences[word] = []
+         keyword_processor.add_keyword(word)
+     for sentence in sentences:
+         keywords_found = keyword_processor.extract_keywords(sentence)
+         for key in keywords_found:
+             keyword_sentences[key].append(sentence)
+
+     for key, values in keyword_sentences.items():
+         values = sorted(values, key=len, reverse=True)
+         keyword_sentences[key] = values
+
+     delete_keys = [k for k, v in keyword_sentences.items() if len(v) == 0]
+     for del_key in delete_keys:
+         del keyword_sentences[del_key]
+
+     return keyword_sentences
+
+
+ def is_far(words_list, currentword, thresh, normalized_levenshtein):
+     threshold = thresh
+     score_list = [
+         normalized_levenshtein.distance(word.lower(), currentword.lower())
+         for word in words_list
+     ]
+     return min(score_list) >= threshold
+
+
+ def filter_phrases(phrase_keys, max_phrases, normalized_levenshtein):
+     filtered_phrases = []
+     if len(phrase_keys) > 0:
+         filtered_phrases.append(phrase_keys[0])
+         for ph in phrase_keys[1:]:
+             if is_far(filtered_phrases, ph, 0.7, normalized_levenshtein):
+                 filtered_phrases.append(ph)
+             if len(filtered_phrases) >= max_phrases:
+                 break
+     return filtered_phrases
+
+
+ def get_nouns_multipartite(text):
+     out = []
+
+     extractor = pke.unsupervised.MultipartiteRank()
+     extractor.load_document(input=text, language='en')
+     pos = {'PROPN', 'NOUN'}
+     stoplist = list(string.punctuation)
+     stoplist += stopwords.words('english')
+     extractor.candidate_selection(pos=pos)
+     # Build the Multipartite graph and rank candidates using random walk;
+     # alpha controls the weight adjustment mechanism, see TopicRank for
+     # threshold/method parameters.
+     try:
+         extractor.candidate_weighting(alpha=1.1,
+                                       threshold=0.75,
+                                       method='average')
+     except Exception:
+         return out
+
      keyphrases = extractor.get_n_best(n=10)

-     results = []
-
-     for keyword, _ in keyphrases:
-         original_keyword = keyword
-         keyword = original_keyword.lower().replace(" ", "_")
-         sense = s2v.get_best_sense(keyword)
-
-         if sense is not None:
-             most_similar = s2v.most_similar(sense, n=2)
-             distractors = [word.split("|")[0].lower().replace(
-                 "_", " ") for word, _ in most_similar]
-
-             question = generate_question(context, original_keyword)
-
-             result = {
-                 "Question": question,
-                 "Keyword": original_keyword,
-                 "Distractor1": distractors[0],
-                 "Distractor2": distractors[1]
-             }
-
-             results.append(result)
-
-     return results
-
-
+     out.extend(key[0] for key in keyphrases)
+     return out
+
+
+ def get_phrases(doc):
+     phrases = {}
+     for np in doc.noun_chunks:
+         phrase = np.text
+         len_phrase = len(phrase.split())
+         if len_phrase > 1:
+             phrases[phrase] = 1 if phrase not in phrases else phrases[phrase]+1
+     phrase_keys = list(phrases.keys())
+     phrase_keys = sorted(phrase_keys, key=lambda x: len(x), reverse=True)
+     return phrase_keys[:50]
+
+
+ def get_keywords(nlp, text, max_keywords, s2v, fdist, normalized_levenshtein, no_of_sentences):
+     doc = nlp(text)
+     max_keywords = int(max_keywords)
+
+     keywords = get_nouns_multipartite(text)
+     keywords = sorted(keywords, key=lambda x: fdist[x])
+     keywords = filter_phrases(keywords, max_keywords, normalized_levenshtein)
+
+     phrase_keys = get_phrases(doc)
+     filtered_phrases = filter_phrases(
+         phrase_keys, max_keywords, normalized_levenshtein)
+
+     total_phrases = keywords + filtered_phrases
+
+     total_phrases_filtered = filter_phrases(total_phrases, min(
+         max_keywords, 2*no_of_sentences), normalized_levenshtein)
+
+     answers = []
+     for answer in total_phrases_filtered:
+         if answer not in answers and MCQs_available(answer, s2v):
+             answers.append(answer)
+
+     return answers[:max_keywords]
+
+
+ def generate_questions_mcq(keyword_sent_mapping, device, tokenizer, model, sense2vec, normalized_levenshtein):
+     batch_text = []
+     answers = keyword_sent_mapping.keys()
+     for answer in answers:
+         txt = keyword_sent_mapping[answer]
+         context = f"context: {txt}"
+         text = f"{context} answer: {answer} </s>"
+         batch_text.append(text)
+
+     encoding = tokenizer.batch_encode_plus(
+         batch_text, pad_to_max_length=True, return_tensors="pt")
+
+     print("Running model for generation")
+     input_ids, attention_masks = encoding["input_ids"].to(
+         device), encoding["attention_mask"].to(device)
+
+     with torch.no_grad():
+         outs = model.generate(input_ids=input_ids,
+                               attention_mask=attention_masks,
+                               max_length=150)
+
+     output_array = {"questions": []}
+     # print(outs)
+     for index, val in enumerate(answers):
+         out = outs[index, :]
+         dec = tokenizer.decode(out, skip_special_tokens=True,
+                                clean_up_tokenization_spaces=True)
+
+         Question = dec.replace("question:", "")
+         Question = Question.strip()
+         individual_question = {
+             "question_statement": Question,
+             "question_type": "MCQ",
+             "answer": val,
+             "id": index + 1,
+         }
+         individual_question["options"], individual_question["options_algorithm"] = get_options(
+             val, sense2vec)
+
+         individual_question["options"] = filter_phrases(
+             individual_question["options"], 10, normalized_levenshtein)
+         index = 3
+         individual_question["extra_options"] = individual_question["options"][index:]
+         individual_question["options"] = individual_question["options"][:index]
+         individual_question["context"] = keyword_sent_mapping[val]
+
+         if len(individual_question["options"]) > 0:
+             output_array["questions"].append(individual_question)
+
+     return output_array
+
+
+ # for normal one word questions
+ def generate_normal_questions(keyword_sent_mapping, device, tokenizer, model):
+     batch_text = []
+     answers = keyword_sent_mapping.keys()
+     for answer in answers:
+         txt = keyword_sent_mapping[answer]
+         context = f"context: {txt}"
+         text = f"{context} answer: {answer} </s>"
+         batch_text.append(text)
+
+     encoding = tokenizer.batch_encode_plus(
+         batch_text, pad_to_max_length=True, return_tensors="pt")
+
+     print("Running model for generation")
+     input_ids, attention_masks = encoding["input_ids"].to(
+         device), encoding["attention_mask"].to(device)
+
+     with torch.no_grad():
+         outs = model.generate(input_ids=input_ids,
+                               attention_mask=attention_masks,
+                               max_length=150)
+
+     output_array = {"questions": []}
+     for index, val in enumerate(answers):
+         out = outs[index, :]
+         dec = tokenizer.decode(out, skip_special_tokens=True,
+                                clean_up_tokenization_spaces=True)
+
+         Question = dec.replace('question:', '')
+         Question = Question.strip()
+
+         individual_quest = {
+             'Question': Question,
+             'Answer': val,
+             "id": index + 1,
+             "context": keyword_sent_mapping[val],
+         }
+         output_array["questions"].append(individual_quest)
+
+     return output_array
+
+
+ def random_choice():
+     a = random.choice([0, 1])
+     return bool(a)
+
+
+ class QGen:
+
+     def __init__(self):
+
+         trained_model_path = './model/'
+
+         pretrained_model_name = Path(trained_model_path).stem
+
+         encoder_path = os.path.join(
+             trained_model_path, f"{pretrained_model_name}-encoder_quantized.onnx")
+         decoder_path = os.path.join(
+             trained_model_path, f"{pretrained_model_name}-decoder_quantized.onnx")
+         init_decoder_path = os.path.join(
+             trained_model_path, f"{pretrained_model_name}-init-decoder_quantized.onnx")
+
+         model_paths = encoder_path, decoder_path, init_decoder_path
+         model_sessions = get_onnx_runtime_sessions(model_paths)
+         model = OnnxT5(trained_model_path, model_sessions)
+
+         self.tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         model.to(device)
+         # model.eval()
+         self.device = device
+         self.model = model
+         self.nlp = spacy.load('en_core_web_sm')
+
+         self.s2v = Sense2Vec().from_disk('s2v_old')
+
+         self.fdist = FreqDist(brown.words())
+         self.normalized_levenshtein = NormalizedLevenshtein()
+         self.set_seed(42)
+
+     def set_seed(self, seed):
+         numpy.random.seed(seed)
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+
+     def predict_mcq(self, payload):
+         start = time.time()
+         inp = {
+             "input_text": payload.get("input_text"),
+             "max_questions": payload.get("max_questions", 4)
+         }
+
+         text = inp['input_text']
+         sentences = tokenize_sentences(text)
+         joiner = " "
+         modified_text = joiner.join(sentences)
+
+         keywords = get_keywords(
+             self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
+
+         keyword_sentence_mapping = get_sentences_for_keyword(
+             keywords, sentences)
+
+         for k in keyword_sentence_mapping.keys():
+             text_snippet = " ".join(keyword_sentence_mapping[k][:3])
+             keyword_sentence_mapping[k] = text_snippet
+
+         final_output = {}
+
+         if len(keyword_sentence_mapping.keys()) != 0:
+             try:
+                 generated_questions = generate_questions_mcq(
+                     keyword_sentence_mapping, self.device, self.tokenizer, self.model, self.s2v, self.normalized_levenshtein)
+             except Exception:
+                 return final_output
+             end = time.time()
+
+             final_output["statement"] = modified_text
+             final_output["questions"] = generated_questions["questions"]
+             final_output["time_taken"] = end-start
+
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+
+         return final_output
+
+     def predict_shortq(self, payload):
+         inp = {
+             "input_text": payload.get("input_text"),
+             "max_questions": payload.get("max_questions", 4)
+         }
+
+         text = inp['input_text']
+         sentences = tokenize_sentences(text)
+         joiner = " "
+         modified_text = joiner.join(sentences)
+
+         keywords = get_keywords(
+             self.nlp, modified_text, inp['max_questions'], self.s2v, self.fdist, self.normalized_levenshtein, len(sentences))
+
+         keyword_sentence_mapping = get_sentences_for_keyword(
+             keywords, sentences)
+
+         for k in keyword_sentence_mapping.keys():
+             text_snippet = " ".join(keyword_sentence_mapping[k][:3])
+             keyword_sentence_mapping[k] = text_snippet
+
+         final_output = {}
+
+         if len(keyword_sentence_mapping.keys()) == 0:
+             print('ZERO')
+             return final_output
+         else:
+             generated_questions = generate_normal_questions(
+                 keyword_sentence_mapping, self.device, self.tokenizer, self.model)
+             print(generated_questions)
+
+             final_output["statement"] = modified_text
+             final_output["questions"] = generated_questions["questions"]
+
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+
+         return final_output
+
+     def paraphrase(self, payload):
+         start = time.time()
+         inp = {
+             "input_text": payload.get("input_text"),
+             "max_questions": payload.get("max_questions", 3)
+         }
+
+         text = inp['input_text']
+         num = inp['max_questions']
+
+         self.sentence = text
+         self.text = f"paraphrase: {self.sentence} </s>"
+
+         encoding = self.tokenizer.encode_plus(
+             self.text, pad_to_max_length=True, return_tensors="pt")
+         input_ids, attention_masks = encoding["input_ids"].to(
+             self.device), encoding["attention_mask"].to(self.device)
+
+         beam_outputs = self.model.generate(
+             input_ids=input_ids,
+             attention_mask=attention_masks,
+             max_length=50,
+             num_beams=50,
+             num_return_sequences=num,
+             no_repeat_ngram_size=2,
+             early_stopping=True
+         )
+
+         # print ("\nOriginal Question ::")
+         # print (text)
+         # print ("\n")
+         # print ("Paraphrased Questions :: ")
+         final_outputs = []
+         for beam_output in beam_outputs:
+             sent = self.tokenizer.decode(
+                 beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+             if sent.lower() != self.sentence.lower() and sent not in final_outputs:
+                 final_outputs.append(sent)
+
+         output = {
+             'Question': text,
+             'Count': num,
+             'Paraphrased Questions': final_outputs,
+         }
+         for i, final_output in enumerate(final_outputs):
+             print(f"{i}: {final_output}")
+
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+
+         return output
+
+
+ class BoolQGen:
+
+     def __init__(self):
+         self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
+         model = T5ForConditionalGeneration.from_pretrained(
+             'ramsrigouthamg/t5_boolean_questions')
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         model.to(device)
+         # model.eval()
+         self.device = device
+         self.model = model
+         self.set_seed(42)
+
+     def set_seed(self, seed):
+         numpy.random.seed(seed)
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+
+     def random_choice(self):
+         a = random.choice([0, 1])
+         return bool(a)
+
+     def predict_boolq(self, payload):
+         start = time.time()
+         inp = {
+             "input_text": payload.get("input_text"),
+             "max_questions": payload.get("max_questions", 4)
+         }
+
+         text = inp['input_text']
+         num = inp['max_questions']
+         sentences = tokenize_sentences(text)
+         joiner = " "
+         modified_text = joiner.join(sentences)
+         answer = self.random_choice()
+         form = f"truefalse: {modified_text} passage: {answer} </s>"
+
+         encoding = self.tokenizer.encode_plus(form, return_tensors="pt")
+         input_ids, attention_masks = encoding["input_ids"].to(
+             self.device), encoding["attention_mask"].to(self.device)
+
+         output = beam_search_decoding(
+             input_ids, attention_masks, self.model, self.tokenizer)
+         if self.device.type == 'cuda':
+             torch.cuda.empty_cache()
+
+         return {'Text': text, 'Count': num, 'Boolean Questions': output}
+
+
+ class AnswerPredictor:
+
+     def __init__(self):
+         self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
+         model = T5ForConditionalGeneration.from_pretrained('Parth/boolean')
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         model.to(device)
+         # model.eval()
+         self.device = device
+         self.model = model
+         self.set_seed(42)
+
+     def set_seed(self, seed):
+         numpy.random.seed(seed)
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+
+     def greedy_decoding(self, inp_ids, attn_mask, model, tokenizer):
+         greedy_output = model.generate(
+             input_ids=inp_ids, attention_mask=attn_mask, max_length=256
+         )
+         Question = tokenizer.decode(
+             greedy_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         return Question.strip().capitalize()
+
+     def predict_answer(self, payload):
+         start = time.time()
+         inp = {
+             "input_text": payload.get("input_text"),
+             "input_question": payload.get("input_question")
+         }
+
+         context = inp["input_text"]
+         question = inp["input_question"]
+         input_text = f"question: {question} <s> context: {context} </s>"
+
+         encoding = self.tokenizer.encode_plus(input_text, return_tensors="pt")
+         input_ids, attention_masks = encoding["input_ids"].to(
+             self.device), encoding["attention_mask"].to(self.device)
+         greedy_output = self.model.generate(
+             input_ids=input_ids, attention_mask=attention_masks, max_length=256)
+         Question = self.tokenizer.decode(
+             greedy_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         return Question.strip().capitalize()
+
+
+ qg = QGen()
+ # Gradio wrapper around QGen.predict_mcq
+
+
+ def generate_mcq(input_text, max_questions):
+     payload = {
+         "input_text": input_text,
+         "max_questions": max_questions
+     }
+
+     return qg.predict_mcq(payload)
+
+
+ # Create a Gradio interface
  iface = gr.Interface(
      fn=generate_mcq,
-     inputs=gr.Textbox(label="Context", type='text'),
-     outputs=gr.JSON(value=list),
-     title="Questgen AI",
-     description="Enter a context to generate MCQs for keywords."
+     inputs=[
+         gr.Textbox(label="Input Text"),
+         gr.Number(label="Max Questions", value=4, minimum=1, maximum=10)
+     ],
+     outputs=gr.JSON(label="Generated MCQs"),
  )

+ # Launch the Gradio app
  iface.launch()
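The refactor turns the script into an importable API: QGen wraps the quantized ONNX T5 for MCQ, short-answer, and paraphrase generation, while BoolQGen and AnswerPredictor load stock T5 checkpoints. A minimal driver for the new class outside Gradio (a sketch only, not part of the commit: it assumes the setup commands have already unpacked s2v_old and that the quantized model sits in ./model/ per the renames below; the payload keys and result fields follow predict_mcq above):

    # Hypothetical usage sketch, with QGen from app.py in scope.
    qg = QGen()  # loads ONNX sessions, spaCy, Sense2Vec, and the Brown FreqDist
    payload = {
        "input_text": "Photosynthesis converts carbon dioxide and water into glucose and oxygen using sunlight.",
        "max_questions": 2,
    }
    result = qg.predict_mcq(payload)
    for q in result.get("questions", []):
        print(q["question_statement"], "| answer:", q["answer"], "| options:", q["options"])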
backup ADDED
@@ -0,0 +1,114 @@
+ import pke
+ from sense2vec import Sense2Vec
+ import time
+ import gradio as gr
+ from transformers import AutoTokenizer
+ import os
+ from pathlib import Path
+ from FastT5 import get_onnx_runtime_sessions, OnnxT5
+
+ # commands = [
+ #     "curl -LO https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz",
+ #     "tar -xvf s2v_reddit_2015_md.tar.gz",
+ # ]
+
+ # for command in commands:
+ #     return_code = os.system(command)
+ #     if return_code == 0:
+ #         print(f"Command '{command}' executed successfully")
+ #     else:
+ #         print(f"Command '{command}' failed with return code {return_code}")
+
+ s2v = Sense2Vec().from_disk("s2v_old")
+
+ trained_model_path = './t5_squad_v1/'
+
+ pretrained_model_name = Path(trained_model_path).stem
+
+ encoder_path = os.path.join(
+     trained_model_path, f"{pretrained_model_name}-encoder_quantized.onnx")
+ decoder_path = os.path.join(
+     trained_model_path, f"{pretrained_model_name}-decoder_quantized.onnx")
+ init_decoder_path = os.path.join(
+     trained_model_path, f"{pretrained_model_name}-init-decoder_quantized.onnx")
+
+ model_paths = encoder_path, decoder_path, init_decoder_path
+ model_sessions = get_onnx_runtime_sessions(model_paths)
+ model = OnnxT5(trained_model_path, model_sessions)
+
+ tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
+
+
+ def get_question(sentence, answer, mdl, tknizer):
+     text = f"context: {sentence} answer: {answer}"
+     print(text)
+     max_len = 256
+     encoding = tknizer.encode_plus(
+         text, max_length=max_len, pad_to_max_length=False, truncation=True, return_tensors="pt")
+     input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+     outs = mdl.generate(input_ids=input_ids,
+                         attention_mask=attention_mask,
+                         early_stopping=True,
+                         num_beams=5,
+                         num_return_sequences=1,
+                         no_repeat_ngram_size=2,
+                         max_length=300)
+
+     dec = [tknizer.decode(ids, skip_special_tokens=True) for ids in outs]
+
+     Question = dec[0].replace("question:", "")
+     Question = Question.strip()
+     return Question
+
+
+ def generate_question(context, answer):
+     start_time = time.time()  # Record the start time
+     result = get_question(context, answer, model, tokenizer)
+     end_time = time.time()  # Record the end time
+     latency = end_time - start_time  # Calculate latency
+     print(f"Latency: {latency} seconds")
+     return result
+
+
+ def generate_mcq(context):
+     extractor = pke.unsupervised.TopicRank()
+     extractor.load_document(input=context, language='en')
+     extractor.candidate_selection(pos={"NOUN", "PROPN", "ADJ"})
+     extractor.candidate_weighting()
+     keyphrases = extractor.get_n_best(n=10)
+
+     results = []
+
+     for keyword, _ in keyphrases:
+         original_keyword = keyword
+         keyword = original_keyword.lower().replace(" ", "_")
+         sense = s2v.get_best_sense(keyword)
+
+         if sense is not None:
+             most_similar = s2v.most_similar(sense, n=2)
+             distractors = [word.split("|")[0].lower().replace(
+                 "_", " ") for word, _ in most_similar]
+
+             question = generate_question(context, original_keyword)
+
+             result = {
+                 "Question": question,
+                 "Keyword": original_keyword,
+                 "Distractor1": distractors[0],
+                 "Distractor2": distractors[1]
+             }
+
+             results.append(result)
+
+     return results
+
+
+ iface = gr.Interface(
+     fn=generate_mcq,
+     inputs=gr.Textbox(label="Context", type='text'),
+     outputs=gr.JSON(value=list),
+     title="Questgen AI",
+     description="Enter a context to generate MCQs for keywords."
+ )
+
+ iface.launch()
{t5_squad_v1 → model}/config.json RENAMED
File without changes

t5_squad_v1/t5_squad_v1-decoder_quantized.onnx → model/model-decoder_quantized.onnx RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9fd0f8a3a4f7865ca2d31d1e6d1078c9a17c2f27e969ba6c137d5457694506b9
- size 149128510
+ oid sha256:2ffb8a9e30ec8feac698b3c775ecc6fd257af8a23e5f1533cc5b6bd9c00527e7
+ size 149128511

t5_squad_v1/t5_squad_v1-encoder_quantized.onnx → model/model-encoder_quantized.onnx RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:93835d3fc5cd7e6e0e9582409b86184be4a2df6e0db3d3d75bcbb7cf2b5ba696
- size 110045668
+ oid sha256:62cb66d200f9f83dd3f48773c5220ccc583fb5ebf5cef6948e45318a97160293
+ size 110045669

t5_squad_v1/t5_squad_v1-init-decoder_quantized.onnx → model/model-init-decoder_quantized.onnx RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8afab51caddafca6a74103d9fd233abd03cc43c979ae9c7e1066858b6a5dc26d
- size 163346037
+ oid sha256:4c87b0e48b3070064d385e799a13bfd7ed1aa7944cb88048f2f8eaae3e5c3536
+ size 163346038

{t5_squad_v1 → model}/ort_config.json RENAMED
File without changes

{t5_squad_v1 → model}/special_tokens_map.json RENAMED
File without changes

{t5_squad_v1 → model}/spiece.model RENAMED
File without changes

{t5_squad_v1 → model}/tokenizer.json RENAMED
The diff for this file is too large to render. See raw diff

{t5_squad_v1 → model}/tokenizer_config.json RENAMED
File without changes
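The directory rename from t5_squad_v1/ to model/ is load-bearing: QGen.__init__ derives the ONNX filenames from the directory stem, which is why the three weight files are renamed in lockstep. A quick illustration of that path logic (sketch only, mirroring the code above):

    from pathlib import Path
    import os

    trained_model_path = './model/'
    stem = Path(trained_model_path).stem  # -> 'model'
    print(os.path.join(trained_model_path, f"{stem}-encoder_quantized.onnx"))
    # -> ./model/model-encoder_quantized.onnx, matching the renamed LFS pointers above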
requirements.txt CHANGED
@@ -8,4 +8,6 @@ progress
  psutil
  sense2vec
  git+https://github.com/boudinfl/pke.git
- en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl
+ flashtext
+ strsim