bhavyapandya committed on
Commit
50512a1
1 Parent(s): 452f3da

Upload 5 files

app.py ADDED
@@ -0,0 +1,405 @@
+ import gradio as gr
+ import tensorflow as tf
+ import re
+ import string
+ from tokenizers import Tokenizer
+ import numpy as np
+
+ # Load the trained subword tokenizers for Hindi (target) and English (source).
+ hind_tokenizer = Tokenizer.from_file("hind_tokenizer.json")
+ eng_tokenizer = Tokenizer.from_file("eng_tokenizer.json")
+
+ def clean_english_text(text):
+     # Remove special characters and digits
+     text = re.sub(r"[^a-zA-Z\s]", "", text)
+
+     # Convert to lowercase
+     text = text.lower()
+
+     # Remove punctuation
+     text = text.translate(str.maketrans("", "", string.punctuation))
+
+     # Remove extra whitespace and strip
+     text = re.sub(r"\s+", " ", text).strip()
+
+     return text
+
+
+ max_sequence_length = 50
+
+ # Encode an English sentence into token IDs and pad the sequence to max_sequence_length.
+ def encode_and_pad(sentence):
+     encoding = eng_tokenizer.encode(sentence)
+     encoded_ids = encoding.ids[:max_sequence_length]
+     padding_length = max_sequence_length - len(encoded_ids)
+     attention_mask = [1] * len(encoded_ids) + [0] * padding_length
+     padded_ids = encoded_ids + [0] * padding_length
+     return padded_ids, attention_mask
+
+
+ def positional_encoding(length, depth):
+     # Sinusoidal positional encodings, half sine and half cosine channels.
+     depth = depth / 2
+
+     positions = np.arange(length)[:, np.newaxis]       # (seq, 1)
+     depths = np.arange(depth)[np.newaxis, :] / depth   # (1, depth)
+
+     angle_rates = 1 / (10000**depths)     # (1, depth)
+     angle_rads = positions * angle_rates  # (pos, depth)
+
+     pos_encoding = np.concatenate(
+         [np.sin(angle_rads), np.cos(angle_rads)],
+         axis=-1)
+
+     return tf.cast(pos_encoding, dtype=tf.float32)
+
+
+ class PositionalEmbedding(tf.keras.layers.Layer):
+     def __init__(self, vocab_size, d_model):
+         super().__init__()
+         self.d_model = d_model
+         self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
+         self.pos_encoding = positional_encoding(length=2048, depth=d_model)
+
+     def compute_mask(self, *args, **kwargs):
+         return self.embedding.compute_mask(*args, **kwargs)
+
+     def call(self, x):
+         length = tf.shape(x)[1]
+         x = self.embedding(x)
+         # This factor sets the relative scale of the embedding and positional encoding.
+         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+         x = x + self.pos_encoding[tf.newaxis, :length, :]
+         return x
+
+
+ class BaseAttention(tf.keras.layers.Layer):
+     def __init__(self, **kwargs):
+         super().__init__()
+         self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
+         self.layernorm = tf.keras.layers.LayerNormalization()
+         self.add = tf.keras.layers.Add()
+
+
+ class CrossAttention(BaseAttention):
+     def call(self, x, context):
+         attn_output, attn_scores = self.mha(
+             query=x,
+             key=context,
+             value=context,
+             return_attention_scores=True)
+
+         # Cache the attention scores for plotting later.
+         self.last_attn_scores = attn_scores
+
+         x = self.add([x, attn_output])
+         x = self.layernorm(x)
+
+         return x
+
+
+ class GlobalSelfAttention(BaseAttention):
+     def call(self, x):
+         attn_output = self.mha(
+             query=x,
+             value=x,
+             key=x)
+         x = self.add([x, attn_output])
+         x = self.layernorm(x)
+         return x
+
+
+ class CausalSelfAttention(BaseAttention):
+     def call(self, x):
+         attn_output = self.mha(
+             query=x,
+             value=x,
+             key=x,
+             use_causal_mask=True)
+         x = self.add([x, attn_output])
+         x = self.layernorm(x)
+         return x
+
+
+ class FeedForward(tf.keras.layers.Layer):
+     def __init__(self, d_model, dff, dropout_rate=0.1):
+         super().__init__()
+         self.seq = tf.keras.Sequential([
+             tf.keras.layers.Dense(dff, activation='relu'),
+             tf.keras.layers.Dense(d_model),
+             tf.keras.layers.Dropout(dropout_rate)
+         ])
+         self.add = tf.keras.layers.Add()
+         self.layer_norm = tf.keras.layers.LayerNormalization()
+
+     def call(self, x):
+         x = self.add([x, self.seq(x)])
+         x = self.layer_norm(x)
+         return x
+
+
+ class EncoderLayer(tf.keras.layers.Layer):
+     def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
+         super().__init__()
+
+         self.self_attention = GlobalSelfAttention(
+             num_heads=num_heads,
+             key_dim=d_model,
+             dropout=dropout_rate)
+
+         self.ffn = FeedForward(d_model, dff)
+
+     def call(self, x):
+         x = self.self_attention(x)
+         x = self.ffn(x)
+         return x
+
+
+ class Encoder(tf.keras.layers.Layer):
+     def __init__(self, *, num_layers, d_model, num_heads,
+                  dff, vocab_size, dropout_rate=0.1):
+         super().__init__()
+
+         self.d_model = d_model
+         self.num_layers = num_layers
+
+         self.pos_embedding = PositionalEmbedding(
+             vocab_size=vocab_size, d_model=d_model)
+
+         self.enc_layers = [
+             EncoderLayer(d_model=d_model,
+                          num_heads=num_heads,
+                          dff=dff,
+                          dropout_rate=dropout_rate)
+             for _ in range(num_layers)]
+         self.dropout = tf.keras.layers.Dropout(dropout_rate)
+
+     def call(self, x):
+         # `x` is token-IDs shape: (batch, seq_len)
+         x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.
+
+         # Add dropout.
+         x = self.dropout(x)
+
+         for i in range(self.num_layers):
+             x = self.enc_layers[i](x)
+
+         return x  # Shape `(batch_size, seq_len, d_model)`.
+
+
+ class DecoderLayer(tf.keras.layers.Layer):
+     def __init__(self,
+                  *,
+                  d_model,
+                  num_heads,
+                  dff,
+                  dropout_rate=0.1):
+         super(DecoderLayer, self).__init__()
+
+         self.causal_self_attention = CausalSelfAttention(
+             num_heads=num_heads,
+             key_dim=d_model,
+             dropout=dropout_rate)
+
+         self.cross_attention = CrossAttention(
+             num_heads=num_heads,
+             key_dim=d_model,
+             dropout=dropout_rate)
+
+         self.ffn = FeedForward(d_model, dff)
+
+     def call(self, x, context):
+         x = self.causal_self_attention(x=x)
+         x = self.cross_attention(x=x, context=context)
+
+         # Cache the last attention scores for plotting later.
+         self.last_attn_scores = self.cross_attention.last_attn_scores
+
+         x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
+         return x
+
+
+ class Decoder(tf.keras.layers.Layer):
+     def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
+                  dropout_rate=0.1):
+         super(Decoder, self).__init__()
+
+         self.d_model = d_model
+         self.num_layers = num_layers
+
+         self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
+                                                  d_model=d_model)
+         self.dropout = tf.keras.layers.Dropout(dropout_rate)
+         self.dec_layers = [
+             DecoderLayer(d_model=d_model, num_heads=num_heads,
+                          dff=dff, dropout_rate=dropout_rate)
+             for _ in range(num_layers)]
+
+         self.last_attn_scores = None
+
+     def call(self, x, context):
+         # `x` is token-IDs shape (batch, target_seq_len)
+         x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)
+
+         x = self.dropout(x)
+
+         for i in range(self.num_layers):
+             x = self.dec_layers[i](x, context)
+
+         self.last_attn_scores = self.dec_layers[-1].last_attn_scores
+
+         # The shape of x is (batch_size, target_seq_len, d_model).
+         return x
+
+
+ class Transformer(tf.keras.Model):
+     def __init__(self, *, num_layers, d_model, num_heads, dff,
+                  input_vocab_size, target_vocab_size, dropout_rate=0.1):
+         super().__init__()
+         self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
+                                num_heads=num_heads, dff=dff,
+                                vocab_size=input_vocab_size,
+                                dropout_rate=dropout_rate)
+
+         self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
+                                num_heads=num_heads, dff=dff,
+                                vocab_size=target_vocab_size,
+                                dropout_rate=dropout_rate)
+
+         self.final_layer = tf.keras.layers.Dense(target_vocab_size)
+
+     def call(self, inputs):
+         # To use a Keras model with `.fit` you must pass all your inputs in the
+         # first argument.
+         context, x = inputs
+
+         context = self.encoder(context)  # (batch_size, context_len, d_model)
+
+         x = self.decoder(x, context)  # (batch_size, target_len, d_model)
+
+         # Final linear layer output.
+         logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)
+
+         try:
+             # Drop the keras mask, so it doesn't scale the losses/metrics.
+             # b/250038731
+             del logits._keras_mask
+         except AttributeError:
+             pass
+
+         # Return the final output; the attention weights are cached on the decoder.
+         return logits
+
+
+ class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+     def __init__(self, d_model, warmup_steps=4000):
+         super().__init__()
+
+         self.d_model = d_model
+         self.d_model = tf.cast(self.d_model, tf.float32)
+
+         self.warmup_steps = warmup_steps
+
+     def __call__(self, step):
+         step = tf.cast(step, dtype=tf.float32)
+         arg1 = tf.math.rsqrt(step)
+         arg2 = step * (self.warmup_steps ** -1.5)
+
+         return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
+
+
+ # Hyperparameters; these must match the configuration used to train the saved checkpoint.
+ num_layers = 6
+ d_model = 512
+ dff = 512
+ num_heads = 12
+ dropout_rate = 0.1
+
+
+ def masked_loss(label, pred):
+     mask = label != 0
+     loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
+         from_logits=True, reduction='none')
+     loss = loss_object(label, pred)
+
+     mask = tf.cast(mask, dtype=loss.dtype)
+     loss *= mask
+
+     loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
+     return loss
+
+
+ def masked_accuracy(label, pred):
+     pred = tf.argmax(pred, axis=2)
+     label = tf.cast(label, pred.dtype)
+     match = label == pred
+
+     mask = label != 0
+
+     match = match & mask
+
+     match = tf.cast(match, dtype=tf.float32)
+     mask = tf.cast(mask, dtype=tf.float32)
+     return tf.reduce_sum(match) / tf.reduce_sum(mask)
+
+
+ transformer = Transformer(
+     num_layers=num_layers,
+     d_model=d_model,
+     num_heads=num_heads,
+     dff=dff,
+     input_vocab_size=eng_tokenizer.get_vocab_size(),
+     target_vocab_size=hind_tokenizer.get_vocab_size(),
+     dropout_rate=dropout_rate)
+
+ learning_rate = CustomSchedule(d_model)
+
+ optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
+                                      epsilon=1e-9)
+
+ transformer.compile(
+     loss=masked_loss,
+     optimizer=optimizer,
+     metrics=[masked_accuracy])
+
+ # Restore the trained weights from the uploaded checkpoint.
+ transformer.load_weights("best_weights_6_512_512")
+
+
+ class Translator(tf.Module):
+     def __init__(self, eng_tokenizer, hind_tokenizer, transformer):
+         self.eng_tokenizer = eng_tokenizer
+         self.hind_tokenizer = hind_tokenizer
+         self.transformer = transformer
+
+     def __call__(self, sentence, max_length=50):
+         # Tokenize the English sentence, truncate to the model's input length and
+         # pad with zeros so the encoder input has shape (1, max_sequence_length).
+         input_ids = self.eng_tokenizer.encode(sentence).ids[:max_sequence_length]
+         input_ids = input_ids + [0] * (max_sequence_length - len(input_ids))
+         encoder_input = tf.convert_to_tensor([input_ids])
+
+         # The output language is Hindi, so initialize the decoder input with the
+         # Hindi `<START>` token and stop once `<END>` is produced.
+         start = self.hind_tokenizer.encode("<START>").ids[0]
+         end = self.hind_tokenizer.encode("<END>").ids[0]
+
+         output_array = [[start]]
+         for _ in range(max_length):
+             predictions = self.transformer(
+                 [encoder_input, tf.convert_to_tensor(output_array)], training=False)
+
+             # Select the last token from the `seq_len` dimension.
+             predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.
+             predicted_id = int(tf.argmax(predictions, axis=-1)[0, 0])
+
+             # Append the predicted ID to the sequence fed back to the decoder.
+             output_array[0].append(predicted_id)
+
+             if predicted_id == end:
+                 break
+
+         return self.hind_tokenizer.decode(output_array[0])
+
+
+ translator = Translator(eng_tokenizer, hind_tokenizer, transformer)
+
+
+ # Run the model's inference: clean the input, translate it, and strip the
+ # `<START>`/`<END>` markers from the decoded output.
+ def text_transform(input_text):
+     translation = translator(clean_english_text(input_text))
+     return ' '.join(translation.split()[1:-1])
+
+
+ # Create a Gradio interface around the translation function.
+ iface = gr.Interface(
+     fn=text_transform,   # Function that performs the inference
+     inputs="text",       # English sentence in
+     outputs="text"       # Hindi translation out
+ )
+
+ # Start the Gradio interface.
+ iface.launch()
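
Before serving the model through Gradio, it can help to confirm that the restored checkpoint, the tokenizers, and the greedy decoder fit together. A minimal smoke-test sketch, not part of the commit: it assumes app.py has been executed up to (but not including) `iface.launch()`, and the example sentences are arbitrary.

import numpy as np

# 1. The vocabulary sizes used to build the Transformer come from the tokenizers,
#    so they should match the checkpoint the weights were trained with.
print(eng_tokenizer.get_vocab_size(), hind_tokenizer.get_vocab_size())

# 2. A forward pass with dummy token IDs should produce logits of shape
#    (batch, target_len, target_vocab_size).
dummy_context = np.ones((1, 50), dtype="int32")  # placeholder English token IDs
dummy_target = np.ones((1, 5), dtype="int32")    # placeholder Hindi prefix
logits = transformer([dummy_context, dummy_target], training=False)
print(logits.shape)  # expected: (1, 5, hind_tokenizer.get_vocab_size())

# 3. End-to-end greedy decoding on a couple of inputs.
for sentence in ["how are you", "i like to read books"]:
    print(sentence, "->", text_transform(sentence))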
best_weights_6_512_512.index ADDED
Binary file (49.9 kB).
 
eng_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
hind_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
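
The two JSON files are plain Hugging Face `tokenizers` serializations, loaded in app.py with `Tokenizer.from_file`. A quick inspection sketch (the example sentence is arbitrary):

from tokenizers import Tokenizer

eng = Tokenizer.from_file("eng_tokenizer.json")
hind = Tokenizer.from_file("hind_tokenizer.json")

enc = eng.encode("how are you")
print(enc.tokens)            # subword pieces
print(enc.ids)               # the IDs fed to the encoder in app.py
print(eng.get_vocab_size())  # equals input_vocab_size used to build the model
print(hind.get_vocab_size()) # equals target_vocab_size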
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ tokenizers
+ keras_nlp
+ tensorflow
+ gradio
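
Once these dependencies are installed and app.py is running, the interface can also be queried programmatically. A sketch using the separate `gradio_client` package (an extra assumption, it is not listed in requirements.txt), against the local URL printed by `iface.launch()` and Gradio's default `/predict` endpoint:

from gradio_client import Client

# Replace with the Space URL when deployed.
client = Client("http://127.0.0.1:7860")

result = client.predict("how are you", api_name="/predict")
print(result)  # Hindi translation produced by text_transform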