ahella committed on
Commit
6043d93
1 Parent(s): c1ab089

Upload 2 files

Files changed (2)
  1. app.py +44 -0
  2. g_project.py +813 -0
app.py ADDED
@@ -0,0 +1,44 @@
+ # -*- coding: utf-8 -*-
+ """app.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1CbDOX8PDJB6ZyLZiLMXbPyr6k7dvrs20
+ """
+
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
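+ # NOTE: "qarib/bert-base-qarib" is a base pre-trained checkpoint, so the 2-label
+ # classification head created below starts from randomly initialized weights; for
+ # meaningful predictions, model_name should point at a checkpoint fine-tuned for
+ # offensive-language detection (e.g. the one trained in g_project.py).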
+ # Load the model and tokenizer
+ model_name = "qarib/bert-base-qarib"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+
+ # Preprocessing function
+ def light_preprocess(text):
+     text = text.replace("@USER", "").replace("RT", "").strip()
+     return text
+
+ # Prediction function
+ def predict_offensive(text):
+     preprocessed_text = light_preprocess(text)
+     inputs = tokenizer(preprocessed_text, return_tensors="pt", truncation=True, padding=True)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     logits = outputs.logits
+     predicted_class = torch.argmax(logits, dim=1).item()
+     return "Offensive" if predicted_class == 1 else "Not Offensive"
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=predict_offensive,
+     inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
+     outputs="text",
+     title="Offensive Language Detection",
+     description="Enter a text to check if it's offensive or not.",
+ )
+
+ # Launch the interface
+ iface.launch(share=True)
g_project.py ADDED
@@ -0,0 +1,813 @@
+ # -*- coding: utf-8 -*-
+ """G project.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/13NvZhwwfiJloW8ZsdQ6HLf-jfSRc-tfv
+ """
+
+ !wget "https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-train.txt"
+ !wget "https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-dev.txt"
+ !wget "https://alt.qcri.org/resources/OSACT2022/OSACT2022-sharedTask-test-tweets.txt"
+ !wget "https://alt.qcri.org/resources1/OSACT2022/OSACT2022-sharedTask-test-taskA-gold-labels.txt"
+
+ import pandas as pd
+ import csv
+ train_data = pd.read_csv("OSACT2022-sharedTask-train.txt", sep="\t", quoting=csv.QUOTE_NONE)
+ dev_data = pd.read_csv("OSACT2022-sharedTask-dev.txt", sep="\t", quoting=csv.QUOTE_NONE)
+ test_data = pd.read_csv("OSACT2022-sharedTask-test-tweets.txt", sep="\t", quoting=csv.QUOTE_NONE)
+ train_data
+
+ # The shared-task files have no header row, so pandas used the first tweet as column
+ # names; drop the extra label columns and rename the remaining ones to Text/label.
+ train_data = train_data.drop(columns=['1', 'NOT_HS', 'NOT_VLG', 'NOT_VIO'])
+ train_data
+
+ train_data = train_data.rename(columns={"@USER ردينا ع التطنز 😏👊🏻": "Text"})
+ train_data = train_data.rename(columns={"OFF": "label"})
+ train_data
+
+ dev_data
+
+ dev_data = dev_data.drop(columns=['8888', 'NOT_HS', 'NOT_VLG', 'NOT_VIO'])
+
+ dev_data = dev_data.rename(columns={"@USER افطرت عليك بعقاء واثنين من فروخها الجن 🔪😂": "Text"})
+ dev_data = dev_data.rename(columns={"NOT_OFF": "label"})
+ dev_data
+
+ test_data
+
+ test_data = test_data.drop(columns=['10158'])
+
+ test_data = test_data.rename(columns={"@USER هتهزر معايا ولا ايه 😡😡😡😡": "Text"})
+ test_data
+
+ test_labels = pd.read_csv("OSACT2022-sharedTask-test-taskA-gold-labels.txt", sep="\t", quoting=csv.QUOTE_NONE)
+ test_labels = test_labels.rename(columns={"NOT_OFF": "label"})
+ test_data = test_data.join(test_labels)
+ test_data
+
+ """# **DOWNLOADING A LIST OF ARABIC STOPWORDS**"""
+
+ # Alharbi, Alaa, and Mark Lee. "Kawarith: an Arabic Twitter Corpus for Crisis Events."
+ # Proceedings of the Sixth Arabic Natural Language Processing Workshop. 2021
+
+ !wget https://raw.githubusercontent.com/alaa-a-a/multi-dialect-arabic-stop-words/main/Stop-words/stop_list_1177.txt
+ arabic_stop_words = []
+ with open('./stop_list_1177.txt', encoding='utf-8') as f:
+     for word in f.readlines():
+         arabic_stop_words.append(word.split("\n")[0])
+
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import WordPunctTokenizer
+ from nltk.stem.isri import ISRIStemmer
+ import string
+ import re
+ from bs4 import BeautifulSoup
+ nltk.download('stopwords')
+
+
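+ # Text-cleaning helpers: normalize Arabic letter variants, strip diacritics and tatweel,
+ # remove Arabic/English punctuation, collapse characters repeated more than twice,
+ # optionally drop stop words, and strip mentions, hashtags, URLs, Latin letters and digits
+ # before tokenizing with WordPunctTokenizer.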
+ tok = WordPunctTokenizer()
+
+ def normalize_arabic(text):
+     text = re.sub("[إأآا]", "ا", text)
+     text = re.sub("ى", "ي", text)
+     text = re.sub("ؤ", "ء", text)
+     text = re.sub("ئ", "ء", text)
+     text = re.sub("ة", "ه", text)
+     text = re.sub("گ", "ك", text)
+     return text
+
+
+ def remove_diacritics(text):
+     arabic_diacritics = re.compile("""
+         ّ | # Tashdid
+         َ | # Fatha
+         ً | # Tanwin Fath
+         ُ | # Damma
+         ٌ | # Tanwin Damm
+         ِ | # Kasra
+         ٍ | # Tanwin Kasr
+         ْ | # Sukun
+         ـ   # Tatwil/Kashida
+     """, re.VERBOSE)
+     return re.sub(arabic_diacritics, '', text)
+
+
+ def remove_punctuations(text):
+     arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
+     english_punctuations = string.punctuation
+     punctuations_list = arabic_punctuations + english_punctuations
+     translator = str.maketrans('', '', punctuations_list)
+     return text.translate(translator)
+
+
+ def remove_repeating_char(text):
+     # return re.sub(r'(.)\1+', r'\1', text)  # keep only one repeat
+     return re.sub(r'(.)\1+', r'\1\1', text)  # keep at most two repeats
+
+ def remove_stop_words(text):
+     word_list = nltk.tokenize.wordpunct_tokenize(text.lower())
+     word_list = [w for w in word_list if w not in arabic_stop_words]
+     return (" ".join(word_list)).strip()
+
+
+
+ def remove_non_arabic_letters(text):
+     text = re.sub(r'([@A-Za-z0-9_]+)|#|http\S+', ' ', text)  # remove mentions, hashtags, URLs, Latin letters and digits
+     text = re.sub(r'ـــــــــــــ', '', text)  # remove tatweel (kashida) runs
+     return text
+
+
+
+
+ def clean_str(text):
+     text = remove_non_arabic_letters(text)
+     text = remove_punctuations(text)
+     text = remove_diacritics(text)
+     text = remove_repeating_char(text)
+     # text = remove_stop_words(text)
+
+     # Extract text from HTML tags, especially when dealing with data from X (Twitter)
+     soup = BeautifulSoup(text, 'lxml')
+     souped = soup.get_text()
+     pat1 = r'@[A-Za-z0-9]+'
+     pat2 = r'https?://[A-Za-z0-9./]+'
+     combined_pat = r'|'.join((pat1, pat2))
+     stripped = re.sub(combined_pat, '', souped)
+     try:
+         clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
+     except:
+         clean = stripped
+
+     words = tok.tokenize(clean)
+     return (" ".join(words)).strip()
+
+ """## **Applying preprocessing on our dataset**"""
+
+ print("Cleaning and parsing the training dataset...\n")
+
+ train_data["Text"] = train_data["Text"].apply(lambda x: clean_str(x))
+
+ train_data.head()
+
+ print("Cleaning and parsing the development dataset...\n")
+
+ dev_data["Text"] = dev_data["Text"].apply(lambda x: clean_str(x))
+
+ dev_data.head()
+
+ print("Cleaning and parsing the test dataset...\n")
+
+ test_data["Text"] = test_data["Text"].apply(lambda x: clean_str(x))
+
+ test_data.head()
+
+ label2id = {"NOT_OFF": 0, "OFF": 1}
+ id2label = {0: "NOT_OFF", 1: "OFF"}
+
+ train_data['label'] = train_data['label'].apply(lambda x: label2id[x])
+ train_data = train_data[["Text", "label"]]
+ train_data.head()
+
+ dev_data['label'] = dev_data['label'].apply(lambda x: label2id[x])
+ dev_data = dev_data[["Text", "label"]]
+ dev_data.head()
+
+ test_data['label'] = test_data['label'].apply(lambda x: label2id[x])
+ test_data = test_data[["Text", "label"]]
+ test_data
+
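+ # The training split is imbalanced (NOT_OFF tweets outnumber OFF tweets), so
+ # RandomOverSampler below duplicates minority-class rows until both labels are equally
+ # represented before fine-tuning.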
+ import pandas as pd
+ from imblearn.over_sampling import RandomOverSampler
+ from collections import Counter
+
+ X = train_data[['Text']]
+ y = train_data['label']
+
+ print('Original class distribution:', Counter(y))
+
+ ros = RandomOverSampler(random_state=42)
+
+ X_resampled, y_resampled = ros.fit_resample(X, y)
+
+ train_data_resampled = pd.DataFrame(X_resampled, columns=['Text'])
+ train_data_resampled['label'] = y_resampled
+
+ print('Resampled class distribution:', Counter(y_resampled))
+
+ y_resampled.value_counts()
+
+ train_data_resampled.head()
+
+ from sklearn.model_selection import train_test_split
+
+ X_train = train_data_resampled['Text'].values
+ y_train = train_data_resampled['label'].values
+
+ X_val = dev_data['Text'].values
+ y_val = dev_data['label'].values
+
+
+
+ print("Training data shape:", X_train.shape, y_train.shape)
+ print("Validation data shape:", X_val.shape, y_val.shape)
+
+ train_text_lengths = [len(text.split()) for text in X_train]
+ max_length = max(train_text_lengths)
+
+ print("Maximum length of text:", max_length)
+
+ """### APPLYING QARIB MODEL"""
+
+ !pip install transformers[torch]
+
+ import numpy as np
+
+ # to prepare the dataset and calculate metrics
+ from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
+
+ from transformers import AutoConfig, BertForSequenceClassification, AutoTokenizer
+ from transformers.data.processors import SingleSentenceClassificationProcessor, InputFeatures
+ from transformers import Trainer, TrainingArguments
+
+ train_df = pd.DataFrame({
+     'label': y_train,
+     'text': X_train
+ })
+
+ dev_df = pd.DataFrame({
+     'label': y_val,
+     'text': X_val
+ })
+
+ test_df = pd.DataFrame({
+     'label': test_data['label'],
+     'text': test_data['Text']
+ })
+
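+ # Common Arabic prefixes and suffixes; with "+" attached they form the never_split token
+ # list passed to the tokenizer below, so segmentation markers such as "ال+" and "+ها" are
+ # kept as single tokens. The "\uXXXX" entries duplicate the literal Arabic strings above
+ # them in escaped form.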
+ PREFIX_LIST = [
+     "ال",
+     "و",
+     "ف",
+     "ب",
+     "ك",
+     "ل",
+     "لل",
+     "\u0627\u0644",
+     "\u0648",
+     "\u0641",
+     "\u0628",
+     "\u0643",
+     "\u0644",
+     "\u0644\u0644",
+     "س",
+ ]
+ SUFFIX_LIST = [
+     "ه",
+     "ها",
+     "ك",
+     "ي",
+     "هما",
+     "كما",
+     "نا",
+     "كم",
+     "هم",
+     "هن",
+     "كن",
+     "ا",
+     "ان",
+     "ين",
+     "ون",
+     "وا",
+     "ات",
+     "ت",
+     "ن",
+     "ة",
+     "\u0647",
+     "\u0647\u0627",
+     "\u0643",
+     "\u064a",
+     "\u0647\u0645\u0627",
+     "\u0643\u0645\u0627",
+     "\u0646\u0627",
+     "\u0643\u0645",
+     "\u0647\u0645",
+     "\u0647\u0646",
+     "\u0643\u0646",
+     "\u0627",
+     "\u0627\u0646",
+     "\u064a\u0646",
+     "\u0648\u0646",
+     "\u0648\u0627",
+     "\u0627\u062a",
+     "\u062a",
+     "\u0646",
+     "\u0629",
+ ]
+
+
+ # the never_split list is used with the transformers library
+ _PREFIX_SYMBOLS = [x + "+" for x in PREFIX_LIST]
+ _SUFFIX_SYMBOLS = ["+" + x for x in SUFFIX_LIST]
+ NEVER_SPLIT_TOKENS = list(set(_PREFIX_SYMBOLS + _SUFFIX_SYMBOLS))
+
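+ # Load QARIB with a fresh 2-label classification head. output_attentions=True makes each
+ # forward pass return (logits, attentions), which is why compute_metrics reads
+ # p.predictions[0]; features are truncated/padded to 64 tokens.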
+ model_name = "qarib/bert-base-qarib"
+ num_labels = 2
+ config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, output_attentions=True)
+ tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                           do_lower_case=False,
+                                           do_basic_tokenize=True,
+                                           never_split=NEVER_SPLIT_TOKENS)
+ tokenizer.max_len = 64
+ model = BertForSequenceClassification.from_pretrained(model_name, config=config)
+
+ train_dataset = SingleSentenceClassificationProcessor(mode='classification')
+ dev_dataset = SingleSentenceClassificationProcessor(mode='classification')
+
+ train_dataset.add_examples(texts_or_text_and_labels=train_df['text'], labels=train_df['label'], overwrite_examples=True)
+ dev_dataset.add_examples(texts_or_text_and_labels=dev_df['text'], labels=dev_df['label'], overwrite_examples=True)
+ print(train_dataset.examples[0])
+
+ train_features = train_dataset.get_features(tokenizer=tokenizer, max_length=64)
+ dev_features = dev_dataset.get_features(tokenizer=tokenizer, max_length=64)
+ # print(config)
+
+ print(len(train_features))
+ print(len(dev_features))
+
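+ # Macro-averaged precision/recall/F1 are reported alongside accuracy so that the OFF and
+ # NOT_OFF classes count equally despite the class imbalance.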
+ def compute_metrics(p):  # p should be of type EvalPrediction
+     print(np.shape(p.predictions[0]))
+     print(np.shape(p.predictions[1]))
+     print(len(p.label_ids))
+     preds = np.argmax(p.predictions[0], axis=1)
+     assert len(preds) == len(p.label_ids)
+     print(classification_report(p.label_ids, preds))
+     print(confusion_matrix(p.label_ids, preds))
+
+     macro_f1 = f1_score(p.label_ids, preds, average='macro')
+     macro_precision = precision_score(p.label_ids, preds, average='macro')
+     macro_recall = recall_score(p.label_ids, preds, average='macro')
+     acc = accuracy_score(p.label_ids, preds)
+     return {
+         'macro_f1': macro_f1,
+         'macro_precision': macro_precision,
+         'macro_recall': macro_recall,
+         'accuracy': acc
+     }
+
+ !mkdir train
+ training_args = TrainingArguments("./train")
+ training_args.do_train = True
+ training_args.evaluate_during_training = True
+ training_args.adam_epsilon = 1e-8
+ training_args.learning_rate = 2e-5
+ training_args.warmup_steps = 0
+ training_args.per_device_train_batch_size = 64  # increase batch size
+ training_args.per_device_eval_batch_size = 64   # increase batch size
+ training_args.num_train_epochs = 2              # reduce number of epochs
+ training_args.logging_steps = 300               # increase logging steps
+ training_args.save_steps = 2000                 # increase save steps
+ training_args.seed = 42
+ print(training_args.logging_steps)
+
+ # instantiate trainer
+ trainer = Trainer(model=model,
+                   args=training_args,
+                   train_dataset=train_features,
+                   eval_dataset=dev_features,
+                   compute_metrics=compute_metrics)
+ # start training
+ trainer.train()
+
+ trainer.evaluate()
+
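+ # Pre-trained Arabic fastText vectors from the Hugging Face Hub; they are used further down
+ # to rank candidate replacement words by cosine similarity when perturbing tweets.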
+ !pip install fasttext
+ import fasttext
+ import fasttext.util
+ from huggingface_hub import hf_hub_download
+
+ model_path = hf_hub_download(repo_id="facebook/fasttext-ar-vectors", filename="model.bin")
+ # model_path = "./fasttext-ar-vectors-150.bin"
+ model_fasttext = fasttext.load_model(model_path)
+ # model_fasttext = fasttext.util.reduce_model(model_fasttext, 150)  # reduce embeddings dimension to 150 from 300; requires a huge-memory notebook
+ # model_fasttext.save_model("/content/drive/MyDrive/Colab Notebooks/text-aml/hate-speech-ds/fasttext-ar-vectors-150.bin")
+ print(len(model_fasttext.words))
+ model_fasttext['bread'].shape
+
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import WordPunctTokenizer
+ from nltk.stem.isri import ISRIStemmer
+ import string
+ import re
+ from bs4 import BeautifulSoup
+ nltk.download('stopwords')
+
+
+ tok = WordPunctTokenizer()
+
+ def normalize_arabic(text):
+     text = re.sub("[إأآا]", "ا", text)
+     text = re.sub("ى", "ي", text)
+     text = re.sub("ؤ", "ء", text)
+     text = re.sub("ئ", "ء", text)
+     text = re.sub("ة", "ه", text)
+     text = re.sub("گ", "ك", text)
+     return text
+
+
+ def remove_diacritics(text):
+     arabic_diacritics = re.compile("""
+         ّ | # Tashdid
+         َ | # Fatha
+         ً | # Tanwin Fath
+         ُ | # Damma
+         ٌ | # Tanwin Damm
+         ِ | # Kasra
+         ٍ | # Tanwin Kasr
+         ْ | # Sukun
+         ـ   # Tatwil/Kashida
+     """, re.VERBOSE)
+     return re.sub(arabic_diacritics, '', text)
+
+
+ def remove_punctuations(text):
+     arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
+     english_punctuations = string.punctuation
+     punctuations_list = arabic_punctuations + english_punctuations
+     translator = str.maketrans('', '', punctuations_list)
+     return text.translate(translator)
+
+
+ def remove_repeating_char(text):
+     # return re.sub(r'(.)\1+', r'\1', text)  # keep only one repeat
+     return re.sub(r'(.)\1+', r'\1\1', text)  # keep at most two repeats
+
+ def remove_stop_words(text):
+     # nltk.download('stopwords')
+     englishStopWords = stopwords.words('english')
+
+     all_stopwords = set(englishStopWords + arabic_stop_words)
+
+     word_list = nltk.tokenize.wordpunct_tokenize(text.lower())
+     word_list = [w for w in word_list if w not in all_stopwords]
+     return (" ".join(word_list)).strip()
+
+ def get_root(text):
+     word_list = nltk.tokenize.wordpunct_tokenize(text.lower())
+     result = []
+     arstemmer = ISRIStemmer()
+     for word in word_list: result.append(arstemmer.stem(word))
+     return (' '.join(result)).strip()
+
+ def clean_tweet(text):
+     text = re.sub(r'([@A-Za-z0-9_]+)|#|http\S+', ' ', text)  # remove mentions, hashtags, URLs, Latin letters and digits
+     text = re.sub(r'ـــــــــــــ', '', text)  # remove tatweel (kashida) runs
+     return text
+
+
+
+
+ def clean_str(text):
+     text = clean_tweet(text)
+     # text = normalize_arabic(text)
+     text = remove_punctuations(text)   ###
+     text = remove_diacritics(text)
+     text = remove_repeating_char(text) ###
+     # text = remove_stop_words(text)   ###
+
+
+     text = text.replace('وو', 'و')  ###
+     text = text.replace('يي', 'ي')  ###
+     text = text.replace('اا', 'ا')  ###
+
+     # text = get_root(text)  ###
+
+     soup = BeautifulSoup(text, 'lxml')
+     souped = soup.get_text()
+     pat1 = r'@[A-Za-z0-9]+'
+     pat2 = r'https?://[A-Za-z0-9./]+'
+     combined_pat = r'|'.join((pat1, pat2))
+     stripped = re.sub(combined_pat, '', souped)
+     try:
+         clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
+     except:
+         clean = stripped
+
+     words = tok.tokenize(clean)
+     return (" ".join(words)).strip()
+
+ !gdown "165kzfZDsRTZAAfZKedeZiUlKzMcHNgPd" # arabic stop words
+ !gdown "1WdgbvqDYIa-g5ijjsz5zb-3lVvUXUtmS&confirm=t" # qarib pretrained model
+ !gdown "1foNTGFjhWAxS-_SfF7rga80UmFT7BDJ0&confirm=t" # fasttext-ar-vectors-150.bin
+
+ !pip install pyarabic
+ !pip install farasapy
+ !pip install transformers[torch]
+ !pip install Keras-Preprocessing
+
+ !git clone https://github.com/facebookresearch/fastText.git
+ !cd fastText && sudo pip install .
+
+ from transformers import pipeline
+ unmasker_MARBERT = pipeline('fill-mask', model='UBC-NLP/MARBERT', top_k=50)
+
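+ # light_preprocess applies the same noise removal as clean_str but skips the HTML stripping
+ # and re-tokenization, so the masked sentences passed to the MARBERT fill-mask pipeline stay
+ # close to natural tweet text.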
+ def light_preprocess(text):
+     text = clean_tweet(text)
+     # text = normalize_arabic(text)
+     text = remove_punctuations(text)   ###
+     text = remove_diacritics(text)
+     text = remove_repeating_char(text) ###
+     text = text.replace('وو', 'و')  ###
+     text = text.replace('يي', 'ي')  ###
+     text = text.replace('اا', 'ا')  ###
+     return text
+
+ nltk.download('stopwords')
+ englishStopWords = stopwords.words('english')
+ arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
+ english_punctuations = string.punctuation
+ punctuations_list = arabic_punctuations + english_punctuations
+
+ all_stopwords = set(englishStopWords + arabic_stop_words)
+
+ !pip install torch  # Install the PyTorch library if you haven't already
+
+ import torch
+ # Determine if a GPU is available and set the device accordingly
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
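+ # classsify_tweets cleans a batch of tweets, converts them to input IDs and attention masks
+ # with the QARIB tokenizer, and returns the model's logits as a NumPy array.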
+ def classsify_tweets(tweet):
+     df = pd.DataFrame({"tweet": tweet})
+     df['clean_tweet'] = df['tweet'].apply(lambda x: clean_str(x))
+
+     dev_df = pd.DataFrame({
+         'id': range(len(df)),
+         'text': df["clean_tweet"]
+     })
+
+     test_example = SingleSentenceClassificationProcessor(mode='classification')
+     test_example.add_examples(texts_or_text_and_labels=dev_df['text'], overwrite_examples=True)
+
+     test_features = test_example.get_features(tokenizer=tokenizer, max_length=64)
+
+     input_ids = [i.input_ids for i in test_features]
+     attention_masks = [i.attention_mask for i in test_features]
+
+     inputs = torch.tensor(input_ids)
+     masks = torch.tensor(attention_masks)
+
+     # Put the model in an evaluation state
+     model.eval()
+
+     # Transfer model to GPU
+     model.to(device)
+
+     torch.cuda.empty_cache()  # empty the gpu memory
+     # Transfer the batch to gpu
+     inputs = inputs.to(device)
+     masks = masks.to(device)
+
+     # Run inference on the example
+     output = model(inputs, attention_mask=masks)["logits"]
+     # Transfer the output to CPU again and convert to numpy
+     output = output.cpu().detach().numpy()
+
+     return output
+
+ size = len(test_data)
+ print("size of test set:", size)
+ correct_class_tweets = []
+ correct_class = []
+ for i in range(0, size):
+     txt = test_data['Text'].astype('U')[i]
+     cls = test_data['label'][i]
+     label = id2label[np.argmax(classsify_tweets([txt]), axis=1)[0]]
+     if label == id2label[cls] and label == "OFF":
+         correct_class_tweets.append(txt)
+         correct_class.append(cls)
+
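+ # Adversarial word replacement: for each position in a correctly classified offensive tweet,
+ # MARBERT's fill-mask pipeline proposes candidates, candidates sharing the Farasa stem with
+ # the original word (or that are punctuation / out of range) are skipped, and the candidate
+ # whose fastText embedding is most cosine-similar to the original word is kept.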
+ from scipy.spatial import distance
+ from farasa.stemmer import FarasaStemmer
+ frasa_stemmer = FarasaStemmer(interactive=True)
+
+ !pip install emoji
+
+ import emoji
+
+ def select_best_replacement(pos, x_cur, verbose=False):
+     """Select the most effective replacement for the word at position (pos) in (x_cur)."""
+
+     if bool(emoji.emoji_count(x_cur.split()[pos])):
+         return None
+
+     embedding_masked_word = model_fasttext[x_cur.split()[pos]]
+
+     x_masked = (" ".join(x_cur.split()[:pos]) + " [MASK] " + " ".join(x_cur.split()[pos + 1:])).strip()
+     unmasked_seq = unmasker_MARBERT(x_masked)[:20]
+
+     max_sim = -1
+     best_perturb_dict = {}
+     for seq in unmasked_seq:
+         if frasa_stemmer.stem(seq['token_str']) in frasa_stemmer.stem(x_cur.split()[pos]):
+             continue
+         if seq['token_str'] in punctuations_list or pos >= len(seq["sequence"].split()):
+             continue
+         embedding_masked_word_new = model_fasttext[seq['token_str']]
+         if np.sum(embedding_masked_word) == 0 or np.sum(embedding_masked_word_new) == 0:
+             continue
+         if verbose: print("New word: ", seq['token_str'])
+         sim = 1 - distance.cosine(embedding_masked_word, embedding_masked_word_new)
+         if sim > max_sim:
+             max_sim = sim
+             best_perturb_dict["sim"] = sim
+             best_perturb_dict["Masked word"] = x_cur.split()[pos]
+             best_perturb_dict["New word"] = seq['token_str']
+             best_perturb_dict["New seq"] = x_cur.replace(x_cur.split()[pos], seq['token_str'])
+
+     return best_perturb_dict.get("New seq", None)
+
+ # Process tweets and perturb
+ perturb_counter = 0
+ for tweet_ix, tweet in enumerate(correct_class_tweets):
+     print("Tweet index: ", tweet_ix)
+
+     x_adv = light_preprocess(tweet)
+     x_len = len(x_adv.split())
+     orig_class = np.argmax(classsify_tweets([x_adv]), axis=1)[0]
+     orig_label = id2label[orig_class]
+     print(f"Original tweet: {x_adv} : Original label: {orig_label}.")
+     splits = len(x_adv.split())
+     perturbed_flag = False
+     for split_ix in range(splits):
+         perturbed = select_best_replacement(split_ix, x_adv)
+         if perturbed:
+             new_class = np.argmax(classsify_tweets([perturbed]), axis=1)[0]
+             if orig_class != new_class:
+                 print(f"Perturbed tweet: {perturbed} : New label: {id2label[new_class]}.")
+                 print(10 * "==")
+                 if not perturbed_flag:
+                     perturb_counter += 1
+                     perturbed_flag = True
+     if not perturbed_flag:
+         print(10 * "==")
+ print(f"Successful perturbation {perturb_counter} out of {len(correct_class_tweets)}.")
+
+ off_tweets_count = sum(test_data['label'] == 1)
+ print(f"Number of offensive tweets in the dataset: {off_tweets_count}")
+
+ size = len(test_data)
+ print("size of test set:", size)
+ correct_class_tweets = []
+ correct_class = []
+ for i in range(0, size):
+     txt = test_data['Text'].astype('U')[i]
+     cls = test_data['label'][i]
+     label = id2label[np.argmax(classsify_tweets([txt]), axis=1)[0]]
+     print(f"Tweet: {txt} | Actual: {cls} | Predicted: {label}")
+     if label == id2label[cls] and label == "OFF":
+         correct_class_tweets.append(txt)
+         correct_class.append(cls)
+         print(f"Correctly classified as OFF: {txt}")
+
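+ # The remaining cells build Gradio demos around the classifier: a single-model interface for
+ # qarib/bert-base-qarib, then a variant that lets the user choose between QARIB and
+ # bert-base-multilingual-cased from a dropdown.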
+ !pip install gradio
+
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+ # Load the model and tokenizer
+ model_name = "qarib/bert-base-qarib"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+
+ # Preprocessing function
+ def light_preprocess(text):
+     text = text.replace("@USER", "").replace("RT", "").strip()
+     return text
+
+ # Prediction function
+ def predict_offensive(text):
+     preprocessed_text = light_preprocess(text)
+     inputs = tokenizer(preprocessed_text, return_tensors="pt", truncation=True, padding=True)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     logits = outputs.logits
+     predicted_class = torch.argmax(logits, dim=1).item()
+     return "Offensive" if predicted_class == 1 else "Not Offensive"
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=predict_offensive,
+     inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
+     outputs="text",
+     title="Offensive Language Detection",
+     description="Enter a text to check if it's offensive or not.",
+ )
+
+ # Launch the interface
+ iface.launch()
+
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+ # Load the models and tokenizers
+ model_name_1 = "qarib/bert-base-qarib"
+ model_name_2 = "bert-base-multilingual-cased"
+ tokenizer_1 = AutoTokenizer.from_pretrained(model_name_1)
+ model_1 = AutoModelForSequenceClassification.from_pretrained(model_name_1, num_labels=2)
+
+ tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2)
+ model_2 = AutoModelForSequenceClassification.from_pretrained(model_name_2, num_labels=2)
+
+ # Preprocessing function
+ def light_preprocess(text):
+     text = text.replace("@USER", "").replace("RT", "").strip()
+     return text
+
+ # Prediction function
+ def predict_offensive(text, model_choice):
+     if model_choice == "Model 1":
+         tokenizer = tokenizer_1
+         model = model_1
+     else:
+         tokenizer = tokenizer_2
+         model = model_2
+
+     preprocessed_text = light_preprocess(text)
+     inputs = tokenizer(preprocessed_text, return_tensors="pt", truncation=True, padding=True)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     logits = outputs.logits
+     predicted_class = torch.argmax(logits, dim=1).item()
+     return "Offensive" if predicted_class == 1 else "Not Offensive"
+
+ # Create the Gradio interface with a modern theme
+ iface = gr.Interface(
+     fn=predict_offensive,
+     inputs=[
+         gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
+         gr.Dropdown(choices=["Model 1", "Model 2"], label="Select Model")
+     ],
+     outputs=gr.Textbox(label="Prediction"),
+     title="Offensive Language Detection",
+     description="Enter a text to check if it's offensive or not using the selected model.",
+     theme="default",  # Use "dark" for dark mode
+     css=".gradio-container { background-color: #f0f0f0; } .output-textbox { font-size: 20px; color: #007BFF; }"
+ )
+
+ # Launch the interface
+ iface.launch()
+
+ !pip install gradio
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+ # Load the models and tokenizers
+ model_name_1 = "qarib/bert-base-qarib"
+ model_name_2 = "bert-base-multilingual-cased"
+ tokenizer_1 = AutoTokenizer.from_pretrained(model_name_1)
+ model_1 = AutoModelForSequenceClassification.from_pretrained(model_name_1, num_labels=2)
+
+ tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2)
+ model_2 = AutoModelForSequenceClassification.from_pretrained(model_name_2, num_labels=2)
+
+ # Preprocessing function
+ def light_preprocess(text):
+     text = text.replace("@USER", "").replace("RT", "").strip()
+     return text
+
+ # Prediction function
+ def predict_offensive(text, model_choice):
+     if model_choice == "Model 1":
+         tokenizer = tokenizer_1
+         model = model_1
+     else:
+         tokenizer = tokenizer_2
+         model = model_2
+
+     preprocessed_text = light_preprocess(text)
+     inputs = tokenizer(preprocessed_text, return_tensors="pt", truncation=True, padding=True)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     logits = outputs.logits
+     predicted_class = torch.argmax(logits, dim=1).item()
+     return "Offensive" if predicted_class == 1 else "Not Offensive"
+
+ # Create the Gradio interface using the Text Classification template
+ iface = gr.Interface(
+     fn=predict_offensive,
+     inputs=[
+         gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
+         gr.Dropdown(choices=["Model 1", "Model 2"], label="Select Model")
+     ],
+     outputs=gr.Textbox(label="Prediction"),
+     title="Offensive Language Detection",
+     description="Enter a text to check if it's offensive or not using the selected model.",
+     theme="default",  # Change to "dark" for dark mode
+ )
+
+ # Launch the interface
+ iface.launch()