gamza committed on
Commit
d37b5f8
•
1 Parent(s): f76fcc8

Upload 2 files

Files changed (2)
  1. bert_classification(0.87).pt +3 -0
  2. utils.py +156 -0
bert_classification(0.87).pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5847ffe4878ab7c57a22308310065d10fb603d2817eaffe12af543832cbd3610
+ size 711548626
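The pointer above resolves, after `git lfs pull`, to a ~711 MB PyTorch checkpoint. A minimal loading sketch, assuming the file was saved as a full pickled model with `torch.save(model, ...)` rather than as a `state_dict` (the save format is not shown in this commit):

import torch

# Assumption: the checkpoint is a full pickled model object; if it is a
# state_dict instead, instantiate the architecture first and call load_state_dict().
model = torch.load("bert_classification(0.87).pt", map_location="cuda")
model.eval()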
utils.py ADDED
@@ -0,0 +1,156 @@
+ # -*- coding: utf-8 -*-
+ """utils(2).ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1snWVRieogxGIRp-UsTCZWjLM5ir5KQxB
+ """
+
+ import re
+ import nltk
+ import torch
+ import numpy as np
+
+ from nltk.tokenize import TweetTokenizer
+ from nltk.stem import WordNetLemmatizer
+ from nltk.corpus import stopwords
+ from nltk.corpus import wordnet
+ from transformers import BertTokenizer
+ from keras.preprocessing.sequence import pad_sequences
+
+ nltk.download('stopwords')
+
+ stopword_list = nltk.corpus.stopwords.words('english')
+ stopword_list.remove('no')
+ stopword_list.remove('not')
+
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+ nltk.download('wordnet')
+
+ tokenizer = TweetTokenizer()
+ lemmatizer = WordNetLemmatizer()
+ tokenizer_B = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
+
+ device = torch.device("cuda")
+
+ # Use the wordnet module to look up a word's part of speech (POS)
+ def get_wordnet_pos(word):
+     """Map POS tag to first character lemmatize() accepts"""
+     tag = nltk.pos_tag([word])[0][1][0].upper()
+     tag_dict = {"J": wordnet.ADJ,   # adjective
+                 "N": wordnet.NOUN,  # noun
+                 "V": wordnet.VERB,  # verb
+                 "R": wordnet.ADV}   # adverb
+
+     return tag_dict.get(tag, wordnet.NOUN)
+
+ # Preprocessing function
+ def pre_data(data):
+
+     # Lowercase and strip surrounding whitespace
+     df2 = data.lower().strip()
+
+     # Tokenize with TweetTokenizer
+     df_token = tokenizer.tokenize(df2)
+
+     # Drop @mentions
+     df_IDdel = []
+     for word in df_token:
+         if '@' not in word:
+             df_IDdel.append(word)
+
+     # Rejoin into a single sentence
+     df_IDdel_sen = ' '.join(df_IDdel)
+
+     # Replace non-alphabetic characters with spaces
+     df_eng = re.sub("[^a-zA-Z]", " ", df_IDdel_sen)
+
+     # Collapse repeated characters (at most two in a row are kept)
+     df_rep_list = []
+     for i, e in enumerate(df_eng):
+         if i > 1 and e == df_eng[i - 2] and e == df_eng[i - 1]:
+             df_rep_list.append('')
+         else:
+             df_rep_list.append(e)
+     df_rep = ''.join(df_rep_list)
+     # Collapse consecutive whitespace
+     df_rep = re.sub(r'\s+', ' ', df_rep)
+
+     # Lemmatize each token
+     df_lemma = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(df_rep)]
+
+     # Remove stopwords
+     df_clean = [w for w in df_lemma if not w in stopword_list]
+
+     if len(df_clean) == 0:
+         df_clean = 'NC'  # NC = No Category: placeholder string used when no meaningful words remain
+     else:
+         df_clean = ' '.join(df_clean)
+
+     return df_clean
+
+ # Convert input sentences into model inputs
+ def convert_input_data(sentences):
+
+     # Split each sentence into tokens with the BERT tokenizer
+     tokenized_texts = [tokenizer_B.tokenize(sent) for sent in sentences]
+
+     # Maximum sequence length of the input tokens
+     MAX_LEN = 80
+
+     # Convert tokens to numeric indices
+     input_ids = [tokenizer_B.convert_tokens_to_ids(x) for x in tokenized_texts]
+
+     # Truncate sentences to MAX_LEN and fill the remainder with 0-padding
+     input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
+
+     # Initialize the attention masks
+     attention_masks = []
+
+     # Set the attention mask to 1 for real tokens and 0 for padding;
+     # BERT skips attention over padded positions, which speeds things up
+     for seq in input_ids:
+         seq_mask = [float(i > 0) for i in seq]
+         attention_masks.append(seq_mask)
+
+     # Convert the data to PyTorch tensors
+     inputs = torch.tensor(input_ids)
+     masks = torch.tensor(attention_masks)
+
+     return inputs, masks
+
+ # Run sentences through the model
+ def test_sentences(sentences, load_model):
+
+     # Convert the sentences into model inputs
+     inputs, masks = convert_input_data(sentences)
+
+     # Move the data to the GPU
+     b_input_ids = inputs.to(device)
+     b_input_mask = masks.to(device)
+
+     # Disable gradient computation
+     with torch.no_grad():
+         # Forward pass
+         outputs = load_model(b_input_ids,
+                              token_type_ids=None,
+                              attention_mask=b_input_mask)
+
+     # Get the logits
+     logits = outputs[0]
+
+     # Move the data to the CPU
+     logits = logits.detach().cpu().numpy()
+
+     return logits
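Taken together with the checkpoint above, a hedged end-to-end sketch of how these helpers appear intended to be used. The raw tweet text and the argmax label decoding are illustrative assumptions; the number and meaning of the classes are not shown in this commit, and a CUDA GPU is assumed because utils.py hard-codes device = torch.device("cuda").

import numpy as np
import torch

from utils import pre_data, test_sentences  # the module added in this commit

# Assumption: the LFS checkpoint holds a full pickled classification model.
model = torch.load("bert_classification(0.87).pt")
model.cuda()
model.eval()

raw = "@someone I did not like this movie at all!!!"
clean = pre_data(raw)                     # lowercase, strip mentions/punctuation/stopwords, lemmatize
logits = test_sentences([clean], model)   # numpy array of shape (1, num_labels)
pred = int(np.argmax(logits, axis=1)[0])  # predicted class index (label meanings are not defined in this commit)
print(pred)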