Twitter_nlp / utils.py
# -*- coding: utf-8 -*-
"""utils(2).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1snWVRieogxGIRp-UsTCZWjLM5ir5KQxB
"""
import re
import nltk
import torch
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
tokenizer_B = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
# Use the GPU when one is available; fall back to CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Map a word to its WordNet part-of-speech (POS) tag so the lemmatizer can use it
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,   # adjective
                "N": wordnet.NOUN,  # noun
                "V": wordnet.VERB,  # verb
                "R": wordnet.ADV}   # adverb
    return tag_dict.get(tag, wordnet.NOUN)
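# Illustrative usage (a sketch added for clarity, not part of the original module;
# the tags come from NLTK's averaged perceptron tagger, so treat the results as
# assumptions rather than guarantees):
#   get_wordnet_pos("running")                              # typically wordnet.VERB
#   lemmatizer.lemmatize("cars", get_wordnet_pos("cars"))   # typically "car"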
# Preprocessing function
def pre_data(data):
    # Lowercase and strip surrounding whitespace
    df2 = data.lower().strip()
    # Tokenize with TweetTokenizer
    df_token = tokenizer.tokenize(df2)
    # Drop @mentions
    df_IDdel = []
    for word in df_token:
        if '@' not in word:
            df_IDdel.append(word)
    # Join the remaining tokens back into a sentence
    df_IDdel_sen = ' '.join(df_IDdel)
    # Replace non-alphabetic characters with spaces
    df_eng = re.sub("[^a-zA-Z]", " ", df_IDdel_sen)
    # Collapse repeated letters (keep at most two in a row)
    df_rep_list = []
    for i, e in enumerate(df_eng):
        if i > 1 and e == df_eng[i - 2] and e == df_eng[i - 1]:
            df_rep_list.append('')
        else:
            df_rep_list.append(e)
    df_rep = ''.join(df_rep_list)
    # Collapse consecutive whitespace
    df_rep = re.sub(r'\s+', ' ', df_rep)
    # Lemmatize each token using its WordNet POS tag
    df_lemma = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(df_rep)]
    # Remove stopwords
    df_clean = [w for w in df_lemma if w not in stopword_list]
    if len(df_clean) == 0:
        df_clean = 'NC'  # NC = "No Category": placeholder used when no meaningful words remain
    else:
        df_clean = ' '.join(df_clean)
    return df_clean
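# Illustrative call (a sketch for clarity, not original code; the exact output
# depends on the NLTK resources downloaded above, so treat it as an assumption):
#   pre_data("@friend this is sooo good!!!")
#   # expected to resemble "soo good": the mention, punctuation and stopwords
#   # are removed, and repeated letters are capped at two.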
# Convert raw sentences into BERT input tensors
def convert_input_data(sentences):
    # Split each sentence into tokens with the BERT tokenizer
    tokenized_texts = [tokenizer_B.tokenize(sent) for sent in sentences]
    # Maximum input sequence length
    MAX_LEN = 80
    # Convert tokens to their vocabulary indices
    input_ids = [tokenizer_B.convert_tokens_to_ids(x) for x in tokenized_texts]
    # Truncate each sentence to MAX_LEN and pad the remainder with zeros
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    # Attention masks
    attention_masks = []
    # Set the mask to 1 for real tokens and 0 for padding;
    # BERT skips attention over padded positions, which speeds things up
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    # Convert the data to PyTorch tensors
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    return inputs, masks
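# Illustrative shapes (a sketch, not original code): for a single sentence,
# convert_input_data returns an input-id tensor of shape (1, 80) padded with
# zeros and a matching attention-mask tensor holding 1.0 over real tokens and
# 0.0 over the padding.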
# Run a trained model on a list of sentences and return the logits
def test_sentences(sentences, load_model):
    # Convert the sentences into model inputs
    inputs, masks = convert_input_data(sentences)
    # Move the data to the selected device
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
    # Disable gradient computation for inference
    with torch.no_grad():
        # Forward pass
        outputs = load_model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask)
    # Extract the logits
    logits = outputs[0]
    # Move the logits back to the CPU as a NumPy array
    logits = logits.detach().cpu().numpy()
    return logits
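# Minimal end-to-end sketch (added for illustration, not part of the original
# module). It assumes a fine-tuned BertForSequenceClassification checkpoint is
# available at "./model"; the path and the meaning of the predicted class are
# placeholders, not the project's actual setup.
if __name__ == "__main__":
    from transformers import BertForSequenceClassification

    model = BertForSequenceClassification.from_pretrained("./model")  # hypothetical checkpoint path
    model.to(device)
    model.eval()

    cleaned = pre_data("@friend this is sooo good!!!")
    logits = test_sentences([cleaned], model)
    print("predicted class:", int(np.argmax(logits, axis=1)[0]))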