# preprocesamiento_articulos.py
# Text preprocessing helpers for newspaper articles
# (buscador-periodicos-dominicanos).
import re

import nltk
import numpy as np
import pandas as pd
import textacy
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Download the NLTK resources needed for stopword removal and tokenization.
nltk.download('stopwords')
nltk.download('punkt')

# Spanish stopword list and stemmer, shared by the helpers below.
stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')

def remove_html_markup(s):
    """Strip HTML tags from a string using a small tag/quote state machine."""
    tag = False
    quote = False
    out = ""
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            # Toggle quote state so '<' and '>' inside attribute values are ignored.
            quote = not quote
        elif not tag:
            out = out + c
    return out
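
# A quick illustration of the state machine above (the input string is a
# made-up example, not project data): a quoted '>' inside an attribute does
# not close the tag.
#   remove_html_markup('<a href="x>y">link</a>')  ->  'link'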

def remove_URL(s):
    """Remove URLs from a sample string."""
    return re.sub(r"http\S+", "", s)

def eliminar_puntuacion(articulo):
    """Remove punctuation and other special symbols, character by character."""
    deletion_symbols = ['!', '(', ')', "'", '-', '[', ']', '{', '}', ';', ':', '"', '“', '’', '”',
                        '`', '‘', '``', '\\', '/', '|', ',', '<', '>', '.', '..', '...', '?', '@',
                        '#', '$', '^', '&', '*', '_', '~', '+', '%', '=', '¿', '¡', "''"]
    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo

def remove_emoji(s):
    """Remove emoji characters using Unicode block ranges."""
    regex_pattern = re.compile(pattern="["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', s)

def remover_casos_especiales(s):
    # Remove leading text that ends with ".-" (e.g. a dateline such as a city
    # and wire-agency name), since it is usually boilerplate used to open some
    # articles. The dot is escaped so only a literal ".-" terminates the match.
    s = re.sub(r'^\w+(,)*([\s]\w+)*([\s]\(\w+\))*\.-', '', s)
    return s
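
# Hypothetical example of the dateline removal above:
#   remover_casos_especiales('SANTO DOMINGO (EFE).- El presidente anunció...')
#   ->  ' El presidente anunció...'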

def frases_remover(s):
    """Remove boilerplate phrases that some feeds append to article text."""
    lista_frases_remover = ['La entrada', 'la entrada', '(Seguir leyendo…)', 'se publicó primero en',
                            'Remolacha - Noticias Republica Dominicana', 'Read more ›', 'Read more',
                            '[…]', 'RELACIONADAS']
    for l in lista_frases_remover:
        s = s.replace(l, '')
    return s

def eliminar_stopwords(articulo):
    """Drop Spanish stopwords, keeping the remaining words space-separated."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        if x not in stopwords_es:
            new_articulo += " " + x
    return new_articulo

def obtener_raices(articulo):
    """Replace each word with its stem, using the Spanish Snowball stemmer."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        x_new = spanish_stemmer.stem(x)
        new_articulo += " " + x_new
    return new_articulo
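
# The cleaners above are standalone; a full pass over raw article bodies would
# chain them. This is an illustrative sketch (the name limpiar_texto is ours,
# not part of the original pipeline), assuming raw HTML-ish article text:
def limpiar_texto(texto):
    texto = remove_html_markup(texto)
    texto = remove_URL(texto)
    texto = remove_emoji(texto)
    texto = remover_casos_especiales(texto)
    texto = frases_remover(texto)
    texto = texto.lower()
    texto = eliminar_puntuacion(texto)
    texto = eliminar_stopwords(texto)
    return obtener_raices(texto)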

def limpieza_articulos(df):
    """Build the vocabulary of unique tokens from the cleaned article titles."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    # Lowercase the text
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
    # Remove punctuation marks
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_puntuacion(x))
    # Remove stopwords, using the NLTK corpus for Spanish
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_stopwords(x))
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab

def obtener_kpes(df):
    """Extract the top 10 key terms from the titles with TextRank (via textacy)."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    all_text = '. '.join(df_titulos['titulo'])
    # Requires the spaCy model es_core_news_sm to be installed.
    titulos = textacy.make_spacy_doc(all_text, lang='es_core_news_sm')
    return textacy.extract.keyterms.textrank(titulos, normalize='lower', topn=10)
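
# Minimal usage sketch, assuming a DataFrame with a 'titulo' column as the
# functions above expect. The sample titles are invented, and obtener_kpes
# needs the spaCy model es_core_news_sm installed.
if __name__ == "__main__":
    df = pd.DataFrame({'titulo': [
        'Gobierno anuncia nuevas medidas económicas',
        'Lluvias provocan inundaciones en Santo Domingo',
    ]})
    print(limpieza_articulos(df))  # vocabulary of cleaned title tokens
    print(obtener_kpes(df))        # [(term, score), ...] from TextRank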