import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import textacy
from sklearn.feature_extraction.text import CountVectorizer
import csv
import re

nltk.download('stopwords')
nltk.download('punkt')

stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')


def remove_html_markup(s):
    """Strip HTML tags from a string, keeping only the text content."""
    tag = False
    quote = False
    out = ""
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out += c
    return out


def remove_URL(s):
    """Remove URLs from a sample string."""
    return re.sub(r"http\S+", "", s)


def eliminar_puntuacion(articulo):
    """Remove punctuation and other special symbols from an article."""
    deletion_symbols = ['!', '(', ')', "'", '-', '[', ']', '{', '}', ';', ':',
                        '"', '“', '’', '”', '`', '‘', '``', '\\', '/', '|',
                        ',', '<', '>', '.', '..', '...', '?', '@', '#', '$',
                        '^', '&', '*', '_', '~', '+', '%', '=', '¿', '¡', "''"]
    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo


def remove_emoji(s):
    """Remove emoji characters from a string."""
    regex_pattern = re.compile(
        pattern="["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                "]+",
        flags=re.UNICODE)
    return regex_pattern.sub(r'', s)


def remover_casos_especiales(s):
    # Remove leading text that ends with ".-", since it is usually a dateline
    # used to open some articles (the dot is escaped so it matches a literal
    # '.' rather than any character)
    s = re.sub(r'^\w+(,)*([\s]\w+)*([\s]\(\w+\))*\.-', '', s)
    return s


def frases_remover(s):
    """Remove boilerplate phrases that news sites append to scraped articles."""
    lista_frases_remover = ['La entrada', 'la entrada', '(Seguir leyendo…)',
                            'se publicó primero en',
                            'Remolacha - Noticias Republica Dominicana',
                            'Read more ›', 'Read more', '[…]', 'RELACIONADAS']
    for frase in lista_frases_remover:
        s = s.replace(frase, '')
    return s


def eliminar_stopwords(articulo):
    """Remove Spanish stopwords using NLTK's stopword corpus."""
    new_articulo = ""
    for x in articulo.split():
        if x not in stopwords_es:
            new_articulo += " " + x
    return new_articulo


def obtener_raices(articulo):
    """Reduce each word to its root with the Spanish Snowball stemmer."""
    new_articulo = ""
    for x in articulo.split():
        new_articulo += " " + spanish_stemmer.stem(x)
    return new_articulo


def limpieza_articulos(df):
    """Clean the article titles and return their vocabulary of unique tokens."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    # Lowercase the text
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
    # Remove punctuation marks
    df_titulos['titulo'] = df_titulos['titulo'].apply(eliminar_puntuacion)
    # Remove stopwords using NLTK's corpus for Spanish
    df_titulos['titulo'] = df_titulos['titulo'].apply(eliminar_stopwords)
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab


def obtener_kpes(df):
    """Extract the top 10 key terms from the titles with textacy's TextRank."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    all_text = '. '.join(df_titulos['titulo'])
    titulos = textacy.make_spacy_doc(all_text, lang='es_core_news_sm')
    return textacy.extract.keyterms.textrank(titulos, normalize='lower', topn=10)
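
# --- Usage sketch (illustrative; the input file name is an assumption) ---
# A minimal example of driving the pipeline above, assuming the scraped
# articles live in a hypothetical "articulos.csv" with a 'titulo' column.
# obtener_kpes() also requires the spaCy Spanish model to be installed:
#   python -m spacy download es_core_news_sm
if __name__ == '__main__':
    df = pd.read_csv('articulos.csv')  # hypothetical input file
    # Apply the row-level cleaners to each title before analysis
    df['titulo'] = (df['titulo']
                    .apply(remove_html_markup)
                    .apply(remove_URL)
                    .apply(remove_emoji)
                    .apply(frases_remover)
                    .apply(remover_casos_especiales))
    # Vocabulary of lowercased, punctuation- and stopword-free tokens
    vocab = limpieza_articulos(df)
    print(f"Vocabulary size: {len(vocab)}")
    # Top 10 key terms ranked by TextRank, as (term, score) pairs
    for term, score in obtener_kpes(df):
        print(f"{term}: {score:.4f}")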