import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import csv
import re

nltk.download('stopwords')
nltk.download('punkt')

stopwords_es = stopwords.words('spanish')
spanish_stemmer = SnowballStemmer('spanish')


def remove_html_markup(s):
    """Strip HTML tags from a string, keeping only the text outside the tags."""
    tag = False
    quote = False
    out = ""
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c
    return out


def remove_URL(s):
    """Remove URLs from a sample string."""
    return re.sub(r"http\S+", "", s)


def eliminar_puntuacion(articulo):
    """Remove punctuation symbols from the text, character by character."""
    deletion_symbols = ['!', '(', ')', "'", '-', '[', ']', '{', '}', ';', ':', '"', '“', '’', '”',
                        '`', '‘', '\\', '/', '|', ',', '<', '>', '.', '?', '@', '#', '$', '^',
                        '&', '*', '_', '~', '+', '%', '=', '¿', '¡']
    new_articulo = ""
    for x in articulo:
        if x not in deletion_symbols:
            new_articulo += x
    return new_articulo


def eliminar_stopwords(articulo):
    """Remove Spanish stopwords using the NLTK stopword corpus."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        if x not in stopwords_es:
            new_articulo += " " + x
    return new_articulo


def obtener_raices(articulo):
    """Reduce each word to its root with the Spanish Snowball stemmer."""
    articulo_splitted = articulo.split()
    new_articulo = ""
    for x in articulo_splitted:
        x_new = spanish_stemmer.stem(x)
        new_articulo += " " + x_new
    return new_articulo


def limpieza_articulos(df):
    """Clean the 'titulo' column and return the vocabulary of unique tokens."""
    df_titulos = pd.DataFrame(df['titulo'], columns=['titulo'])
    # Lowercase the text
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: x.lower())
    # Remove punctuation marks
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_puntuacion(x))
    # Remove stopwords using the NLTK Spanish corpus
    df_titulos['titulo'] = df_titulos['titulo'].apply(lambda x: eliminar_stopwords(x))
    all_text = ' '.join(df_titulos['titulo'])
    vocab = np.unique(word_tokenize(all_text))
    return vocab
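
# Minimal usage sketch under stated assumptions: the DataFrame below and its sample
# titles are invented for illustration; the only requirement limpieza_articulos
# imposes is a 'titulo' column holding the article titles as strings.
articulos = pd.DataFrame({
    'titulo': [
        "El <b>Gobierno</b> anuncia nuevas medidas económicas",
        "¿Cómo afecta la inflación a los precios? http://ejemplo.com/nota",
    ]
})

# Optional pre-cleaning with the helpers defined above
articulos['titulo'] = articulos['titulo'].apply(remove_html_markup)
articulos['titulo'] = articulos['titulo'].apply(remove_URL)

# Build the vocabulary of unique, lowercased tokens without punctuation or stopwords
vocab = limpieza_articulos(articulos)
print(vocab)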