Lisibonny committed on
Commit
0196371
1 Parent(s): 1d2be7e

Update preprocesamiento_articulos.py

Browse files
Files changed (1) hide show
  1. preprocesamiento_articulos.py +12 -0
preprocesamiento_articulos.py CHANGED
@@ -3,12 +3,15 @@ import numpy as np
3
  import nltk
4
  from nltk.tokenize import word_tokenize, RegexpTokenizer
5
  from nltk.corpus import stopwords
 
 
6
  from sklearn.feature_extraction.text import CountVectorizer
7
  import csv
8
 
9
  nltk.download('stopwords')
10
  nltk.download('punkt')
11
  stopwords_es = stopwords.words('spanish')
 
12
 
13
  def eliminar_puntuacion(articulo):
14
  deletetion_symbols = ['!','(',')',"'",'-','[',']','{','}',';',':','"','“','’','”',"'",'`','‘','``','\\' ,'/','|',',','|','<','>','.','..','...','?','@',"#",'$','^','&','*','_','~','+','%','=','¿','¡',"''"]
@@ -27,6 +30,15 @@ def eliminar_stopwords(articulo):
27
  new_articulo += " " + x
28
  return new_articulo
29
 
 
 
 
 
 
 
 
 
 
30
  def limpieza_articulos(df):
31
 
32
  df_titulos=pd.DataFrame(df['titulo'], columns=['titulo'])
 
3
  import nltk
4
  from nltk.tokenize import word_tokenize, RegexpTokenizer
5
  from nltk.corpus import stopwords
6
+ from nltk.stem import SnowballStemmer
7
+
8
  from sklearn.feature_extraction.text import CountVectorizer
9
  import csv
10
 
11
  nltk.download('stopwords')
12
  nltk.download('punkt')
13
  stopwords_es = stopwords.words('spanish')
14
+ spanish_stemmer = SnowballStemmer('spanish')
15
 
16
  def eliminar_puntuacion(articulo):
17
  deletetion_symbols = ['!','(',')',"'",'-','[',']','{','}',';',':','"','“','’','”',"'",'`','‘','``','\\' ,'/','|',',','|','<','>','.','..','...','?','@',"#",'$','^','&','*','_','~','+','%','=','¿','¡',"''"]
 
30
  new_articulo += " " + x
31
  return new_articulo
32
 
33
def obtener_raices(articulo):
    """Return ``articulo`` with every word replaced by its Spanish stem.

    The text is split on whitespace and each token is passed through the
    module-level ``spanish_stemmer`` (a ``SnowballStemmer('spanish')``).

    Args:
        articulo: Text to process, as a single string.

    Returns:
        A string in which every stem is preceded by one space, so a
        non-empty result starts with a space. This matches how the
        neighbouring cleaning helper builds its output
        (``new_articulo += " " + x``). Returns an empty string when
        ``articulo`` contains no words.
    """
    # One join instead of repeated ``+=`` avoids rebuilding the string on
    # every iteration; prefixing each stem with " " keeps the original output.
    return "".join(
        " " + spanish_stemmer.stem(palabra) for palabra in articulo.split()
    )
41
+
42
  def limpieza_articulos(df):
43
 
44
  df_titulos=pd.DataFrame(df['titulo'], columns=['titulo'])