karshreya98 committed on
Commit
2e4daca
1 Parent(s): e2fe55a

made corrections for preprocessing

Browse files
Files changed (2) hide show
  1. app.py +3 -2
  2. utils/haystack.py +1 -1
app.py CHANGED
@@ -8,7 +8,7 @@ from json import JSONDecodeError
8
  from markdown import markdown
9
  from utils.config import parser
10
  from utils.haystack import start_document_store, start_haystack_extractive, start_haystack_rag, query, start_preprocessor_node, start_retriever, start_reader
11
- from utils.ui import reset_results, set_initial_state, upload_doc
12
 
13
  # Sliders
14
  DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
@@ -34,7 +34,7 @@ def upload_files():
34
 
35
  def process_file(data_file, preprocesor, document_store):
36
  # read file and add content
37
- file_contents = data_file.read()
38
  docs = [{
39
  'content': str(file_contents),
40
  'meta': {'name': str(data_file.name)}
@@ -47,6 +47,7 @@ def process_file(data_file, preprocesor, document_store):
47
  print(f"{data_file.name} already processed")
48
  else:
49
  print(f'preprocessing uploaded doc {data_file.name}.......')
 
50
  preprocessed_docs = preprocesor.process(docs)
51
  print('writing to document store.......')
52
  document_store.write_documents(preprocessed_docs)
 
8
  from markdown import markdown
9
  from utils.config import parser
10
  from utils.haystack import start_document_store, start_haystack_extractive, start_haystack_rag, query, start_preprocessor_node, start_retriever, start_reader
11
+ from utils.ui import reset_results, set_initial_state
12
 
13
  # Sliders
14
  DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
 
34
 
35
  def process_file(data_file, preprocesor, document_store):
36
  # read file and add content
37
+ file_contents = data_file.read().decode("utf-8")
38
  docs = [{
39
  'content': str(file_contents),
40
  'meta': {'name': str(data_file.name)}
 
47
  print(f"{data_file.name} already processed")
48
  else:
49
  print(f'preprocessing uploaded doc {data_file.name}.......')
50
+ #print(data_file.read().decode("utf-8"))
51
  preprocessed_docs = preprocesor.process(docs)
52
  print('writing to document store.......')
53
  document_store.write_documents(preprocessed_docs)
utils/haystack.py CHANGED
@@ -13,7 +13,7 @@ from milvus_haystack import MilvusDocumentStore
13
  def start_preprocessor_node():
14
  print('initializing preprocessor node')
15
  processor = PreProcessor(
16
- clean_empty_lines=True,
17
  clean_whitespace=True,
18
  clean_header_footer=True,
19
  #remove_substrings=None,
 
13
  def start_preprocessor_node():
14
  print('initializing preprocessor node')
15
  processor = PreProcessor(
16
+ clean_empty_lines= True,
17
  clean_whitespace=True,
18
  clean_header_footer=True,
19
  #remove_substrings=None,