karshreya98 committed on
Commit
e2fe55a
•
1 Parent(s): 8329090

added upload functionality

Browse files
Files changed (2) hide show
  1. app.py +87 -11
  2. utils/haystack.py +44 -17
app.py CHANGED
@@ -7,27 +7,103 @@ from annotated_text import annotation
7
  from json import JSONDecodeError
8
  from markdown import markdown
9
  from utils.config import parser
10
- from utils.haystack import start_document_store, start_haystack_extractive, start_haystack_rag, query
11
- from utils.ui import reset_results, set_initial_state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  try:
14
  args = parser.parse_args()
15
- document_store = start_document_store(type = args.store)
 
 
 
 
 
 
 
 
16
  if args.task == 'extractive':
17
- pipeline = start_haystack_extractive(document_store)
18
  else:
19
- pipeline = start_haystack_rag(document_store)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- set_initial_state()
22
 
23
- st.write('# '+args.name)
24
 
25
  # Search bar
26
  question = st.text_input("Ask a question", value=st.session_state.question, max_chars=100, on_change=reset_results)
27
- #question = "what is Pi?"
28
-
29
  run_pressed = st.button("Run")
30
- #run_pressed = True
31
 
32
  run_query = (
33
  run_pressed or question != st.session_state.question
@@ -43,7 +119,7 @@ try:
43
  except JSONDecodeError as je:
44
  st.error(
45
  "πŸ‘“    An error occurred reading the results. Is the document store working?"
46
- )
47
  except Exception as e:
48
  logging.exception(e)
49
  st.error("🐞    An error occurred during the request.")
 
7
  from json import JSONDecodeError
8
  from markdown import markdown
9
  from utils.config import parser
10
+ from utils.haystack import start_document_store, start_haystack_extractive, start_haystack_rag, query, start_preprocessor_node, start_retriever, start_reader
11
+ from utils.ui import reset_results, set_initial_state, upload_doc
12
+
13
+ # Sliders
14
+ DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
15
+ DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))
16
+
17
+ # Labels for the evaluation
18
+ #EVAL_LABELS = os.getenv("EVAL_FILE", str(Path(__file__).parent / "eval_labels_volksbank_QA.csv"))
19
+
20
+ # Whether the file upload should be enabled or not
21
+ DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))
22
+ UPLOAD_DOCUMENTS = []
23
+
24
+
25
+
26
+ # Define a function to handle file uploads
27
def upload_files():
    """Render the sidebar file-upload widget and return its result.

    The widget accepts ``pdf``, ``txt`` and ``docx`` files, allows several
    files at once, and hides its label.

    Returns:
        Whatever ``st.sidebar.file_uploader`` returns for this configuration
        (per the Streamlit docs, a list of uploaded files when
        ``accept_multiple_files=True``).
    """
    return st.sidebar.file_uploader(
        "upload",
        type=["pdf", "txt", "docx"],
        accept_multiple_files=True,
        label_visibility="hidden",
    )
32
+
33
+ # Define a function to process a single file
34
+
35
def process_file(data_file, preprocesor, document_store):
    """Preprocess one uploaded file and index it in the document store.

    A file whose name already appears in the ``name`` metadata of a stored
    document is skipped. Otherwise its content is preprocessed, written to
    the store, and the store's embeddings are updated with the module-level
    ``retriever``.

    Args:
        data_file: Uploaded file object exposing ``read()`` and ``name``.
        preprocesor: Object whose ``process(docs)`` returns the documents
            to write.
        document_store: Store exposing ``get_all_documents()``,
            ``write_documents()`` and ``update_embeddings()``.

    Returns:
        None.

    Raises:
        Exception: Any failure while checking, preprocessing or indexing is
            logged and re-raised, so the caller can mark the file as failed
            instead of reporting success.
    """
    file_name = data_file.name
    try:
        # Check for duplicates first so already-indexed files are not re-read;
        # a set keeps the membership test cheap.
        stored_names = {
            doc.meta.get('name') for doc in document_store.get_all_documents()
        }
        if file_name in stored_names:
            print(f"{file_name} already processed")
            return

        raw = data_file.read()
        # str() on a bytes object yields its repr ("b'...'"), which would be
        # indexed verbatim; decode to real text instead.
        # NOTE(review): the uploader also accepts pdf/docx, which are binary
        # formats and need a converter rather than a UTF-8 decode -- TODO
        # confirm the intended handling for those types.
        if isinstance(raw, (bytes, bytearray)):
            text = raw.decode("utf-8", errors="replace")
        else:
            text = str(raw)

        docs = [{
            'content': text,
            'meta': {'name': str(file_name)},
        }]

        print(f'preprocessing uploaded doc {file_name}.......')
        preprocessed_docs = preprocesor.process(docs)
        print('writing to document store.......')
        document_store.write_documents(preprocessed_docs)
        print('updating emebdding.......')
        # `retriever` is the module-level retriever the script creates before
        # any upload is processed.
        document_store.update_embeddings(retriever)
    except Exception:
        # Log with traceback and propagate: swallowing the error here made
        # the caller report every upload as successful.
        logging.exception("Failed to process uploaded file %s", file_name)
        raise
57
 
58
  try:
59
  args = parser.parse_args()
60
+
61
+ set_initial_state()
62
+ st.write('# '+args.name)
63
+ session_state = st.session_state
64
+
65
+ preprocesor = start_preprocessor_node()
66
+ document_store = start_document_store(args.store)
67
+ retriever = start_retriever(document_store)
68
+ reader = start_reader()
69
  if args.task == 'extractive':
70
+ pipeline = start_haystack_extractive(document_store, retriever, reader)
71
  else:
72
+ pipeline = start_haystack_rag(document_store, retriever)
73
+
74
+ # Sidebar
75
+
76
+ #st.sidebar.header("Options")
77
+
78
+ # File upload block
79
+ if not DISABLE_FILE_UPLOAD:
80
+ st.sidebar.write("## File Upload:")
81
+ #data_files = st.sidebar.file_uploader(
82
+ # "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
83
+ #)
84
+ data_files = upload_files()
85
+ if data_files is not None:
86
+ for data_file in data_files:
87
+ # Upload file
88
+ if data_file:
89
+ try:
90
+ #raw_json = upload_doc(data_file)
91
+ # Call the process_file function for each uploaded file
92
+ if args.store == 'inmemory':
93
+ processed_data = process_file(data_file, preprocesor, document_store)
94
+ st.sidebar.write(str(data_file.name) + "    βœ… ")
95
+ except Exception as e:
96
+ st.sidebar.write(str(data_file.name) + "    ❌ ")
97
+ st.sidebar.write("_This file could not be parsed, see the logs for more information._")
98
 
 
99
 
 
100
 
101
  # Search bar
102
  question = st.text_input("Ask a question", value=st.session_state.question, max_chars=100, on_change=reset_results)
103
+ # question = "what is Pi?"
104
+
105
  run_pressed = st.button("Run")
106
+ # run_pressed = True
107
 
108
  run_query = (
109
  run_pressed or question != st.session_state.question
 
119
  except JSONDecodeError as je:
120
  st.error(
121
  "πŸ‘“    An error occurred reading the results. Is the document store working?"
122
+ )
123
  except Exception as e:
124
  logging.exception(e)
125
  st.error("🐞    An error occurred during the request.")
utils/haystack.py CHANGED
@@ -5,13 +5,31 @@ from haystack import Pipeline
5
  from haystack.schema import Answer
6
  from haystack.document_stores import BaseDocumentStore
7
  from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore, WeaviateDocumentStore
8
- from haystack.nodes import EmbeddingRetriever, FARMReader, PromptNode
9
  from milvus_haystack import MilvusDocumentStore
10
  #Use this file to set up your Haystack pipeline and querying
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  @st.cache_resource(show_spinner=False)
13
  def start_document_store(type: str):
14
  #This function starts the documents store of your choice based on your command line preference
 
15
  if type == 'inmemory':
16
  document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=384)
17
  documents = [
@@ -24,7 +42,7 @@ def start_document_store(type: str):
24
  'meta': {'name': "siemens.txt"}
25
  },
26
  ]
27
- document_store.write_documents(documents)
28
  elif type == 'opensearch':
29
  document_store = OpenSearchDocumentStore(scheme = document_store_configs['OPENSEARCH_SCHEME'],
30
  username = document_store_configs['OPENSEARCH_USERNAME'],
@@ -45,34 +63,43 @@ def start_document_store(type: str):
45
  return_embedding=True)
46
  return document_store
47
 
48
- # cached to make index and models load only at start
49
  @st.cache_resource(show_spinner=False)
50
- def start_haystack_extractive(_document_store: BaseDocumentStore):
51
- retriever = EmbeddingRetriever(document_store=_document_store,
52
- embedding_model=model_configs['EMBEDDING_MODEL'],
 
53
  top_k=5)
54
- _document_store.update_embeddings(retriever)
 
 
 
55
 
 
 
 
 
56
  reader = FARMReader(model_name_or_path=model_configs['EXTRACTIVE_MODEL'])
57
-
58
- pipe = Pipeline()
59
- pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
60
- pipe.add_node(component=reader, name="Reader", inputs=["Retriever"])
61
 
 
 
 
 
 
 
 
 
 
62
  return pipe
63
 
64
  @st.cache_resource(show_spinner=False)
65
- def start_haystack_rag(_document_store: BaseDocumentStore):
66
- retriever = EmbeddingRetriever(document_store=_document_store,
67
- embedding_model=model_configs['EMBEDDING_MODEL'],
68
- top_k=5)
69
- _document_store.update_embeddings(retriever)
70
  prompt_node = PromptNode(default_prompt_template="deepset/question-answering",
71
  model_name_or_path=model_configs['GENERATIVE_MODEL'],
72
  api_key=model_configs['OPENAI_KEY'])
73
  pipe = Pipeline()
74
 
75
- pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
76
  pipe.add_node(component=prompt_node, name="PromptNode", inputs=["Retriever"])
77
 
78
  return pipe
 
5
  from haystack.schema import Answer
6
  from haystack.document_stores import BaseDocumentStore
7
  from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore, WeaviateDocumentStore
8
+ from haystack.nodes import EmbeddingRetriever, FARMReader, PromptNode, PreProcessor
9
  from milvus_haystack import MilvusDocumentStore
10
  #Use this file to set up your Haystack pipeline and querying
11
 
12
@st.cache_resource(show_spinner=False)
def start_preprocessor_node():
    """Create the ``PreProcessor`` node.

    The node cleans empty lines, whitespace and headers/footers, and splits
    by word with a split length of 100 while respecting sentence boundaries.
    The result is cached through ``st.cache_resource``.

    Returns:
        The configured ``PreProcessor`` instance.
    """
    print('initializing preprocessor node')
    settings = dict(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=100,
        split_respect_sentence_boundary=True,
    )
    return PreProcessor(**settings)
28
+
29
  @st.cache_resource(show_spinner=False)
30
  def start_document_store(type: str):
31
  #This function starts the documents store of your choice based on your command line preference
32
+ print('initializing document store')
33
  if type == 'inmemory':
34
  document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=384)
35
  documents = [
 
42
  'meta': {'name': "siemens.txt"}
43
  },
44
  ]
45
+ #document_store.write_documents(documents)
46
  elif type == 'opensearch':
47
  document_store = OpenSearchDocumentStore(scheme = document_store_configs['OPENSEARCH_SCHEME'],
48
  username = document_store_configs['OPENSEARCH_USERNAME'],
 
63
  return_embedding=True)
64
  return document_store
65
 
 
66
@st.cache_resource(show_spinner=False)
def start_retriever(_document_store: BaseDocumentStore):
    """Create the ``EmbeddingRetriever`` bound to the given document store.

    The embedding model comes from ``model_configs['EMBEDDING_MODEL']`` and
    ``top_k`` is fixed at 5. This function does not call
    ``update_embeddings`` on the store.

    Args:
        _document_store: Document store the retriever searches.

    Returns:
        The configured ``EmbeddingRetriever``.
    """
    print('initializing retriever')
    return EmbeddingRetriever(
        document_store=_document_store,
        embedding_model=model_configs['EMBEDDING_MODEL'],
        top_k=5,
    )
76
 
77
+
78
@st.cache_resource(show_spinner=False)
def start_reader():
    """Create the ``FARMReader`` for extractive question answering.

    The model is taken from ``model_configs['EXTRACTIVE_MODEL']``; the
    result is cached through ``st.cache_resource``.

    Returns:
        The configured ``FARMReader``.
    """
    print('initializing reader')
    return FARMReader(model_name_or_path=model_configs['EXTRACTIVE_MODEL'])
 
 
 
83
 
84
+
85
+
86
+ # cached to make index and models load only at start
87
@st.cache_resource(show_spinner=False)
def start_haystack_extractive(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever, _reader: FARMReader):
    """Assemble the extractive pipeline: Query -> Retriever -> Reader.

    Args:
        _document_store: Not used in the body; kept so existing callers
            keep working.
        _retriever: Node registered as ``"Retriever"``, fed by ``"Query"``.
        _reader: Node registered as ``"Reader"``, fed by ``"Retriever"``.

    Returns:
        The assembled ``Pipeline``.
    """
    print('initializing pipeline')
    qa_pipeline = Pipeline()
    # (component, node name, upstream inputs), added in pipeline order.
    wiring = (
        (_retriever, "Retriever", ["Query"]),
        (_reader, "Reader", ["Retriever"]),
    )
    for node, node_name, upstream in wiring:
        qa_pipeline.add_node(component=node, name=node_name, inputs=upstream)
    return qa_pipeline
94
 
95
@st.cache_resource(show_spinner=False)
def start_haystack_rag(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever):
    """Assemble the generative pipeline: Query -> Retriever -> PromptNode.

    The prompt node uses the ``"deepset/question-answering"`` template, with
    the model and API key read from ``model_configs``.

    Args:
        _document_store: Not used in the body; kept so existing callers
            keep working.
        _retriever: Node registered as ``"Retriever"``, fed by ``"Query"``.

    Returns:
        The assembled ``Pipeline``.
    """
    generator = PromptNode(
        default_prompt_template="deepset/question-answering",
        model_name_or_path=model_configs['GENERATIVE_MODEL'],
        api_key=model_configs['OPENAI_KEY'],
    )

    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=_retriever, name="Retriever", inputs=["Query"])
    rag_pipeline.add_node(component=generator, name="PromptNode", inputs=["Retriever"])
    return rag_pipeline